Connected to Python 3.10.12

In [1]:
import pandas as pd
import numpy as np
import regex as re 
from cleaning_utils import *
from nltk.tokenize import NLTKWordTokenizer
from utils import *

In [2]:
df = get_tweets_from_db(URI, DB_NAME, 'AAPL')
df = pd.concat([pd.DataFrame(d) for d in df])

# ? replace tags
df['content_cleaned'] = df['content'].apply(replace_tags)
# ? remove tweets with less than 3 words
df = df.loc[df['content_cleaned'].apply(get_length)]

df['true_sentiment'] = df['true_sentiment'].replace(['bullish', 'bearish'], [1, 0])
df = df.reset_index(drop=True)

  df['true_sentiment'] = df['true_sentiment'].replace(['bullish', 'bearish'], [1, 0])


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

df_balanced = pd.concat([df.loc[df['true_sentiment'] == 1].sample(500000), df.loc[df['true_sentiment'] == 0].sample(500000)]).sample(1000000)
df_unbalanced = pd.concat([df.loc[df['true_sentiment'] == 1].sample(800000), df.loc[df['true_sentiment'] == 0].sample(200000)]).sample(1000000)
X_balanced = CountVectorizer(stop_words='english').fit_transform(df_balanced['content_cleaned'])
y_balanced = df_balanced['true_sentiment']

X_unbalanced = CountVectorizer(stop_words='english').fit_transform(df_unbalanced['content_cleaned'])
y_unbalanced = df_unbalanced['true_sentiment']

In [4]:
### NBM is MultinomialNB
model = MultinomialNB(alpha=0.1, fit_prior=True, class_prior=None)

def evaluate_model_on_different_sizes(X, y, model, sizes, cv=5, adjust_weight=False, weights_neg=1):
    results = []
    for size in sizes:
        X_subset, y_subset = X[:size], y[:size]

        if adjust_weight:
            sample_weights = [weights_neg if sent == 0 else 1 for sent in y_subset]

            accuracy = cross_val_score(model, X_subset, y_subset, cv=cv, scoring='balanced_accuracy', params={'sample_weight': sample_weights}).mean()
            mcc = cross_val_score(model, X_subset, y_subset, cv=cv, scoring='matthews_corrcoef', params={'sample_weight': sample_weights}).mean()

            results.append({
                'size': size,
                'balanced_accuracy': accuracy,
                'mcc': mcc
            })


        else:
            accuracy = cross_val_score(model, X_subset, y_subset, cv=cv, scoring='accuracy').mean()
            mcc = cross_val_score(model, X_subset, y_subset, cv=cv, scoring='matthews_corrcoef').mean()

            results.append({
                'size': size,
                'accuracy': accuracy,
                'mcc': mcc
            })
    return pd.DataFrame(results)

# ! TO IMPROVE BY TRAINING ON UNBALANCED AND TESTING ON BALANCED

sizes = [500, 1000, 2500, 5000, 10000, 25000, 50000, 100000, 250000, 500000, 1000000] 

# ? Balanced Evaluation
results = evaluate_model_on_different_sizes(X_balanced, y_balanced, model, sizes)
results

Unnamed: 0,size,accuracy,mcc
0,500,0.602,0.201411
1,1000,0.598,0.195081
2,2500,0.6,0.198906
3,5000,0.6376,0.27504
4,10000,0.6586,0.317587
5,25000,0.67436,0.349476
6,50000,0.69512,0.391052
7,100000,0.70751,0.415858
8,250000,0.720944,0.442782
9,500000,0.729298,0.459519


In [5]:
# ? Unbalanced Evaluation
results_unb = evaluate_model_on_different_sizes(X_unbalanced, y_unbalanced, model, sizes)
results_unb

Unnamed: 0,size,accuracy,mcc
0,500,0.792,0.049279
1,1000,0.808,0.047613
2,2500,0.79,0.066789
3,5000,0.7968,0.132709
4,10000,0.7949,0.139753
5,25000,0.80148,0.225004
6,50000,0.80578,0.286295
7,100000,0.80617,0.313802
8,250000,0.814224,0.35693
9,500000,0.81953,0.37698


In [6]:
# ? Unbalanced Evaluation with Balanced Accuracy Metric
results_unb_bal_acc = evaluate_model_on_different_sizes(X_unbalanced, y_unbalanced, model, sizes, adjust_weight=True, weights_neg=1)
results_unb_bal_acc

Unnamed: 0,size,balanced_accuracy,mcc
0,500,0.507008,0.049279
1,1000,0.507369,0.047613
2,2500,0.514594,0.066789
3,5000,0.529566,0.132709
4,10000,0.538269,0.139753
5,25000,0.578736,0.225004
6,50000,0.614035,0.286295
7,100000,0.633394,0.313802
8,250000,0.655824,0.35693
9,500000,0.665191,0.37698


In [7]:
# ? Unbalanced Evaluation with Balanced Accuracy Metric and Adjusted class weighs for training 
results_unb_bal_acc_class = evaluate_model_on_different_sizes(X_unbalanced, y_unbalanced, model, sizes, adjust_weight=True, weights_neg=4)
results_unb_bal_acc_class

Unnamed: 0,size,balanced_accuracy,mcc
0,500,0.543571,0.084276
1,1000,0.566918,0.125259
2,2500,0.569908,0.126689
3,5000,0.602552,0.185904
4,10000,0.609958,0.193158
5,25000,0.659441,0.271244
6,50000,0.681972,0.306423
7,100000,0.694037,0.324194
8,250000,0.71261,0.352991
9,500000,0.722922,0.368993


In [8]:
def evaluate_model_on_different_grams(X, y, model, grams_to_test, cv=5):
    results = []
    for grams in grams_to_test:
        print(grams)
        X_input = CountVectorizer(stop_words='english', ngram_range=grams).fit_transform(X)

        accuracy = cross_val_score(model, X_input, y, cv=cv, scoring='accuracy').mean()
        mcc = cross_val_score(model, X_input, y, cv=cv, scoring='matthews_corrcoef').mean()

        results.append({
            'ngram_range': grams,
            'accuracy': accuracy,
            'mcc': mcc
        })

    return pd.DataFrame(results)

df_balanced_250 = pd.concat([df.loc[df['true_sentiment'] == 1].sample(125000), df.loc[df['true_sentiment'] == 0].sample(125000)]).sample(250000)

X = df_balanced_250['content_cleaned']
y = df_balanced_250['true_sentiment']

grams_to_test = [(1, 1), (1, 2), (1, 3), (1, 4)]
results_grams = evaluate_model_on_different_grams(X, y, model, grams_to_test)
results_grams

(1, 1)
(1, 2)
(1, 3)
(1, 4)


Unnamed: 0,ngram_range,accuracy,mcc
0,"(1, 1)",0.719452,0.439776
1,"(1, 2)",0.74852,0.498224
2,"(1, 3)",0.749704,0.50152
3,"(1, 4)",0.747316,0.498157


In [39]:
def evaluate_model_on_different_prepro(X, y, model, prepro_params, cv=5):
    results = []

    for prepro in prepro_params:
        print(prepro)

        if prepro.get('token_pattern', 'None') != 'None':

            def clean_reg(text, pattern):
                text = re.sub(pattern, ' ', text)
                text = re.sub(' +', ' ', text)

                return text

            X_emojis = X.copy().apply(clean_reg, pattern=prepro['token_pattern'])
            X_input = CountVectorizer(ngram_range=(1, 2), tokenizer=prepro['tokenizer']).fit_transform(X_emojis)

        else:
            X_input = CountVectorizer(ngram_range=(1, 2), **prepro).fit_transform(X)

        accuracy = cross_val_score(model, X_input, y, cv=cv, scoring='accuracy').mean()
        mcc = cross_val_score(model, X_input, y, cv=cv, scoring='matthews_corrcoef').mean()

        results.append({
            'preprocessing': prepro,
            'accuracy': accuracy,
            'mcc': mcc
        })

    return pd.DataFrame(results)

prepro_params = [
    {},   # ? Benchmark
    {'token_pattern': r'[^(?u)\b\w\w+\b|,|!|\?|%|\+|=|-|:|;|\(|\)|\]]', 'tokenizer': lambda x: NLTKWordTokenizer().tokenize(x)}, # ? Punctuation
    {'tokenizer': lambda x: PorterStemmer().stem(x)}, # ? Stemmer
    {'token_pattern': r'[^(?u)\b\w\w+\b|\p{Extended_Pictographic}]|[0-9]', 'tokenizer': lambda x: NLTKWordTokenizer().tokenize(x)}, # ? Emojis
    {'stop_words': 'english'}, # ? StopWords 
    {'token_pattern': r'[^(?u)\b\w\w+\b|,|!|\?|%|\+|=|-|:|;|\(|\)|\]|/<a?:.+?:\d{18}>|\p{Extended_Pictographic}]|[0-9]', 'tokenizer': lambda x: NLTKWordTokenizer().tokenize(x)}, # ? Emojis + Punctuation
    {'tokenizer': lambda x: [' '.join([t[0], t[1]]) for t in nltk.pos_tag(NLTKWordTokenizer().tokenize(x))]}, # ? PosTagging 
    {'tokenizer': lambda x: nltk.TweetTokenizer().tokenize(x)} # ? Tweet Tokenizer
]

res = evaluate_model_on_different_prepro(X, y, model, prepro_params)
res['preprocessing'] = ['Benchmark',  'Punctuation', 'Stemmer', 'Emojis', 'StopWords', 'Emojis + Punctuation', 'PosTagging', 'Tweet Tokenizer']
res

{}
{'token_pattern': '[^(?u)\\b\\w\\w+\\b|,|!|\\?|%|\\+|=|-|:|;|\\(|\\)|\\]]', 'tokenizer': <function <lambda> at 0x759260f0c160>}




{'tokenizer': <function <lambda> at 0x7592610b80d0>}




{'token_pattern': '[^(?u)\\b\\w\\w+\\b|\\p{Extended_Pictographic}]|[0-9]', 'tokenizer': <function <lambda> at 0x75926119b0a0>}




{'stop_words': 'english'}
{'token_pattern': '[^(?u)\\b\\w\\w+\\b|,|!|\\?|%|\\+|=|-|:|;|\\(|\\)|\\]|/<a?:.+?:\\d{18}>|\\p{Extended_Pictographic}]|[0-9]', 'tokenizer': <function <lambda> at 0x75925e32c670>}




{'tokenizer': <function <lambda> at 0x75925e8c0310>}




{'tokenizer': <function <lambda> at 0x75925e8c3520>}




Unnamed: 0,preprocessing,accuracy,mcc
0,Benchmark,0.767256,0.535504
1,Punctuation,0.77006,0.541255
2,Stemmer,0.624724,0.26186
3,Emojis,0.772316,0.545332
4,StopWords,0.74852,0.498224
5,Emojis + Punctuation,0.773428,0.547655
6,PosTagging,0.766048,0.534285
7,Tweet Tokenizer,0.781828,0.564614
