Connected to Python 3.10.12

In [1]:
import pandas as pd
import numpy as np
import regex as re 
from cleaning_utils import *
from nltk.tokenize import NLTKWordTokenizer
from utils import *

In [2]:
df = get_tweets_from_db(URI, DB_NAME, 'AAPL')
df = pd.concat([pd.DataFrame(d) for d in df])

# ? replace tags
df['content_cleaned'] = df['content'].apply(replace_tags)
# ? remove tweets with less than 3 words
df = df.loc[df['content_cleaned'].apply(get_length)]

df['true_sentiment'] = df['true_sentiment'].replace(['bullish', 'bearish'], [1, 0])
df = df.reset_index(drop=True)

  df['true_sentiment'] = df['true_sentiment'].replace(['bullish', 'bearish'], [1, 0])


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

df_balanced = pd.concat([df.loc[df['true_sentiment'] == 1].sample(500000), df.loc[df['true_sentiment'] == 0].sample(500000)]).sample(1000000)
df_unbalanced = pd.concat([df.loc[df['true_sentiment'] == 1].sample(800000), df.loc[df['true_sentiment'] == 0].sample(200000)]).sample(1000000)
X_balanced = CountVectorizer(stop_words='english').fit_transform(df_balanced['content_cleaned'])
y_balanced = df_balanced['true_sentiment']

X_unbalanced = CountVectorizer(stop_words='english').fit_transform(df_unbalanced['content_cleaned'])
y_unbalanced = df_unbalanced['true_sentiment']

In [4]:
### NBM is MultinomialNB
model = MultinomialNB(alpha=0.1, fit_prior=True, class_prior=None)

def evaluate_model_on_different_sizes(X, y, model, sizes, cv=5, adjust_weight=False, weights_neg=1):
    results = []
    for size in sizes:
        X_subset, y_subset = X[:size], y[:size]

        if adjust_weight:
            sample_weights = [weights_neg if sent == 0 else 1 for sent in y_subset]

            accuracy = cross_val_score(model, X_subset, y_subset, cv=cv, scoring='balanced_accuracy', params={'sample_weight': sample_weights}).mean()
            mcc = cross_val_score(model, X_subset, y_subset, cv=cv, scoring='matthews_corrcoef', params={'sample_weight': sample_weights}).mean()

            results.append({
                'size': size,
                'balanced_accuracy': accuracy,
                'mcc': mcc
            })


        else:
            accuracy = cross_val_score(model, X_subset, y_subset, cv=cv, scoring='accuracy').mean()
            mcc = cross_val_score(model, X_subset, y_subset, cv=cv, scoring='matthews_corrcoef').mean()

            results.append({
                'size': size,
                'accuracy': accuracy,
                'mcc': mcc
            })
    return pd.DataFrame(results)

# ! TO IMPROVE BY TRAINING ON UNBALANCED AND TESTING ON BALANCED

sizes = [500, 1000, 2500, 5000, 10000, 25000, 50000, 100000, 250000, 500000, 1000000] 

# ? Balanced Evaluation
results = evaluate_model_on_different_sizes(X_balanced, y_balanced, model, sizes)
results

Unnamed: 0,size,accuracy,mcc
0,500,0.57,0.140398
1,1000,0.599,0.200327
2,2500,0.6296,0.259992
3,5000,0.6382,0.277038
4,10000,0.6561,0.31244
5,25000,0.67904,0.358552
6,50000,0.69578,0.392277
7,100000,0.70867,0.418148
8,250000,0.720244,0.441397
9,500000,0.727472,0.455926


In [5]:
# ? Unbalanced Evaluation
results_unb = evaluate_model_on_different_sizes(X_unbalanced, y_unbalanced, model, sizes)
results_unb

Unnamed: 0,size,accuracy,mcc
0,500,0.794,0.054983
1,1000,0.781,0.071349
2,2500,0.7688,0.073226
3,5000,0.7778,0.103883
4,10000,0.7888,0.168528
5,25000,0.79112,0.219437
6,50000,0.80044,0.283372
7,100000,0.80408,0.316829
8,250000,0.812672,0.356054
9,500000,0.819028,0.377616


In [6]:
# ? Unbalanced Evaluation with Balanced Accuracy Metric
results_unb_bal_acc = evaluate_model_on_different_sizes(X_unbalanced, y_unbalanced, model, sizes, adjust_weight=True, weights_neg=1)
results_unb_bal_acc

Unnamed: 0,size,balanced_accuracy,mcc
0,500,0.509492,0.054983
1,1000,0.513714,0.071349
2,2500,0.513989,0.073226
3,5000,0.523516,0.103883
4,10000,0.55027,0.168528
5,25000,0.579864,0.219437
6,50000,0.61396,0.283372
7,100000,0.63493,0.316829
8,250000,0.656096,0.356054
9,500000,0.665809,0.377616


In [7]:
# ? Unbalanced Evaluation with Balanced Accuracy Metric and Adjusted class weighs for training 
results_unb_bal_acc_class = evaluate_model_on_different_sizes(X_unbalanced, y_unbalanced, model, sizes, adjust_weight=True, weights_neg=4)
results_unb_bal_acc_class

Unnamed: 0,size,balanced_accuracy,mcc
0,500,0.563314,0.137182
1,1000,0.580096,0.147212
2,2500,0.57508,0.134901
3,5000,0.590621,0.159166
4,10000,0.624286,0.216252
5,25000,0.655217,0.264185
6,50000,0.67806,0.299521
7,100000,0.693292,0.322809
8,250000,0.712222,0.352542
9,500000,0.723626,0.370008


In [8]:
def evaluate_model_on_different_grams(X, y, model, grams_to_test, cv=5):
    results = []
    for grams in grams_to_test:
        print(grams)
        X_input = CountVectorizer(stop_words='english', ngram_range=grams).fit_transform(X)

        accuracy = cross_val_score(model, X_input, y, cv=cv, scoring='accuracy').mean()
        mcc = cross_val_score(model, X_input, y, cv=cv, scoring='matthews_corrcoef').mean()

        results.append({
            'ngram_range': grams,
            'accuracy': accuracy,
            'mcc': mcc
        })

    return pd.DataFrame(results)

df_balanced_250 = pd.concat([df.loc[df['true_sentiment'] == 1].sample(125000), df.loc[df['true_sentiment'] == 0].sample(125000)]).sample(250000)

X = df_balanced_250['content_cleaned']
y = df_balanced_250['true_sentiment']

grams_to_test = [(1, 1), (1, 2), (1, 3), (1, 4)]
results_grams = evaluate_model_on_different_grams(X, y, model, grams_to_test)
results_grams

(1, 1)
(1, 2)
(1, 3)
(1, 4)


Unnamed: 0,ngram_range,accuracy,mcc
0,"(1, 1)",0.721368,0.443512
1,"(1, 2)",0.749568,0.500397
2,"(1, 3)",0.749144,0.500585
3,"(1, 4)",0.746292,0.496323


In [10]:
def evaluate_model_on_different_prepro(X, y, model, prepro_params, cv=5):
    results = []

    for prepro in prepro_params:
        print(prepro)

        if prepro.get('token_pattern', 'None') != 'None':

            def clean_reg(text, pattern):
                text = re.sub(pattern, ' ', text)
                text = re.sub(' +', ' ', text)

                return text

            X_emojis = X.copy().apply(clean_reg, pattern=prepro['token_pattern'])
            X_input = CountVectorizer(ngram_range=(1, 2), token_pattern=r'[^\s]+ ').fit_transform(X_emojis)

        else:
            X_input = CountVectorizer(ngram_range=(1, 2), **prepro).fit_transform(X)

        accuracy = cross_val_score(model, X_input, y, cv=cv, scoring='accuracy').mean()
        mcc = cross_val_score(model, X_input, y, cv=cv, scoring='matthews_corrcoef').mean()

        results.append({
            'preprocessing': prepro,
            'accuracy': accuracy,
            'mcc': mcc
        })

    return pd.DataFrame(results)

prepro_params = [
    {},   # ? Benchmark
    {'token_pattern': r'[^(?u)\b\w\w+\b|,|!|\?|%|\+|=|-|:|;|\(|\)|\]]'}, # ? Punctuation
    {'tokenizer': lambda x: PorterStemmer().stem(x)}, # ? Stemmer
    {'token_pattern': r'[^(?u)\b\w\w+\b|\p{Extended_Pictographic}]|[0-9]'}, # ? Emojis
    {'stop_words': 'english'}, # ? StopWords 
    {'token_pattern': r'[^(?u)\b\w\w+\b|,|!|\?|%|\+|=|-|:|;|\(|\)|\]|/<a?:.+?:\d{18}>|\p{Extended_Pictographic}]|[0-9]'}, # ? Emojis + Punctuation
    {'tokenizer': lambda x: [' '.join([t[0], t[1]]) for t in nltk.pos_tag(NLTKWordTokenizer().tokenize(x))]}, # ? PosTagging 
    {'tokenizer': lambda x: nltk.TweetTokenizer().tokenize(x)} # ? Tweet Tokenizer
]

res = evaluate_model_on_different_prepro(X, y, model, prepro_params)
res['preprocessing'] = ['Benchmark',  'Punctuation', 'Stemmer', 'Emojis', 'StopWords', 'Emojis + Punctuation', 'PosTagging', 'Tweet Tokenizer']
res

{}
{'token_pattern': '[^(?u)\\b\\w\\w+\\b|,|!|\\?|%|\\+|=|-|:|;|\\(|\\)|\\]]'}
{'tokenizer': <function <lambda> at 0x74d842bb0a60>}




{'token_pattern': '[^(?u)\\b\\w\\w+\\b|\\p{Extended_Pictographic}]|[0-9]'}
{'stop_words': 'english'}
{'token_pattern': '[^(?u)\\b\\w\\w+\\b|,|!|\\?|%|\\+|=|-|:|;|\\(|\\)|\\]|/<a?:.+?:\\d{18}>|\\p{Extended_Pictographic}]|[0-9]'}
{'tokenizer': <function <lambda> at 0x74d876ff20e0>}




{'tokenizer': <function <lambda> at 0x74d876ff1510>}




Unnamed: 0,preprocessing,accuracy,mcc
0,Benchmark,0.769292,0.5396
1,Punctuation,0.75238,0.506516
2,Stemmer,0.6264,0.264884
3,Emojis,0.755232,0.511577
4,StopWords,0.749568,0.500397
5,Emojis + Punctuation,0.745316,0.492117
6,PosTagging,0.767004,0.53609
7,Tweet Tokenizer,0.783884,0.568599
