Connected to Python 3.10.12

In [1]:
import pandas as pd
import numpy as np
import regex as re 
from cleaning_utils import *
from nltk.tokenize import NLTKWordTokenizer

In [2]:
df = pd.read_parquet('full_tweets.parquet')

# ? replace tags
df['content_cleaned'] = df['content'].apply(replace_tags)
# ? remove tweets with less than 3 words
df = df.loc[df['content_cleaned'].apply(get_length)]


df['true_sentiment'] = df['true_sentiment'].replace(['bullish', 'bearish'], [1, 0])
df = df.reset_index(drop=True)

  df['true_sentiment'] = df['true_sentiment'].replace(['bullish', 'bearish'], [1, 0])


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

df_balanced = pd.concat([df.loc[df['true_sentiment'] == 1].sample(500000), df.loc[df['true_sentiment'] == 0].sample(500000)]).sample(1000000)
df_unbalanced = pd.concat([df.loc[df['true_sentiment'] == 1].sample(800000), df.loc[df['true_sentiment'] == 0].sample(200000)]).sample(1000000)
X_balanced = CountVectorizer(stop_words='english').fit_transform(df_balanced['content_cleaned'])
y_balanced = df_balanced['true_sentiment']

X_unbalanced = CountVectorizer(stop_words='english').fit_transform(df_unbalanced['content_cleaned'])
y_unbalanced = df_unbalanced['true_sentiment']

In [4]:
### NBM is MultinomialNB
model = MultinomialNB(alpha=0.1, fit_prior=True, class_prior=None)

def evaluate_model_on_different_sizes(X, y, model, sizes, cv=5, adjust_weight=False):
    results = []
    for size in sizes:
        X_subset, y_subset = X[:size], y[:size]

        if adjust_weight:
            sample_weights = [4 if sent == 0 else 1 for sent in y_subset]

            accuracy = cross_val_score(model, X_subset, y_subset, cv=cv, scoring='accuracy', params={'sample_weight': sample_weights}).mean()
            mcc = cross_val_score(model, X_subset, y_subset, cv=cv, scoring='matthews_corrcoef', params={'sample_weight': sample_weights}).mean()

        else:
            accuracy = cross_val_score(model, X_subset, y_subset, cv=cv, scoring='accuracy').mean()
            mcc = cross_val_score(model, X_subset, y_subset, cv=cv, scoring='matthews_corrcoef').mean()

        results.append({
            'size': size,
            'accuracy': accuracy,
            'mcc': mcc
        })
    return pd.DataFrame(results)

# ! TO IMPROVE BY TRAINING ON UNBALANCED AND TESTING ON BALANCED

sizes = [500, 1000, 2500, 5000, 10000, 25000, 50000, 100000, 250000, 500000, 1000000] 
results = evaluate_model_on_different_sizes(X_balanced, y_balanced, model, sizes)
results
# ? Fit Gwen algo on size for both balanced and unbalanced [500, 1000, 2500, 5000, 10000, 25000, 50000, 100000, 250000, 500000, 1000000]

Unnamed: 0,size,accuracy,mcc
0,500,0.556,0.113326
1,1000,0.599,0.197996
2,2500,0.6144,0.229231
3,5000,0.6328,0.265649
4,10000,0.65,0.300162
5,25000,0.68548,0.371189
6,50000,0.70276,0.406013
7,100000,0.71,0.420605
8,250000,0.721776,0.444175
9,500000,0.729682,0.460252


In [5]:
results_unb = evaluate_model_on_different_sizes(X_unbalanced, y_unbalanced, model, sizes)
results_unb

Unnamed: 0,size,accuracy,mcc
0,500,0.782,0.003534
1,1000,0.8,0.009626
2,2500,0.8076,0.066034
3,5000,0.7916,0.124845
4,10000,0.7978,0.190525
5,25000,0.79864,0.231514
6,50000,0.79972,0.269425
7,100000,0.80719,0.317422
8,250000,0.814072,0.355603
9,500000,0.81924,0.378025


In [6]:
def evaluate_model_on_different_grams(X, y, model, grams_to_test, cv=5):
    results = []
    for grams in grams_to_test:
        print(grams)
        X_input = CountVectorizer(stop_words='english', ngram_range=grams).fit_transform(X)

        accuracy = cross_val_score(model, X_input, y, cv=cv, scoring='accuracy').mean()
        mcc = cross_val_score(model, X_input, y, cv=cv, scoring='matthews_corrcoef').mean()

        results.append({
            'ngram_range': grams,
            'accuracy': accuracy,
            'mcc': mcc
        })

    return pd.DataFrame(results)

df_balanced_250 = pd.concat([df.loc[df['true_sentiment'] == 1].sample(125000), df.loc[df['true_sentiment'] == 0].sample(125000)]).sample(250000)

X = df_balanced_250['content_cleaned']
y = df_balanced_250['true_sentiment']

grams_to_test = [(1, 1), (1, 2), (1, 3), (1, 4)]
results_grams = evaluate_model_on_different_grams(X, y, model, grams_to_test)
results_grams

(1, 1)
(1, 2)
(1, 3)
(1, 4)


Unnamed: 0,ngram_range,accuracy,mcc
0,"(1, 1)",0.721128,0.443133
1,"(1, 2)",0.749872,0.501025
2,"(1, 3)",0.750364,0.502947
3,"(1, 4)",0.747816,0.499229


In [7]:
def evaluate_model_on_different_prepro(X, y, model, prepro_params, cv=5):
    results = []

    for prepro in prepro_params:
        print(prepro)

        if prepro.get('token_pattern', 'None') != 'None':

            def clean_reg(text, pattern):
                text = re.sub(pattern, ' ', text)
                text = re.sub(' +', ' ', text)

                return text

            X_emojis = X.copy().apply(clean_reg, pattern=prepro['token_pattern'])
            X_input = CountVectorizer(ngram_range=(1, 2), token_pattern=r'[^\s]+ ').fit_transform(X_emojis)

        else:
            X_input = CountVectorizer(ngram_range=(1, 2), **prepro).fit_transform(X)

        accuracy = cross_val_score(model, X_input, y, cv=cv, scoring='accuracy').mean()
        mcc = cross_val_score(model, X_input, y, cv=cv, scoring='matthews_corrcoef').mean()

        results.append({
            'ngram_range': prepro,
            'accuracy': accuracy,
            'mcc': mcc
        })

    return pd.DataFrame(results)

prepro_params = [
    {},   # ? Benchmark
    {'token_pattern': r'[^(?u)\b\w\w+\b|,|!|\?|%|\+|=|-|:|;|\(|\)|\]]'}, # ? Punctuation
    {'tokenizer': lambda x: PorterStemmer().stem(x)}, # ? Stemmer
    {'token_pattern': r'[^(?u)\b\w\w+\b|\p{Extended_Pictographic}]|[0-9]'}, # ? Emojis
    {'stop_words': 'english'}, # ? StopWords 
    {'token_pattern': r'[^(?u)\b\w\w+\b|,|!|\?|%|\+|=|-|:|;|\(|\)|\]|/<a?:.+?:\d{18}>|\p{Extended_Pictographic}]|[0-9]'}, # ? Emojis + Punctuation
    {'tokenizer': lambda x: [' '.join([t[0], t[1]]) for t in nltk.pos_tag(NLTKWordTokenizer().tokenize(x))]} # ? PosTagging 
]

res = evaluate_model_on_different_prepro(X, y, model, prepro_params)
res

{}
{'token_pattern': '[^(?u)\\b\\w\\w+\\b|,|!|\\?|%|\\+|=|-|:|;|\\(|\\)|\\]]'}
{'tokenizer': <function <lambda> at 0x7b15ed1c27a0>}




{'token_pattern': '[^(?u)\\b\\w\\w+\\b|\\p{Extended_Pictographic}]|[0-9]'}
{'stop_words': 'english'}
{'token_pattern': '[^(?u)\\b\\w\\w+\\b|,|!|\\?|%|\\+|=|-|:|;|\\(|\\)|\\]|/<a?:.+?:\\d{18}>|\\p{Extended_Pictographic}]|[0-9]'}
{'tokenizer': <function <lambda> at 0x7b15ecf5e320>}




Unnamed: 0,ngram_range,accuracy,mcc
0,{},0.767136,0.535403
1,"{'token_pattern': '[^(?u)\b\w\w+\b|,|!|\?|%|\+...",0.750128,0.502064
2,{'tokenizer': <function <lambda> at 0x7b15ed1c...,0.624652,0.261176
3,{'token_pattern': '[^(?u)\b\w\w+\b|\p{Extended...,0.753804,0.508775
4,{'stop_words': 'english'},0.749872,0.501025
5,"{'token_pattern': '[^(?u)\b\w\w+\b|,|!|\?|%|\+...",0.742308,0.486142
6,{'tokenizer': <function <lambda> at 0x7b15ecf5...,0.766508,0.535156
