In [2]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv('/home/jason/sentiment_data/clean.txt', sep='&&&', header=None, names=['id','sentiment','text'],
                engine='python', index_col=0)

In [4]:
SAMPLE_PERCENTAGE: float = 0.1
RANDOM_SAMPLE_SEED: int = 42

param_template = {
    'alpha': np.arange(0.1, 10.0, 0.5),
    'fit_prior': [True, False],
    'norm': [True, False]
}

parameter_grid = ParameterGrid(param_template)
len(parameter_grid)

80

In [5]:
def preprocess(text):
    # remove links and special characters
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split())

In [6]:
for parameters in parameter_grid:
    df_sample = df.sample(frac=SAMPLE_PERCENTAGE, random_state=RANDOM_SAMPLE_SEED)
    df_sample['text_processed'] = df_sample['text'].apply(preprocess)
    
    cv = CountVectorizer(binary=True, stop_words='english')
    cv.fit(df_sample['text_processed'])
    X = cv.transform(df_sample['text_processed'])


    X_train, X_val, y_train, y_val = train_test_split(
        X, df_sample['sentiment'], train_size = 0.8
    )
    
    nb = ComplementNB(alpha=parameters['alpha'],
                    fit_prior=parameters['fit_prior'],
                    norm=parameters['norm'])
                            
    nb.fit(X_train, y_train)
    parameters['score'] = str(accuracy_score(y_val, nb.predict(X_val)))
    print (str(parameters))
    

{'alpha': 0.1, 'fit_prior': True, 'norm': True, 'score': '0.727'}
{'alpha': 0.1, 'fit_prior': True, 'norm': False, 'score': '0.72465'}
{'alpha': 0.1, 'fit_prior': False, 'norm': True, 'score': '0.72885'}
{'alpha': 0.1, 'fit_prior': False, 'norm': False, 'score': '0.72905'}
{'alpha': 0.6, 'fit_prior': True, 'norm': True, 'score': '0.7345'}
{'alpha': 0.6, 'fit_prior': True, 'norm': False, 'score': '0.743'}
{'alpha': 0.6, 'fit_prior': False, 'norm': True, 'score': '0.7435'}
{'alpha': 0.6, 'fit_prior': False, 'norm': False, 'score': '0.74235'}
{'alpha': 1.1, 'fit_prior': True, 'norm': True, 'score': '0.74245'}
{'alpha': 1.1, 'fit_prior': True, 'norm': False, 'score': '0.74535'}
{'alpha': 1.1, 'fit_prior': False, 'norm': True, 'score': '0.74205'}
{'alpha': 1.1, 'fit_prior': False, 'norm': False, 'score': '0.74965'}
{'alpha': 1.6, 'fit_prior': True, 'norm': True, 'score': '0.74375'}
{'alpha': 1.6, 'fit_prior': True, 'norm': False, 'score': '0.74685'}
{'alpha': 1.6, 'fit_prior': False, 'norm'