In [1]:
# Useful starting lines
%matplotlib inline
from sklearn.model_selection import cross_val_score, GridSearchCV
import numpy as np
from helpers import *
from data_preprocess import *
%load_ext autoreload
%autoreload 2

## Load and Preprocess Data 

In [2]:
# Load the raw tweets and build one combined training corpus.
# Positive tweets come first, negative tweets second: the label vector built
# further down ([1] * n + [0] * n) relies on exactly this ordering.
PATH_POS = 'twitter-datasets/train_pos.txt'
PATH_NEG = 'twitter-datasets/train_neg.txt'
PATH_TEST = 'twitter-datasets/test_data.txt'
PATH_COMBINE = 'twitter-datasets/train_combine.txt'

with open(PATH_POS) as fp:
    data_pos = fp.read()

with open(PATH_NEG) as fp:
    data_neg = fp.read()

# If the positive file does not end with a newline, plain concatenation would
# glue its last tweet onto the first negative tweet, silently dropping one
# sample and shifting every label after it. Insert the missing line break.
separator = "" if (not data_pos or data_pos.endswith("\n")) else "\n"
data = data_pos + separator + data_neg

with open(PATH_COMBINE, 'w') as fp:
    fp.write(data)

# data preprocess
# data_process is a project helper brought in by a star import; it receives the
# combined-train and test input paths plus the two output paths.
# NOTE(review): presumably it writes the cleaned text to PATH_TRAIN_DATA and
# PATH_TEST_DATA - confirm against its definition.
PATH_TRAIN_DATA = "twitter-datasets/train_preprocess"
PATH_TEST_DATA = "twitter-datasets/test_preprocess"
data_process(PATH_COMBINE, PATH_TRAIN_DATA, PATH_TEST, PATH_TEST_DATA)

# read data
train_data = read_train(PATH_TRAIN_DATA)

## Compute FastText Word Embeddings

In [3]:
# Train unsupervised FastText vectors on the preprocessed training text.
# we use continuous bag of words for word embedding
dimension = 100
model = fasttext.train_unsupervised(PATH_TRAIN_DATA, model='cbow', dim=dimension)
vocabulary = model.words
word_embeddings = np.array([model[word] for word in vocabulary])

# create our final training data
x_train = compute_word_embedding(model, train_data, dimension, vocabulary)

# Number of tweets per class. Positive samples come first, negative second.
# Change to 1250000 if you want to use the full dataset.
N_PER_CLASS = 100000
y_train = [1] * N_PER_CLASS + [0] * N_PER_CLASS

# Fail early, with a clear message, if the hand-built labels do not line up with
# the feature rows (otherwise the mismatch only surfaces inside scikit-learn).
n_samples = x_train.shape[0] if hasattr(x_train, "shape") else len(x_train)
if n_samples != len(y_train):
    raise ValueError(
        "x_train has {} samples but y_train has {} labels; "
        "adjust N_PER_CLASS to match the loaded dataset".format(n_samples, len(y_train))
    )

## Cross Validation

In [9]:
# Gaussian Naive Bayes: 4-fold cross-validated accuracy on (x_train, y_train).
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
scores = cross_val_score(
    classifier,
    x_train,
    y_train,
    scoring='accuracy',
    cv=4,
)
scores_mean = np.mean(scores)

# Per-fold accuracies on the first line, their mean on the second.
print(scores, scores_mean, sep="\n")

[0.64852 0.64968 0.64988 0.64354]
0.647905


In [4]:
# Random Forest classifier: 4-fold cross-validated accuracy on (x_train, y_train).
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forest, and therefore the reported scores, are reproducible.
RF_SEED = 42

# The grid search is expensive; flip this flag to re-run the parameter sweep.
RUN_RF_GRID_SEARCH = False

if RUN_RF_GRID_SEARCH:
    # run grid search cross validation to find best parameters
    rf_param = {
        'n_estimators': [100, 300, 500, 700, 1000],
        'criterion': ['gini', 'entropy'],
        'bootstrap': [True, False]
    }
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=RF_SEED),
        param_grid=rf_param,
        scoring='accuracy',
        cv=4,
        n_jobs=1,
        verbose=10,
    )
    grid_search.fit(x_train, y_train)

    # view the best parameters and score
    print(grid_search.best_params_)
    print(grid_search.best_score_)

# NOTE(review): n_estimators=300 is presumably the value the grid search
# selected - the search output is not recorded here, so confirm.
classifier = RandomForestClassifier(n_estimators=300, random_state=RF_SEED)
scores = cross_val_score(classifier, x_train, y_train, cv=4, scoring='accuracy')
scores_mean = scores.mean()

print(scores)
print(scores_mean)

[0.7663  0.76078 0.75726 0.75944]
0.7609450000000001


In [5]:
# SVM: 4-fold cross-validated accuracy of a StandardScaler + SVC pipeline.
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The grid search is expensive; flip this flag to re-run the parameter sweep.
RUN_SVM_GRID_SEARCH = False

if RUN_SVM_GRID_SEARCH:
    # run grid search cross validation to find best parameters
    # The search tunes the same scaler + SVC pipeline that is evaluated below,
    # so the selected parameters apply to the model actually reported.
    # make_pipeline names the SVC step 'svc', hence the 'svc__' prefixes.
    param_svm = {
        'svc__C': [0.001, 0.01, 0.1, 1],
        'svc__gamma': ['scale', 'auto'],
        'svc__kernel': ['linear', 'rbf'],
    }
    grid_search = GridSearchCV(
        estimator=make_pipeline(StandardScaler(), SVC()),
        param_grid=param_svm,
        scoring='accuracy',
        cv=4,
        n_jobs=1,
        verbose=10,
    )
    grid_search.fit(x_train, y_train)

    # view the best parameters and score
    print(grid_search.best_params_)
    print(grid_search.best_score_)

# Scaling lives inside the pipeline so that, within cross-validation, the
# scaler is fitted on each training split only.
classifier = make_pipeline(StandardScaler(), SVC(gamma='auto'))
scores = cross_val_score(classifier, x_train, y_train, cv=4, scoring='accuracy')
scores_mean = scores.mean()

print(scores)
print(scores_mean)

[0.7957  0.79472 0.79212 0.79032]
0.793215
