In [8]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text as text2
from sklearn import decomposition, ensemble

import pandas, numpy as np, textblob, string
from sklearn.utils import shuffle
import operator as op

In [9]:
def train_model(classifier, feature_vector_train, label, feature_vector_test, is_neural_net=False, should_do_common=False):
	# fit the training dataset on the classifier
	classifier.fit(feature_vector_train, label)
	
	train_preds = classifier.predict(feature_vector_train)
	test_preds = classifier.predict(feature_vector_test)

	# stop_words = text2.ENGLISH_STOP_WORDS.union(["http", "https", "amp", "amb"])
	
	if is_neural_net:
		test_preds = test_preds.argmax(axis=-1)

	if should_do_common:
	    feature_names = count_vect.get_feature_names()
	    diff = classifier.feature_log_prob_[1,:] - np.max(classifier.feature_log_prob_[0:])

	    name_diff = {}
	    for i in range(len(feature_names)):
	       name_diff[feature_names[i]] = diff[i]

	       names_diff_sorted = sorted(name_diff.items(), key = op.itemgetter(1), reverse = True)
	    c = 0
	    i = 0
	    while c < 50:
	       if names_diff_sorted[i][0] in stop_words or len(names_diff_sorted[i][0]) <= 2:
	       	 i += 1
	       	 continue
	       print(names_diff_sorted[i])
	       c += 1
	       i += 1
	
	train_acc = metrics.accuracy_score(train_preds, train_y)
	test_acc = metrics.accuracy_score(test_preds, test_y)
	cm = metrics.confusion_matrix(test_y, test_preds)
	print('Train Accuracy: ', train_acc)
	print('Test Accuracy: ', test_acc)
	print('Confusion matrix: ', cm)
	return (test_acc, cm)
    

In [10]:
# load positive labels
pos = open('Dataset/Positive_tweets(10000).csv').read()
npos = 0
labels, texts = [], []
for i, line in enumerate(pos.split("\n")):
    content = line.split(',')
    if len(content) < 4:
    	continue;
    if content[4] != "English":
    	continue;
    labels.append(1)
    texts.append(content[2])
    npos += 1

# load negative labels (random tweets)
neg = open('Dataset/Negative_tweets(10000).txt').read()
nneg = 0
for i, line in enumerate(neg.split("\n")):
    words = line.split(" ")
    newst = ""
    for j in range(len(words)):
        if(words[j].startswith("http")):
            break
        newst = newst + " " + words[j]
    newst = newst.strip()
    if(newst == ""):
        continue
    
    labels.append(0)
    texts.append(newst)
    nneg += 1

texts, labels = shuffle(texts, labels)

print('Total number of datapoints: ', len(labels))
print('Positive labels: ', npos)
print('Negative labels: ', nneg)

Total number of datapoints:  11898
Positive labels:  4541
Negative labels:  7357


In [11]:
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

train_x, test_x, train_y, test_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], train_size=0.8,random_state=42)

print('Size of training set: ', len(train_x))
print('Size of Test set:', len(test_x))

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

binary_count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', binary=True)
binary_count_vect.fit(trainDF['text'])

count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

binary_xtrain_count = binary_count_vect.transform(train_x)
binary_xtest_count = binary_count_vect.transform(test_x)

xtrain_count =  count_vect.transform(train_x)
xtest_count =  count_vect.transform(test_x)

Size of training set:  9518
Size of Test set: 2380


In [12]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)

In [13]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_x)

In [14]:
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_x)

In [15]:
# Naive Bayes on Count Vectors
print("NB, Binary Count Vectors: ")
accuracy = train_model(naive_bayes.MultinomialNB(), binary_xtrain_count, train_y, binary_xtest_count)

# Naive Bayes on Count Vectors
print("NB, Count Vectors: ")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xtest_count)

# Naive Bayes on Word Level TF IDF Vectors
print("NB, WordLevel TF-IDF: ")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xtest_tfidf)

# Naive Bayes on Ngram Level TF IDF Vectors
print("NB, N-Gram Vectors: ")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# Naive Bayes on Character Level TF IDF Vectors
print("NB, CharLevel Vectors: ")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)

NB, Binary Count Vectors: 
Train Accuracy:  0.9825593612103383
Test Accuracy:  0.9689075630252101
Confusion matrix:  [[1450   28]
 [  46  856]]
NB, Count Vectors: 
Train Accuracy:  0.9816137844084892
Test Accuracy:  0.9680672268907563
Confusion matrix:  [[1450   28]
 [  48  854]]
NB, WordLevel TF-IDF: 
Train Accuracy:  0.9724732086572809
Test Accuracy:  0.9626050420168067
Confusion matrix:  [[1454   24]
 [  65  837]]
NB, N-Gram Vectors: 
Train Accuracy:  0.9271905862576172
Test Accuracy:  0.9096638655462185
Confusion matrix:  [[1415   63]
 [ 152  750]]
NB, CharLevel Vectors: 
Train Accuracy:  0.9619667997478462
Test Accuracy:  0.9512605042016806
Confusion matrix:  [[1445   33]
 [  83  819]]


In [16]:
# LR on Count Vectors
print("LR, Binary Count Vectors: ")
accuracy = train_model(linear_model.LogisticRegression(), binary_xtrain_count, train_y, binary_xtest_count)

# Linear Classifier on Count Vectors
print("LR, Count Vectors: ")
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xtest_count)

# Linear Classifier on Word Level TF IDF Vectors
print("LR, WordLevel TF-IDF: ")
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf)

# Linear Classifier on Ngram Level TF IDF Vectors
print("LR, N-Gram Vectors: ")
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# Linear Classifier on Character Level TF IDF Vectors
print("LR, CharLevel Vectors: ")
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)

LR, Binary Count Vectors: 
Train Accuracy:  0.9947467955452827
Test Accuracy:  0.9596638655462185
Confusion matrix:  [[1440   38]
 [  58  844]]
LR, Count Vectors: 
Train Accuracy:  0.9949569237234713
Test Accuracy:  0.957983193277311
Confusion matrix:  [[1442   36]
 [  64  838]]
LR, WordLevel TF-IDF: 
Train Accuracy:  0.9768859003992435
Test Accuracy:  0.9571428571428572
Confusion matrix:  [[1457   21]
 [  81  821]]
LR, N-Gram Vectors: 
Train Accuracy:  0.9097499474679555
Test Accuracy:  0.8899159663865546
Confusion matrix:  [[1462   16]
 [ 246  656]]
LR, CharLevel Vectors: 
Train Accuracy:  0.987287245219584
Test Accuracy:  0.9819327731092437
Confusion matrix:  [[1466   12]
 [  31  871]]


In [17]:
def svm_tune(x, y):
		Cs = [0.001, 0.01, 0.1, 1, 10]
		gammas = [0.001, 0.01, 0.1, 1]
		grid = {'C': Cs, 'gamma': gammas}
		search = GridSearchCV(svm.SVC(kernel='rbf'), grid)
		search.fit(x, y)
		search.best_params_
		return search.best_params_

# SVM on Bin Count Vectors
print("SVM, Binary Count Vectors: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), binary_xtrain_count, train_y, binary_xtest_count)

# SVM on Count Vectors
print("SVM, Count Vectors: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), xtrain_count, train_y, xtest_count)

# SVM on Word Level TF IDF Vectors
print("SVM, WordLevel TF-IDF: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), xtrain_tfidf, train_y, xtest_tfidf)

# SVM on Ngram Level TF IDF Vectors
print("SVM, N-Gram Vectors: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# SVM on Character Level TF IDF Vectors
print("SVM, CharLevel Vectors: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)

SVM, Binary Count Vectors: 
Train Accuracy:  1.0
Test Accuracy:  0.9592436974789916
Confusion matrix:  [[1443   35]
 [  62  840]]
SVM, Count Vectors: 
Train Accuracy:  1.0
Test Accuracy:  0.9558823529411765
Confusion matrix:  [[1446   32]
 [  73  829]]
SVM, WordLevel TF-IDF: 
Train Accuracy:  0.9940113469216222
Test Accuracy:  0.9642857142857143
Confusion matrix:  [[1438   40]
 [  45  857]]
SVM, N-Gram Vectors: 
Train Accuracy:  0.9447362891363732
Test Accuracy:  0.8970588235294118
Confusion matrix:  [[1424   54]
 [ 191  711]]
SVM, CharLevel Vectors: 
Train Accuracy:  0.9989493591090566
Test Accuracy:  0.9865546218487395
Confusion matrix:  [[1466   12]
 [  20  882]]


In [18]:
print("Linear SVM, Binary Count Vectors: ")
accuracy = train_model(svm.LinearSVC(), binary_xtrain_count, train_y, binary_xtest_count)

# SVM on Count Vectors
print("Linear SVM, Count Vectors: ")
accuracy = train_model(svm.LinearSVC(), xtrain_count, train_y, xtest_count)

# SVM on Word Level TF IDF Vectors
print("Linear SVM, WordLevel TF-IDF: ")
accuracy = train_model(svm.LinearSVC(), xtrain_tfidf, train_y, xtest_tfidf)

# SVM on Ngram Level TF IDF Vectors
print("Linear SVM, N-Gram Vectors: ")
accuracy = train_model(svm.LinearSVC(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# SVM on Character Level TF IDF Vectors
print("Linear SVM, CharLevel Vectors: ")
accuracy = train_model(svm.LinearSVC(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)

Linear SVM, Binary Count Vectors: 
Train Accuracy:  0.9998949359109056
Test Accuracy:  0.9584033613445379
Confusion matrix:  [[1440   38]
 [  61  841]]
Linear SVM, Count Vectors: 
Train Accuracy:  0.9997898718218113
Test Accuracy:  0.9588235294117647
Confusion matrix:  [[1443   35]
 [  63  839]]
Linear SVM, WordLevel TF-IDF: 
Train Accuracy:  0.9956923723471317
Test Accuracy:  0.9647058823529412
Confusion matrix:  [[1441   37]
 [  47  855]]
Linear SVM, N-Gram Vectors: 
Train Accuracy:  0.9485185963437697
Test Accuracy:  0.8991596638655462
Confusion matrix:  [[1423   55]
 [ 185  717]]
Linear SVM, CharLevel Vectors: 
Train Accuracy:  0.9992645513763395
Test Accuracy:  0.9873949579831933
Confusion matrix:  [[1468   10]
 [  20  882]]


In [19]:
print("RF, Binary Count Vectors: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), binary_xtrain_count, train_y, binary_xtest_count)

# RF on Count Vectors
print("RF, Count Vectors: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_count, train_y, xtest_count)

# RF on Word Level TF IDF Vectors
print("RF, WordLevel TF-IDF: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_tfidf, train_y, xtest_tfidf)

# RF on Ngram Level TF IDF Vectors
print("RF, N-Gram Vectors: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# RF on Character Level TF IDF Vectors
print("RF, CharLevel Vectors: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)

RF, Binary Count Vectors: 
Train Accuracy:  1.0
Test Accuracy:  0.9487394957983193
Confusion matrix:  [[1458   20]
 [ 102  800]]
RF, Count Vectors: 
Train Accuracy:  1.0
Test Accuracy:  0.9516806722689075
Confusion matrix:  [[1465   13]
 [ 102  800]]
RF, WordLevel TF-IDF: 
Train Accuracy:  0.9988442950199622
Test Accuracy:  0.954201680672269
Confusion matrix:  [[1448   30]
 [  79  823]]
RF, N-Gram Vectors: 
Train Accuracy:  0.957974364362261
Test Accuracy:  0.8995798319327731
Confusion matrix:  [[1422   56]
 [ 183  719]]
RF, CharLevel Vectors: 
Train Accuracy:  0.9998949359109056
Test Accuracy:  0.984873949579832
Confusion matrix:  [[1463   15]
 [  21  881]]


In [20]:
print("RF, Binary Count Vectors: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), binary_xtrain_count, train_y, binary_xtest_count)

# RF on Count Vectors
print("RF, Count Vectors: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), xtrain_count, train_y, xtest_count)

# RF on Word Level TF IDF Vectors
print("RF, WordLevel TF-IDF: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), xtrain_tfidf, train_y, xtest_tfidf)

# RF on Ngram Level TF IDF Vectors
print("RF, N-Gram Vectors: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# RF on Character Level TF IDF Vectors
print("RF, CharLevel Vectors: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)

RF, Binary Count Vectors: 
Train Accuracy:  0.9373818028997689
Test Accuracy:  0.9331932773109244
Confusion matrix:  [[1338  140]
 [  19  883]]
RF, Count Vectors: 
Train Accuracy:  0.9374868669888632
Test Accuracy:  0.934453781512605
Confusion matrix:  [[1341  137]
 [  19  883]]
RF, WordLevel TF-IDF: 
Train Accuracy:  0.9388527001470898
Test Accuracy:  0.9327731092436975
Confusion matrix:  [[1336  142]
 [  18  884]]
RF, N-Gram Vectors: 
Train Accuracy:  0.8712964908594243
Test Accuracy:  0.8676470588235294
Confusion matrix:  [[1465   13]
 [ 302  600]]
RF, CharLevel Vectors: 
Train Accuracy:  0.987287245219584
Test Accuracy:  0.9869747899159664
Confusion matrix:  [[1475    3]
 [  28  874]]
