In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text as text2
from sklearn import decomposition, ensemble

import pandas, numpy as np, textblob, string
from sklearn.utils import shuffle
import operator as op

In [2]:
def train_model(classifier, feature_vector_train, label, feature_vector_test, is_neural_net=False, should_do_common=False):
	# fit the training dataset on the classifier
	classifier.fit(feature_vector_train, label)
	
	train_preds = classifier.predict(feature_vector_train)
	test_preds = classifier.predict(feature_vector_test)

	# stop_words = text2.ENGLISH_STOP_WORDS.union(["http", "https", "amp", "amb"])
	
	if is_neural_net:
		test_preds = test_preds.argmax(axis=-1)

	if should_do_common:
	    feature_names = count_vect.get_feature_names()
	    diff = classifier.feature_log_prob_[1,:] - np.max(classifier.feature_log_prob_[0:])

	    name_diff = {}
	    for i in range(len(feature_names)):
	       name_diff[feature_names[i]] = diff[i]

	       names_diff_sorted = sorted(name_diff.items(), key = op.itemgetter(1), reverse = True)
	    c = 0
	    i = 0
	    while c < 50:
	       if names_diff_sorted[i][0] in stop_words or len(names_diff_sorted[i][0]) <= 2:
	       	 i += 1
	       	 continue
	       print(names_diff_sorted[i])
	       c += 1
	       i += 1
	
	train_acc = metrics.accuracy_score(train_preds, train_y)
	test_acc = metrics.accuracy_score(test_preds, test_y)
	cm = metrics.confusion_matrix(test_y, test_preds)
	print('Train Accuracy: ', train_acc)
	print('Test Accuracy: ', test_acc)
	print('Confusion matrix: ', cm)
	return (test_acc, cm)
    

In [3]:
# load positive labels
pos = open('Dataset/Positive_tweets(10000).csv').read()
npos = 0
labels, texts = [], []
for i, line in enumerate(pos.split("\n")):
    content = line.split(',')
    if len(content) < 4:
    	continue;
    if content[4] != "English":
    	continue;
    labels.append(1)
    texts.append(content[2])
    npos += 1

# load negative labels (random tweets)
neg = open('Dataset/Negative_tweets(10000).txt').read()
nneg = 0
for i, line in enumerate(neg.split("\n")):
    labels.append(0)
    texts.append(line)
    nneg += 1

texts, labels = shuffle(texts, labels)

print('Total number of datapoints: ', len(labels))
print('Positive labels: ', npos)
print('Negative labels: ', nneg)

Total number of datapoints:  12514
Positive labels:  4541
Negative labels:  7973


In [10]:
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

train_x, test_x, train_y, test_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], train_size=0.8,random_state=42)

print('Size of training set: ', len(train_x))
print('Size of Test set:', len(test_x))

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

binary_count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', binary=True)
binary_count_vect.fit(trainDF['text'])

count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

binary_xtrain_count = binary_count_vect.transform(train_x)
binary_xtest_count = binary_count_vect.transform(test_x)

xtrain_count =  count_vect.transform(train_x)
xtest_count =  count_vect.transform(test_x)

Size of training set:  10011
Size of Test set: 2503


In [11]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)

In [12]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_x)

In [13]:
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_x)

In [16]:
# Naive Bayes on Count Vectors
print("NB, Binary Count Vectors: ")
accuracy = train_model(naive_bayes.MultinomialNB(), binary_xtrain_count, train_y, binary_xtest_count)

# Naive Bayes on Count Vectors
print("NB, Count Vectors: ")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xtest_count)

# Naive Bayes on Word Level TF IDF Vectors
print("NB, WordLevel TF-IDF: ")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xtest_tfidf)

# Naive Bayes on Ngram Level TF IDF Vectors
print("NB, N-Gram Vectors: ")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# Naive Bayes on Character Level TF IDF Vectors
print("NB, CharLevel Vectors: ")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)

NB, Binary Count Vectors: 
Train Accuracy:  0.9770252722005793
Test Accuracy:  0.9460647223332002
Confusion matrix:  [[1561   36]
 [  99  807]]
NB, Count Vectors: 
Train Accuracy:  0.9785236240135851
Test Accuracy:  0.9468637634838194
Confusion matrix:  [[1560   37]
 [  96  810]]
NB, WordLevel TF-IDF: 
Train Accuracy:  0.9517530716212167
Test Accuracy:  0.9328805433479824
Confusion matrix:  [[1565   32]
 [ 136  770]]
NB, N-Gram Vectors: 
Train Accuracy:  0.8871241634202377
Test Accuracy:  0.8589692369157012
Confusion matrix:  [[1514   83]
 [ 270  636]]
NB, CharLevel Vectors: 
Train Accuracy:  0.9375686744580961
Test Accuracy:  0.9304834198961246
Confusion matrix:  [[1578   19]
 [ 155  751]]


In [15]:
# LR on Count Vectors
print("LR, Binary Count Vectors: ")
accuracy = train_model(linear_model.LogisticRegression(), binary_xtrain_count, train_y, binary_xtest_count)

# Linear Classifier on Count Vectors
print("LR, Count Vectors: ")
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xtest_count)

# Linear Classifier on Word Level TF IDF Vectors
print("LR, WordLevel TF-IDF: ")
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf)

# Linear Classifier on Ngram Level TF IDF Vectors
print("LR, N-Gram Vectors: ")
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# Linear Classifier on Character Level TF IDF Vectors
print("LR, CharLevel Vectors: ")
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)

LR, Binary Count Vectors: 
Train Accuracy:  0.991309559484567
Test Accuracy:  0.9428685577307231
Confusion matrix:  [[1557   40]
 [ 103  803]]
LR, Count Vectors: 
Train Accuracy:  0.9916092298471681
Test Accuracy:  0.9424690371554135
Confusion matrix:  [[1555   42]
 [ 102  804]]
LR, WordLevel TF-IDF: 
Train Accuracy:  0.9652382379382679
Test Accuracy:  0.9356771873751498
Confusion matrix:  [[1558   39]
 [ 122  784]]
LR, N-Gram Vectors: 
Train Accuracy:  0.8872240535411048
Test Accuracy:  0.8497802636835797
Confusion matrix:  [[1564   33]
 [ 343  563]]
LR, CharLevel Vectors: 
Train Accuracy:  0.9741284586954351
Test Accuracy:  0.9596484218937276
Confusion matrix:  [[1578   19]
 [  82  824]]


In [17]:
def svm_tune(x, y):
		Cs = [0.001, 0.01, 0.1, 1, 10]
		gammas = [0.001, 0.01, 0.1, 1]
		grid = {'C': Cs, 'gamma': gammas}
		search = GridSearchCV(svm.SVC(kernel='rbf'), grid)
		search.fit(x, y)
		search.best_params_
		return search.best_params_

# SVM on Bin Count Vectors
print("SVM, Binary Count Vectors: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), binary_xtrain_count, train_y, binary_xtest_count)

# SVM on Count Vectors
print("SVM, Count Vectors: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), xtrain_count, train_y, xtest_count)

# SVM on Word Level TF IDF Vectors
print("SVM, WordLevel TF-IDF: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), xtrain_tfidf, train_y, xtest_tfidf)

# SVM on Ngram Level TF IDF Vectors
print("SVM, N-Gram Vectors: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# SVM on Character Level TF IDF Vectors
print("SVM, CharLevel Vectors: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)

SVM, Binary Count Vectors: 
Train Accuracy:  1.0
Test Accuracy:  0.9452656811825809
Confusion matrix:  [[1549   48]
 [  89  817]]
SVM, Count Vectors: 
Train Accuracy:  1.0
Test Accuracy:  0.9424690371554135
Confusion matrix:  [[1556   41]
 [ 103  803]]
SVM, WordLevel TF-IDF: 
Train Accuracy:  0.9863150534412146
Test Accuracy:  0.9512584898122254
Confusion matrix:  [[1534   63]
 [  59  847]]
SVM, N-Gram Vectors: 
Train Accuracy:  0.9163919688342823
Test Accuracy:  0.8609668397922493
Confusion matrix:  [[1507   90]
 [ 258  648]]
SVM, CharLevel Vectors: 
Train Accuracy:  0.9952052741983818
Test Accuracy:  0.9684378745505393
Confusion matrix:  [[1564   33]
 [  46  860]]


In [19]:
print("Linear SVM, Binary Count Vectors: ")
accuracy = train_model(svm.LinearSVC(), binary_xtrain_count, train_y, binary_xtest_count)

# SVM on Count Vectors
print("Linear SVM, Count Vectors: ")
accuracy = train_model(svm.LinearSVC(), xtrain_count, train_y, xtest_count)

# SVM on Word Level TF IDF Vectors
print("Linear SVM, WordLevel TF-IDF: ")
accuracy = train_model(svm.LinearSVC(), xtrain_tfidf, train_y, xtest_tfidf)

# SVM on Ngram Level TF IDF Vectors
print("Linear SVM, N-Gram Vectors: ")
accuracy = train_model(svm.LinearSVC(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# SVM on Character Level TF IDF Vectors
print("Linear SVM, CharLevel Vectors: ")
accuracy = train_model(svm.LinearSVC(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)

Linear SVM, Binary Count Vectors: 
Train Accuracy:  0.9997003296373989
Test Accuracy:  0.9332800639232921
Confusion matrix:  [[1549   48]
 [ 119  787]]
Linear SVM, Count Vectors: 
Train Accuracy:  0.9997003296373989
Test Accuracy:  0.9344786256492209
Confusion matrix:  [[1549   48]
 [ 116  790]]
Linear SVM, WordLevel TF-IDF: 
Train Accuracy:  0.9882129657376886
Test Accuracy:  0.9452656811825809
Confusion matrix:  [[1557   40]
 [  97  809]]
Linear SVM, N-Gram Vectors: 
Train Accuracy:  0.9208870242732994
Test Accuracy:  0.8581701957650819
Confusion matrix:  [[1511   86]
 [ 269  637]]
Linear SVM, CharLevel Vectors: 
Train Accuracy:  0.9968035161322545
Test Accuracy:  0.9700359568517779
Confusion matrix:  [[1570   27]
 [  48  858]]


In [28]:
print("RF, Binary Count Vectors: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), binary_xtrain_count, train_y, binary_xtest_count)

# RF on Count Vectors
print("RF, Count Vectors: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_count, train_y, xtest_count)

# RF on Word Level TF IDF Vectors
print("RF, WordLevel TF-IDF: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_tfidf, train_y, xtest_tfidf)

# RF on Ngram Level TF IDF Vectors
print("RF, N-Gram Vectors: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# RF on Character Level TF IDF Vectors
print("RF, CharLevel Vectors: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)

RF, Binary Count Vectors: 
Train Accuracy:  1.0
Test Accuracy:  0.9368757491010787
Confusion matrix:  [[1575   22]
 [ 136  770]]
RF, Count Vectors: 
Train Accuracy:  1.0
Test Accuracy:  0.9328805433479824
Confusion matrix:  [[1570   27]
 [ 141  765]]
RF, WordLevel TF-IDF: 
Train Accuracy:  0.9953051643192489
Test Accuracy:  0.9412704754294846
Confusion matrix:  [[1545   52]
 [  95  811]]
RF, N-Gram Vectors: 
Train Accuracy:  0.937368894216362
Test Accuracy:  0.8345984818218138
Confusion matrix:  [[1535   62]
 [ 352  554]]
RF, CharLevel Vectors: 
Train Accuracy:  0.9999001098791329
Test Accuracy:  0.968837395125849
Confusion matrix:  [[1545   52]
 [  26  880]]


In [29]:
print("RF, Binary Count Vectors: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), binary_xtrain_count, train_y, binary_xtest_count)

# RF on Count Vectors
print("RF, Count Vectors: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), xtrain_count, train_y, xtest_count)

# RF on Word Level TF IDF Vectors
print("RF, WordLevel TF-IDF: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), xtrain_tfidf, train_y, xtest_tfidf)

# RF on Ngram Level TF IDF Vectors
print("RF, N-Gram Vectors: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# RF on Character Level TF IDF Vectors
print("RF, CharLevel Vectors: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)

Train Accuracy:  0.892018779342723
Test Accuracy:  0.8905313623651618
Confusion matrix:  [[1360  237]
 [  37  869]]
RF, Count Vectors: 
Train Accuracy:  0.8937169113974628
Test Accuracy:  0.8913304035157811
Confusion matrix:  [[1362  235]
 [  37  869]]
RF, WordLevel TF-IDF: 
Train Accuracy:  0.9123963639996004
Test Accuracy:  0.906512185377547
Confusion matrix:  [[1399  198]
 [  36  870]]
RF, N-Gram Vectors: 
Train Accuracy:  0.7959244830686245
Test Accuracy:  0.7866560127846585
Confusion matrix:  [[1590    7]
 [ 527  379]]
RF, CharLevel Vectors: 
Train Accuracy:  0.9753271401458395
Test Accuracy:  0.9704354774270875
Confusion matrix:  [[1552   45]
 [  29  877]]
