In [15]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text as text2
from sklearn import decomposition, ensemble

import pandas, numpy as np, textblob, string
from sklearn.utils import shuffle
import operator as op

In [16]:
def train_model(classifier, feature_vector_train, label, feature_vector_test, is_neural_net=False, should_do_common=False):
	# fit the training dataset on the classifier
	classifier.fit(feature_vector_train, label)
	
	train_preds = classifier.predict(feature_vector_train)
	test_preds = classifier.predict(feature_vector_test)

	# stop_words = text2.ENGLISH_STOP_WORDS.union(["http", "https", "amp", "amb"])
	
	if is_neural_net:
		test_preds = test_preds.argmax(axis=-1)

	if should_do_common:
	    feature_names = count_vect.get_feature_names()
	    diff = classifier.feature_log_prob_[1,:] - np.max(classifier.feature_log_prob_[0:])

	    name_diff = {}
	    for i in range(len(feature_names)):
	       name_diff[feature_names[i]] = diff[i]

	       names_diff_sorted = sorted(name_diff.items(), key = op.itemgetter(1), reverse = True)
	    c = 0
	    i = 0
	    while c < 50:
	       if names_diff_sorted[i][0] in stop_words or len(names_diff_sorted[i][0]) <= 2:
	       	 i += 1
	       	 continue
	       print(names_diff_sorted[i])
	       c += 1
	       i += 1
	
	train_acc = metrics.accuracy_score(train_preds, train_y)
	test_acc = metrics.accuracy_score(test_preds, test_y)
	cm = metrics.confusion_matrix(test_y, test_preds)
	f1 = metrics.f1_score(test_y, test_preds)
	print('Train Accuracy: ', train_acc)
	print('Test Accuracy: ', test_acc)
	print('Confusion matrix: ', cm)
	print('F1_score: ', f1)
	return (test_acc, cm)
    

In [17]:
# load positive labels
pos = open('Dataset/Positive_tweets(10000).csv').read()
npos = 0
labels, texts = [], []
for i, line in enumerate(pos.split("\n")):
    content = line.split(',')
    if len(content) < 4:
    	continue;
    if content[4] != "English":
    	continue;
    labels.append(1)
    texts.append(content[2])
    npos += 1

# load negative labels (random tweets)
neg = open('Dataset/Negative_tweets(10000).txt').read()
nneg = 0
for i, line in enumerate(neg.split("\n")):
    words = line.split(" ")
    newst = ""
    for j in range(len(words)):
        if(words[j].startswith("http")):
            break
        newst = newst + " " + words[j]
    newst = newst.strip()
    if(newst == ""):
        continue
    
    labels.append(0)
    texts.append(newst)
    nneg += 1

texts, labels = shuffle(texts, labels)

print('Total number of datapoints: ', len(labels))
print('Positive labels: ', npos)
print('Negative labels: ', nneg)

Total number of datapoints:  11898
Positive labels:  4541
Negative labels:  7357


In [18]:
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

train_x, test_x, train_y, test_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], train_size=0.8,random_state=42)

print('Size of training set: ', len(train_x))
print('Size of Test set:', len(test_x))

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

binary_count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', binary=True)
binary_count_vect.fit(trainDF['text'])

count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

binary_xtrain_count = binary_count_vect.transform(train_x)
binary_xtest_count = binary_count_vect.transform(test_x)

xtrain_count =  count_vect.transform(train_x)
xtest_count =  count_vect.transform(test_x)

Size of training set:  9518
Size of Test set: 2380


In [19]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)

In [20]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_x)

In [21]:
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_x)

In [22]:
# Naive Bayes on Count Vectors
print("\nNB, Binary Count Vectors: ")
accuracy = train_model(naive_bayes.MultinomialNB(), binary_xtrain_count, train_y, binary_xtest_count)

# Naive Bayes on Count Vectors
print("\nNB, Count Vectors: ")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xtest_count)

# Naive Bayes on Word Level TF IDF Vectors
print("\nNB, WordLevel TF-IDF: ")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xtest_tfidf)

# Naive Bayes on Ngram Level TF IDF Vectors
print("\nNB, N-Gram Vectors: ")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# Naive Bayes on Character Level TF IDF Vectors
print("\nNB, CharLevel Vectors: ")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)


NB, Binary Count Vectors: 
Train Accuracy:  0.9833998739230931
Test Accuracy:  0.9651260504201681
Confusion matrix:  [[1461   19]
 [  64  836]]
F1_score:  0.9527065527065527

NB, Count Vectors: 
Train Accuracy:  0.982769489388527
Test Accuracy:  0.9613445378151261
Confusion matrix:  [[1458   22]
 [  70  830]]
F1_score:  0.9474885844748859

NB, WordLevel TF-IDF: 
Train Accuracy:  0.9736289136373187
Test Accuracy:  0.9558823529411765
Confusion matrix:  [[1463   17]
 [  88  812]]
F1_score:  0.9392712550607288

NB, N-Gram Vectors: 
Train Accuracy:  0.9296070603067872
Test Accuracy:  0.9042016806722689
Confusion matrix:  [[1425   55]
 [ 173  727]]
F1_score:  0.8644470868014269

NB, CharLevel Vectors: 
Train Accuracy:  0.960075646144148
Test Accuracy:  0.9525210084033613
Confusion matrix:  [[1459   21]
 [  92  808]]
F1_score:  0.9346443030653557


In [23]:
# LR on Count Vectors
print("\nLR, Binary Count Vectors: ")
accuracy = train_model(linear_model.LogisticRegression(), binary_xtrain_count, train_y, binary_xtest_count)

# Linear Classifier on Count Vectors
print("\nLR, Count Vectors: ")
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xtest_count)

# Linear Classifier on Word Level TF IDF Vectors
print("\nLR, WordLevel TF-IDF: ")
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf)

# Linear Classifier on Ngram Level TF IDF Vectors
print("\nLR, N-Gram Vectors: ")
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# Linear Classifier on Character Level TF IDF Vectors
print("\nLR, CharLevel Vectors: ")
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)


LR, Binary Count Vectors: 
Train Accuracy:  0.994851859634377
Test Accuracy:  0.9626050420168067
Confusion matrix:  [[1453   27]
 [  62  838]]
F1_score:  0.9495750708215297

LR, Count Vectors: 
Train Accuracy:  0.9946417314561883
Test Accuracy:  0.9638655462184874
Confusion matrix:  [[1457   23]
 [  63  837]]
F1_score:  0.9511363636363638

LR, WordLevel TF-IDF: 
Train Accuracy:  0.9763605799537718
Test Accuracy:  0.9525210084033613
Confusion matrix:  [[1469   11]
 [ 102  798]]
F1_score:  0.9338794616734932

LR, N-Gram Vectors: 
Train Accuracy:  0.9108005883588989
Test Accuracy:  0.8865546218487395
Confusion matrix:  [[1459   21]
 [ 249  651]]
F1_score:  0.8282442748091604

LR, CharLevel Vectors: 
Train Accuracy:  0.9863416684177349
Test Accuracy:  0.9810924369747899
Confusion matrix:  [[1473    7]
 [  38  862]]
F1_score:  0.9745618993781798


In [24]:
def svm_tune(x, y):
		Cs = [0.001, 0.01, 0.1, 1, 10]
		gammas = [0.001, 0.01, 0.1, 1]
		grid = {'C': Cs, 'gamma': gammas}
		search = GridSearchCV(svm.SVC(kernel='rbf'), grid)
		search.fit(x, y)
		search.best_params_
		return search.best_params_

# SVM on Bin Count Vectors
print("\nSVM, Binary Count Vectors: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), binary_xtrain_count, train_y, binary_xtest_count)

# SVM on Count Vectors
print("\nSVM, Count Vectors: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), xtrain_count, train_y, xtest_count)

# SVM on Word Level TF IDF Vectors
print("\nSVM, WordLevel TF-IDF: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), xtrain_tfidf, train_y, xtest_tfidf)

# SVM on Ngram Level TF IDF Vectors
print("\nSVM, N-Gram Vectors: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# SVM on Character Level TF IDF Vectors
print("\nSVM, CharLevel Vectors: ")
accuracy = train_model(svm.SVC(kernel='rbf', C=10, gamma=0.1), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)


SVM, Binary Count Vectors: 
Train Accuracy:  0.9998949359109056
Test Accuracy:  0.9642857142857143
Confusion matrix:  [[1459   21]
 [  64  836]]
F1_score:  0.9516220830961867

SVM, Count Vectors: 
Train Accuracy:  1.0
Test Accuracy:  0.9600840336134454
Confusion matrix:  [[1463   17]
 [  78  822]]
F1_score:  0.9453709028177113

SVM, WordLevel TF-IDF: 
Train Accuracy:  0.9942214750998108
Test Accuracy:  0.9613445378151261
Confusion matrix:  [[1444   36]
 [  56  844]]
F1_score:  0.9483146067415731

SVM, N-Gram Vectors: 
Train Accuracy:  0.9449464173145619
Test Accuracy:  0.8957983193277311
Confusion matrix:  [[1418   62]
 [ 186  714]]
F1_score:  0.8520286396181384

SVM, CharLevel Vectors: 
Train Accuracy:  0.9977936541290187
Test Accuracy:  0.9890756302521009
Confusion matrix:  [[1474    6]
 [  20  880]]
F1_score:  0.9854423292273236


In [25]:
print("\nLinear SVM, Binary Count Vectors: ")
accuracy = train_model(svm.LinearSVC(), binary_xtrain_count, train_y, binary_xtest_count)

# SVM on Count Vectors
print("\nLinear SVM, Count Vectors: ")
accuracy = train_model(svm.LinearSVC(), xtrain_count, train_y, xtest_count)

# SVM on Word Level TF IDF Vectors
print("\nLinear SVM, WordLevel TF-IDF: ")
accuracy = train_model(svm.LinearSVC(), xtrain_tfidf, train_y, xtest_tfidf)

# SVM on Ngram Level TF IDF Vectors
print("\nLinear SVM, N-Gram Vectors: ")
accuracy = train_model(svm.LinearSVC(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# SVM on Character Level TF IDF Vectors
print("\nLinear SVM, CharLevel Vectors: ")
accuracy = train_model(svm.LinearSVC(), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)


Linear SVM, Binary Count Vectors: 
Train Accuracy:  0.9998949359109056
Test Accuracy:  0.9642857142857143
Confusion matrix:  [[1458   22]
 [  63  837]]
F1_score:  0.9516770892552587

Linear SVM, Count Vectors: 
Train Accuracy:  0.999684807732717
Test Accuracy:  0.9592436974789916
Confusion matrix:  [[1453   27]
 [  70  830]]
F1_score:  0.9447922595332954

Linear SVM, WordLevel TF-IDF: 
Train Accuracy:  0.99516705190166
Test Accuracy:  0.9596638655462185
Confusion matrix:  [[1451   29]
 [  67  833]]
F1_score:  0.945516458569807

Linear SVM, N-Gram Vectors: 
Train Accuracy:  0.9501996217692793
Test Accuracy:  0.8941176470588236
Confusion matrix:  [[1420   60]
 [ 192  708]]
F1_score:  0.8489208633093526

Linear SVM, CharLevel Vectors: 
Train Accuracy:  0.9989493591090566
Test Accuracy:  0.988655462184874
Confusion matrix:  [[1475    5]
 [  22  878]]
F1_score:  0.9848569826135727


In [26]:
print("\nRF, Binary Count Vectors: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), binary_xtrain_count, train_y, binary_xtest_count)

# RF on Count Vectors
print("\nRF, Count Vectors: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_count, train_y, xtest_count)

# RF on Word Level TF IDF Vectors
print("\nRF, WordLevel TF-IDF: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_tfidf, train_y, xtest_tfidf)

# RF on Ngram Level TF IDF Vectors
print("\nRF, N-Gram Vectors: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# RF on Character Level TF IDF Vectors
print("\nRF, CharLevel Vectors: ")
accuracy = train_model(ensemble.RandomForestClassifier(n_estimators=100), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)


RF, Binary Count Vectors: 
Train Accuracy:  1.0
Test Accuracy:  0.9529411764705882
Confusion matrix:  [[1470   10]
 [ 102  798]]
F1_score:  0.9344262295081966

RF, Count Vectors: 
Train Accuracy:  1.0
Test Accuracy:  0.9512605042016806
Confusion matrix:  [[1470   10]
 [ 106  794]]
F1_score:  0.931924882629108

RF, WordLevel TF-IDF: 
Train Accuracy:  0.9987392309308678
Test Accuracy:  0.9575630252100841
Confusion matrix:  [[1457   23]
 [  78  822]]
F1_score:  0.9421203438395416

RF, N-Gram Vectors: 
Train Accuracy:  0.9592351334313931
Test Accuracy:  0.8953781512605042
Confusion matrix:  [[1426   54]
 [ 195  705]]
F1_score:  0.8499095840867993

RF, CharLevel Vectors: 
Train Accuracy:  0.9998949359109056
Test Accuracy:  0.984873949579832
Confusion matrix:  [[1469   11]
 [  25  875]]
F1_score:  0.9798432250839866


In [27]:
print("\nRF, Binary Count Vectors: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), binary_xtrain_count, train_y, binary_xtest_count)

# RF on Count Vectors
print("\nRF, Count Vectors: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), xtrain_count, train_y, xtest_count)

# RF on Word Level TF IDF Vectors
print("\nRF, WordLevel TF-IDF: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), xtrain_tfidf, train_y, xtest_tfidf)

# RF on Ngram Level TF IDF Vectors
print("\nRF, N-Gram Vectors: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)

# RF on Character Level TF IDF Vectors
print("\nRF, CharLevel Vectors: ")
accuracy = train_model(ensemble.GradientBoostingClassifier(n_estimators=100), xtrain_tfidf_ngram_chars, train_y, xtest_tfidf_ngram_chars)


RF, Binary Count Vectors: 
Train Accuracy:  0.9349653288505989
Test Accuracy:  0.9378151260504202
Confusion matrix:  [[1359  121]
 [  27  873]]
F1_score:  0.921858500527983

RF, Count Vectors: 
Train Accuracy:  0.9349653288505989
Test Accuracy:  0.9365546218487395
Confusion matrix:  [[1355  125]
 [  26  874]]
F1_score:  0.9204844655081623

RF, WordLevel TF-IDF: 
Train Accuracy:  0.9360159697415423
Test Accuracy:  0.9348739495798319
Confusion matrix:  [[1351  129]
 [  26  874]]
F1_score:  0.9185496584340513

RF, N-Gram Vectors: 
Train Accuracy:  0.8743433494431603
Test Accuracy:  0.8697478991596639
Confusion matrix:  [[1469   11]
 [ 299  601]]
F1_score:  0.7949735449735449

RF, CharLevel Vectors: 
Train Accuracy:  0.9878125656650557
Test Accuracy:  0.9827731092436974
Confusion matrix:  [[1476    4]
 [  37  863]]
F1_score:  0.976796830786644
