In [1]:
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)
import pandas as pd
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import re
from sklearn import *
import time



In [2]:
def dataClean(tweets_raw):
	cleanTweets = []
	for each in tweets_raw:
		tweet = each
		tweet = tweet.lower() #convert to lowercase
		tweet = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', tweet) #remove URL
		tweet = re.sub(r'(\s)@\w+', r'', tweet) #remove usernames
		tweet = re.sub(r'@\w+', r'', tweet) #remove usernames
		tweet = re.sub('<[^<]+?>', '', tweet) #remove HTML tags
		tweet = re.sub(r'[<>!#@$:.,%\?-]+', r'', tweet) #remove punctuation and special characters 
		lower_case = tweet.lower() #tokenization 
		words = lower_case.split()
		tweet = ' '.join([w for w in words if not w in nltk.corpus.stopwords.words("english")]) #remove stopwords
		ps = nltk.stem.PorterStemmer()
		stemmedTweet = [ps.stem(word) for word in tweet.split(" ")]
		stemmedTweet = " ".join(stemmedTweet)
		tweet = str(stemmedTweet)
		tweet = tweet.replace("'", "")
		tweet = tweet.replace("\"","")#;lmtzr = WordNetLemmatizer();tweet = lmtzr.lemmatize(tweet)
		cleanTweets.append(tweet)
	return cleanTweets

In [3]:
def vectorization(clean_train_tweets):
	vec = feature_extraction.text.TfidfVectorizer(min_df = 0.00125, max_df = 0.7, sublinear_tf=True, use_idf=True, stop_words=u'english', analyzer= 'word', ngram_range=(1,5),lowercase=True)
	train_vectors = vec.fit_transform(clean_train_tweets)
	return train_vectors

In [4]:
#Loading the sheets into data frames

trainingFile = "train.xlsx"
df_obama = pd.read_excel(trainingFile,sheetname='Obama')
df_romney = pd.read_excel(trainingFile,sheetname='Romney')

#Removing the mixed class and the !!! class

df_obama = df_obama[(df_obama['Class'].isin((1,-1,0)))]
df_romney = df_romney[(df_romney['Class'].isin((1,-1,0)))]

#creating lists for raw tweets and classes

obama_tweets_raw = df_obama['Anootated tweet']
obama_class = df_obama['Class']
romney_tweets_raw = df_romney['Anootated tweet']
romney_class = df_romney['Class']

obama_tweets_raw = obama_tweets_raw.tolist()
romney_tweets_raw = romney_tweets_raw.tolist()
obama_class_train = obama_class.tolist()
romney_class_train = romney_class.tolist()

romney_tweets = dataClean(romney_tweets_raw) #romney tweets cleaning
obama_tweets = dataClean(obama_tweets_raw) #obama tweets cleaning

In [5]:
obama_tweets_vectors = vectorization(obama_tweets)
romney_tweets_vectors = vectorization(romney_tweets)

In [6]:
models = [naive_bayes.BernoulliNB(),svm.SVC(kernel='rbf', gamma=0.58, C=0.81,class_weight='balanced'),tree.DecisionTreeClassifier(random_state=0,class_weight='balanced'),ensemble.RandomForestClassifier(criterion='entropy', n_jobs = 10,class_weight='balanced'),linear_model.LogisticRegression(class_weight='balanced'),linear_model.SGDClassifier()]

In [7]:
j = 0
clf = []
for each in models:
    start_time = time.clock()
    clf.append(each)
    clf_use = clf[j]
    #obama
    preds = model_selection.cross_val_predict(clf_use, obama_tweets_vectors, obama_class_train, cv=10)
    accScore = metrics.accuracy_score(obama_class_train,preds)
    labels = [1,-1]
    precision = metrics.precision_score(obama_class_train,preds,average=None,labels=labels)
    recall = metrics.recall_score(obama_class_train,preds,average=None,labels=labels)
    f1Score = metrics.f1_score(obama_class_train,preds,average=None,labels=labels)
    print(each);print("Obama: \nOverall Acurracy: ",accScore,"\n")
    lbl = ['positive', 'negative']
    for i in range(2):
        print("Precision of %s class: %f" %(lbl[i],precision[i]))
        print("Recall of %s class: %f" %(lbl[i],recall[i]))
        print("F1-Score of %s class: %f" %(lbl[i],f1Score[i]),"\n")
    #romney
    preds = model_selection.cross_val_predict(clf_use, romney_tweets_vectors, romney_class_train, cv=10)
    accScore = metrics.accuracy_score(romney_class_train,preds)
    labels = [1,-1]
    precision = metrics.precision_score(romney_class_train,preds,average=None,labels=labels)
    recall = metrics.recall_score(romney_class_train,preds,average=None,labels=labels)
    f1Score = metrics.f1_score(romney_class_train,preds,average=None,labels=labels)
    print("Romney:\nOverall Acurracy: ",accScore,"\n")
    lbl = ['positive', 'negative']
    for i in range(2):
        print("Precision of %s class: %f" %(lbl[i],precision[i]))
        print("Recall of %s class: %f" %(lbl[i],recall[i]))
        print("F1-Score of %s class: %f" %(lbl[i],f1Score[i]),"\n")
    end_time = time.clock()
    print("Total time taken: %0.2f seconds \n\n"%(end_time-start_time))
    j = j + 1

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
Obama: 
Overall Acurracy:  0.556205446902 

Precision of positive class: 0.546875
Recall of positive class: 0.614035
F1-Score of positive class: 0.578512 

Precision of negative class: 0.603319
Recall of negative class: 0.586368
F1-Score of negative class: 0.594723 

Romney:
Overall Acurracy:  0.544617563739 

Precision of positive class: 0.449909
Recall of positive class: 0.459535
F1-Score of positive class: 0.454671 

Precision of negative class: 0.635932
Recall of negative class: 0.693744
F1-Score of negative class: 0.663581 

Total time taken: 0.12 seconds 


SVC(C=0.81, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.58, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Obama: 
Overall Acurracy:  0.573752513252 

Precision of positive class: 0.601710
Recall of positive class: 0.595886
F1-Score of p

In [8]:
from sklearn.ensemble import VotingClassifier

eclf = VotingClassifier(estimators=[('bnb', clf[0]), ('dt', clf[1]), ('rf', clf[2]),('lr',clf[3]),('sgd',clf[4])], voting='hard')
preds = model_selection.cross_val_predict(eclf, obama_tweets_vectors, obama_class_train, cv=10)
accScore = metrics.accuracy_score(obama_class_train,preds)
labels = [1,-1]
precision = metrics.precision_score(obama_class_train,preds,average=None,labels=labels)
recall = metrics.recall_score(obama_class_train,preds,average=None,labels=labels)
f1Score = metrics.f1_score(obama_class_train,preds,average=None,labels=labels)
print(eclf)
print("Obama: \nOverall Acurracy: ",accScore,"\n")
lbl = ['positive', 'negative']
for i in range(2):
    print("Precision of %s class: %f" %(lbl[i],precision[i]))
    print("Recall of %s class: %f" %(lbl[i],recall[i]))
    print("F1-Score of %s class: %f" %(lbl[i],f1Score[i]),"\n")
#romney
preds = model_selection.cross_val_predict(eclf, romney_tweets_vectors, romney_class_train, cv=10)
accScore = metrics.accuracy_score(romney_class_train,preds)
labels = [1,-1]
precision = metrics.precision_score(romney_class_train,preds,average=None,labels=labels)
recall = metrics.recall_score(romney_class_train,preds,average=None,labels=labels)
f1Score = metrics.f1_score(romney_class_train,preds,average=None,labels=labels)
print("Romney:\nOverall Acurracy: ",accScore,"\n")
lbl = ['positive', 'negative']
for i in range(2):
    print("Precision of %s class: %f" %(lbl[i],precision[i]))
    print("Recall of %s class: %f" %(lbl[i],recall[i]))
    print("F1-Score of %s class: %f" %(lbl[i],f1Score[i]),"\n")
end_time = time.clock()
print("Total time taken: %0.2f seconds \n\n"%(end_time-start_time))

VotingClassifier(estimators=[('bnb', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)), ('dt', SVC(C=0.81, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.58, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,...ty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))],
         n_jobs=1, voting='hard', weights=None)
Obama: 
Overall Acurracy:  0.572655821605 

Precision of positive class: 0.588201
Recall of positive class: 0.615245
F1-Score of positive class: 0.601419 

Precision of negative class: 0.592972
Recall of negative class: 0.632154
F1-Score of negative class: 0.611937 

Romney:
Overall Acurracy:  0.545502832861 

Precision of positive class: 0.451697
Recall of positive class: 0.482791
F1-Score of positive class: 0.466727 

Precision of negative class: 0.652959
Recall of negative class: 0.675078
F1-Score of negative class: 0.663