In [1]:
import pandas as pd
import nltk
import re
from sklearn import *
import time



In [2]:
def dataClean(tweets_raw):
    """Normalise raw tweets into lower-cased, stop-word-free, stemmed strings.

    For every tweet the following steps are applied in order: lower-casing,
    URL removal, @username removal, HTML tag removal, removal of a fixed set
    of punctuation characters, English stop-word removal, Porter stemming and
    finally removal of single and double quote characters.

    Parameters
    ----------
    tweets_raw : iterable
        Raw tweets. Entries that are not strings are tolerated: ``None`` and
        float NaN become an empty string, anything else goes through ``str()``.

    Returns
    -------
    list of str
        One cleaned string per input entry, in the same order, so the result
        stays aligned with any parallel list of class labels.
    """
    # Loop-invariant work: build the stop-word set and the stemmer once
    # instead of re-reading the corpus list for every single token.
    stop_words = set(nltk.corpus.stopwords.words("english"))
    ps = nltk.stem.PorterStemmer()

    cleanTweets = []
    for each in tweets_raw:
        if isinstance(each, str):
            tweet = each
        elif each is None or (isinstance(each, float) and each != each):
            # Missing cell (None / NaN): keep a placeholder so indices line up.
            tweet = ""
        else:
            tweet = str(each)

        tweet = tweet.lower() #convert to lowercase
        tweet = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', tweet) #remove URL
        tweet = re.sub(r'(\s)@\w+', r'', tweet) #remove usernames (with leading whitespace)
        tweet = re.sub(r'@\w+', r'', tweet) #remove remaining usernames
        tweet = re.sub('<[^<]+?>', '', tweet) #remove HTML tags
        tweet = re.sub(r'[<>!#@$:.,%\?-]+', r'', tweet) #remove punctuation and special characters

        # Tokenise on whitespace, drop stop words, then stem what is left.
        kept = [w for w in tweet.split() if w not in stop_words]
        tweet = " ".join(ps.stem(word) for word in kept)

        # Quotes are stripped last, so tokens such as "don't" are still
        # compared against the stop-word list with their apostrophe intact.
        tweet = tweet.replace("'", "").replace("\"", "")
        cleanTweets.append(tweet)
    return cleanTweets

In [3]:
def vectorization(clean_train_tweets,clean_test_tweets):
    """Turn cleaned tweets into TF-IDF feature matrices.

    The vectorizer is fitted on ``clean_train_tweets`` only and the fitted
    vocabulary is then applied to ``clean_test_tweets``.

    Returns a ``(train_vectors, test_vectors)`` tuple.
    """
    tfidf_options = dict(
        min_df=0.00125,
        max_df=0.7,
        sublinear_tf=True,
        use_idf=True,
        stop_words=u'english',
        analyzer='word',
        ngram_range=(1, 5),
        lowercase=True,
    )
    tfidf = feature_extraction.text.TfidfVectorizer(**tfidf_options)
    fitted_train = tfidf.fit_transform(clean_train_tweets)
    projected_test = tfidf.transform(clean_test_tweets)
    return fitted_train, projected_test

In [4]:
#Loading the sheets into data frames (one sheet per candidate)

trainingFile = "train.xlsx"
# `sheet_name` is the supported keyword of pandas.read_excel; the old
# `sheetname` spelling was deprecated and later removed from pandas.
df_obama = pd.read_excel(trainingFile, sheet_name='Obama')
df_romney = pd.read_excel(trainingFile, sheet_name='Romney')

#Removing the mixed class and the !!! class: only rows labelled 1, -1 or 0 are kept

df_obama = df_obama[df_obama['Class'].isin((1, -1, 0))]
df_romney = df_romney[df_romney['Class'].isin((1, -1, 0))]

#creating lists for raw tweets and classes
# NOTE(review): 'Anootated tweet' is presumably the (misspelled) header used in
# the spreadsheet itself -- confirm before "fixing" the column name.

obama_class = df_obama['Class']
romney_class = df_romney['Class']

obama_tweets_raw = df_obama['Anootated tweet'].tolist()
romney_tweets_raw = df_romney['Anootated tweet'].tolist()
obama_class_train = obama_class.tolist()
romney_class_train = romney_class.tolist()

romney_tweets = dataClean(romney_tweets_raw) #romney tweets cleaning
obama_tweets = dataClean(obama_tweets_raw) #obama tweets cleaning

In [5]:
#obama_tweets_vectors = vectorization(obama_tweets)
#romney_tweets_vectors = vectorization(romney_tweets)

In [6]:
def computation(clf):
    """Cross-validate ``clf`` on both training sets and print the scores.

    Runs 10-fold ``cross_val_predict`` on the Obama data and then on the
    Romney data, printing overall accuracy plus precision, recall and
    F1-score for the positive (label 1) and negative (label -1) classes,
    followed by the total elapsed time.

    Reads the module-level names ``obama_tweets_vectors``,
    ``obama_class_train``, ``romney_tweets_vectors`` and
    ``romney_class_train``; they must be defined before the call.

    Parameters
    ----------
    clf : estimator
        Object passed as the estimator to ``model_selection.cross_val_predict``.

    Returns
    -------
    None
        Results are only printed.
    """
    labels = [1, -1]
    label_names = ('positive', 'negative')

    def _report(header, vectors, classes):
        # One candidate's evaluation: out-of-fold predictions -> printed metrics.
        preds = model_selection.cross_val_predict(clf, vectors, classes, cv=10)
        accScore = metrics.accuracy_score(classes, preds)
        precision = metrics.precision_score(classes, preds, average=None, labels=labels)
        recall = metrics.recall_score(classes, preds, average=None, labels=labels)
        f1Score = metrics.f1_score(classes, preds, average=None, labels=labels)
        print(header, accScore, "\n")
        for name, p, r, f in zip(label_names, precision, recall, f1Score):
            print("Precision of %s class: %f" % (name, p))
            print("Recall of %s class: %f" % (name, r))
            print("F1-Score of %s class: %f" % (name, f), "\n")

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for measuring elapsed time.
    start_time = time.perf_counter()
    print(clf)
    _report("Obama: \nOverall Acurracy: ", obama_tweets_vectors, obama_class_train)
    _report("Romney:\nOverall Acurracy: ", romney_tweets_vectors, romney_class_train)
    end_time = time.perf_counter()
    print("Total time taken: %0.2f seconds \n\n"%(end_time-start_time))

In [7]:
# Disabled experiment: SMOTE oversampling of both the Romney and the Obama
# training vectors. The code is wrapped in a bare triple-quoted string so none
# of it executes; as the string is the cell's only expression, the notebook
# merely echoes its text as the cell output.
'''
from imblearn.over_sampling import SMOTE

romney_tweets_vectors = romney_tweets_vectors.toarray()
sm = SMOTE(random_state=42)
romney_tweets_vectors, romney_class_train = sm.fit_sample(romney_tweets_vectors, romney_class_train)

obama_tweets_vectors = obama_tweets_vectors.toarray()
sm = SMOTE(random_state=43)
obama_tweets_vectors, obama_class_train = sm.fit_sample(obama_tweets_vectors, obama_class_train)
'''

'\nfrom imblearn.over_sampling import SMOTE\n\nromney_tweets_vectors = romney_tweets_vectors.toarray()\nsm = SMOTE(random_state=42)\nromney_tweets_vectors, romney_class_train = sm.fit_sample(romney_tweets_vectors, romney_class_train)\n\nobama_tweets_vectors = obama_tweets_vectors.toarray()\nsm = SMOTE(random_state=43)\nobama_tweets_vectors, obama_class_train = sm.fit_sample(obama_tweets_vectors, obama_class_train)\n'

In [8]:
# Candidate classifiers; each entry is an unfitted estimator instance.
models = [
    naive_bayes.BernoulliNB(),
    svm.SVC(kernel='rbf', gamma=0.58, C=0.81),
    tree.DecisionTreeClassifier(random_state=0),
    ensemble.RandomForestClassifier(criterion='entropy', n_jobs=10),
    linear_model.LogisticRegression(),
    linear_model.SGDClassifier(),
]

#for each in models:
    #computation(each)

In [9]:
# Prints a fixed marker string; presumably used to see that the cells above finished.
print('Done')

Done


In [10]:
# Loading the held-out test sheets into data frames (one sheet per candidate)
testingFile = "test.xlsx"
# `sheet_name` is the supported keyword of pandas.read_excel; the old
# `sheetname` spelling was deprecated and later removed from pandas.
df_obama_test = pd.read_excel(testingFile, sheet_name='Obama')
df_romney_test = pd.read_excel(testingFile, sheet_name='Romney')

#Removing the mixed class and the !!! class: only rows labelled 1, -1 or 0 are kept

df_obama_test = df_obama_test[df_obama_test['Class'].isin((1, -1, 0))]
df_romney_test = df_romney_test[df_romney_test['Class'].isin((1, -1, 0))]

#creating lists for raw tweets and classes
# NOTE(review): 'Anootated tweet' is presumably the (misspelled) header used in
# the spreadsheet itself -- confirm before "fixing" the column name.

obama_class_test = df_obama_test['Class']
romney_class_test = df_romney_test['Class']

obama_tweets_raw_test = df_obama_test['Anootated tweet'].tolist()
romney_tweets_raw_test = df_romney_test['Anootated tweet'].tolist()
# Despite the "_train_" in the name, these hold the test-set labels.
obama_class_train_test = obama_class_test.tolist()
romney_class_train_test = romney_class_test.tolist()

romney_tweets_test = dataClean(romney_tweets_raw_test) #romney tweets cleaning
obama_tweets_test = dataClean(obama_tweets_raw_test) #obama tweets cleaning

# Fit one vectorizer per candidate on the cleaned training tweets and apply
# it to that candidate's cleaned test tweets.
obama_tweets_vectors,obama_tweets_vectors_test = vectorization(obama_tweets,obama_tweets_test)
romney_tweets_vectors,romney_tweets_vectors_test = vectorization(romney_tweets,romney_tweets_test)

In [11]:
from imblearn.over_sampling import SMOTE

# Oversample the Romney training set with SMOTE.
# The vectors are converted to a dense array first; the hasattr() guard keeps
# the cell re-runnable once the name already refers to a dense array.
if hasattr(romney_tweets_vectors, "toarray"):
    romney_tweets_vectors = romney_tweets_vectors.toarray()
sm = SMOTE(random_state=42)
# `fit_sample` was renamed to `fit_resample` in imbalanced-learn and the old
# name was later removed; prefer the new name, fall back for old installs.
_smote_resample = getattr(sm, "fit_resample", None) or sm.fit_sample
romney_tweets_vectors, romney_class_train = _smote_resample(romney_tweets_vectors, romney_class_train)

#obama_tweets_vectors = obama_tweets_vectors.toarray()
#sm = SMOTE(random_state=43)
#obama_tweets_vectors, obama_class_train = sm.fit_sample(obama_tweets_vectors, obama_class_train)



In [12]:
def computation_test(clf):
    """Fit ``clf`` on each training set and print its scores on the test set.

    For the Obama data and then the Romney data, ``clf`` is fitted on the
    training vectors and used to predict the densified test vectors. Overall
    accuracy plus precision, recall and F1-score for the positive (label 1)
    and negative (label -1) classes are printed, followed by the total
    elapsed time.

    Reads the module-level names ``obama_tweets_vectors``,
    ``obama_class_train``, ``obama_tweets_vectors_test``,
    ``obama_class_train_test`` and their ``romney_*`` counterparts; they must
    be defined before the call. The same ``clf`` object is fitted twice, so
    after the call it holds the Romney fit.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns
    -------
    None
        Results are only printed.
    """
    labels = [1, -1]
    label_names = ('positive', 'negative')

    def _report(header, train_vectors, train_classes, test_vectors, test_classes):
        # One candidate's evaluation: fit on train, predict test, print metrics.
        clf.fit(train_vectors, train_classes)
        preds = clf.predict(test_vectors.toarray())
        accScore = metrics.accuracy_score(test_classes, preds)
        precision = metrics.precision_score(test_classes, preds, average=None, labels=labels)
        recall = metrics.recall_score(test_classes, preds, average=None, labels=labels)
        f1Score = metrics.f1_score(test_classes, preds, average=None, labels=labels)
        print(header, accScore, "\n")
        for name, p, r, f in zip(label_names, precision, recall, f1Score):
            print("Precision of %s class: %f" % (name, p))
            print("Recall of %s class: %f" % (name, r))
            print("F1-Score of %s class: %f" % (name, f), "\n")

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for measuring elapsed time.
    start_time = time.perf_counter()
    print(clf)
    _report("Obama: \nOverall Acurracy: ",
            obama_tweets_vectors, obama_class_train,
            obama_tweets_vectors_test, obama_class_train_test)
    _report("Romney:\nOverall Acurracy: ",
            romney_tweets_vectors, romney_class_train,
            romney_tweets_vectors_test, romney_class_train_test)
    end_time = time.perf_counter()
    print("Total time taken: %0.2f seconds \n\n"%(end_time-start_time))

In [13]:
# Evaluate every classifier in `models` in turn; computation_test prints the
# scores for each one, which produces the report shown in the cell output.
for each in models:
    computation_test(each)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
Obama: 
Overall Acurracy:  0.535110199897 

Precision of positive class: 0.541522
Recall of positive class: 0.537801
F1-Score of positive class: 0.539655 

Precision of negative class: 0.579856
Recall of negative class: 0.585756
F1-Score of negative class: 0.582791 

Romney:
Overall Acurracy:  0.559473684211 

Precision of positive class: 0.414414
Recall of positive class: 0.716883
F1-Score of positive class: 0.525214 

Precision of negative class: 0.680401
Recall of negative class: 0.636458
F1-Score of negative class: 0.657696 

Total time taken: 0.43 seconds 


SVC(C=0.81, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.58, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Obama: 
Overall Acurracy:  0.553049718093 

Precision of positive class: 0.600868
Recall of positive class: 0.475945
F1-Score of positiv