In [256]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from statistics import mean
import numpy as np
import re

xls_dir="data/trainingObamaRomneytweets.xlsx"
porter=PorterStemmer()

In [257]:
columns=[3,4]
df=pd.read_excel(xls_dir, sheet_names='Obama', usecols=columns, header=1, names=['Tweet', 'Sentiment'])
#df

In [258]:
df.drop(df[(df.Sentiment!=1) & (df.Sentiment!=-1) & (df.Sentiment!=0)].index, inplace=True)
df.dropna(how='any', inplace=True)
#df

In [259]:
#Remove tags
df['Tweet'] = df['Tweet'].apply(lambda x: re.sub("<.*?>", '', x))

#remove stopwords
stopwords_english=stopwords.words("english")
df['Tweet'] = df['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords_english)]))

#lowerCase
df['Tweet'] = df['Tweet'].apply(lambda x: ' '.join([word.lower() for word in x.split()]))
#df

In [260]:
#Stemming
def stem(tweet):
    tokens=word_tokenize(tweet)
    result=[]
    for token in tokens:
        result.extend([porter.stem(token)])
    return result
df['Tweet'] = df['Tweet'].apply(stem)
#df

In [261]:
#remove punctuation
punctuations = ['"',"'",'!','@','#','-','_',',','``',"''",'?',':','$','%','^','(',')','’','.']
def removePunctuation(tweet):
    return [token for token in tweet if token not in string.punctuation]
df['Tweet'] = df['Tweet'].apply(removePunctuation)
corpus = [" ".join(tweet) for tweet in df["Tweet"]]
labels = [int(sentiment) for sentiment in df["Sentiment"]]
#df

In [265]:
kf = StratifiedKFold(n_splits=10)
totalNB = []
totalSVM = []
for train_index, test_index in kf.split(corpus,labels):
    X_train = [corpus[i] for i in train_index]
    X_test = [corpus[i] for i in test_index]
    y_train =  [labels[i] for i in train_index]
    y_test = [labels[i] for i in test_index]
    vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
    train_corpus_tf_idf = vectorizer.fit_transform(X_train) 
    test_corpus_tf_idf = vectorizer.transform(X_test)
    
    model1 = LinearSVC()
    model2 = MultinomialNB()
    model1.fit(train_corpus_tf_idf,y_train)
    model2.fit(train_corpus_tf_idf,y_train)
    result1 = model1.predict(test_corpus_tf_idf)
    result2 = model2.predict(test_corpus_tf_idf)
    cm1 = confusion_matrix(y_test, result1)
    cm2 = confusion_matrix(y_test,result2)
    totalNB.append((cm1[0][0]+cm1[1][1]+cm1[2][2])/np.sum(cm1))
    totalSVM.append((cm2[0][0]+cm2[1][1]+cm2[2][2])/np.sum(cm2))
print (mean(totalNB))
print (mean(totalSVM))

0.555610462954354
0.5724302100636492
