# Classifying with TextBlob

In [1]:
import os
import re

import pandas as pd
from textblob.classifiers import NLTKClassifier,NaiveBayesClassifier,DecisionTreeClassifier,PositiveNaiveBayesClassifier,MaxEntClassifier

# Project root that holds the 'Turk/' and 'StopWordLists/' folders read below.
# It can be overridden with the CSCE489_PROJECT_DIR environment variable so the
# notebook also runs outside the original author's machine; the original
# absolute location stays the default. os.path.join(..., '') guarantees a
# trailing separator because the other cells build file names with `path + ...`.
path = os.path.join(
    os.environ.get('CSCE489_PROJECT_DIR', '/Users/Jared/Documents/CSCE489/489Project/'),
    '')

### Grabbing the Data

In [2]:
# Load the raw Turk batch results. The preview shows several rows per post
# (repeated Input.status_id), one for each worker answer.
turk_df = pd.read_csv(path + 'Turk/Batch_2606965_batch_results.csv') 
turk_df.head()

Unnamed: 0,HITId,HITTypeId,Title,Description,Keywords,Reward,CreationTime,MaxAssignments,RequesterAnnotation,AssignmentDurationInSeconds,...,RequesterFeedback,WorkTimeInSeconds,LifetimeApprovalRate,Last30DaysApprovalRate,Last7DaysApprovalRate,Input.status_id,Input.content,Answer.sentiment,Approve,Reject
0,301KG0KX9CFC0MHLW3ECT80VTS3H2D,3WROYBU7BMMRCGD41SQ9YGNC31GDCO,Preseidential Election Sentiment Analysis,Analyze the sentiment of the provided Facebook...,sentiment election Donald Trump Hilary Clinton...,$0.02,Wed Nov 23 21:44:16 PST 2016,3,BatchId:2606965;,172800,...,,3,100% (143/143),100% (143/143),100% (143/143),13652355666_10154036300320667,Thousands of demonstrators filled the streets ...,Positive,,
1,301KG0KX9CFC0MHLW3ECT80VTS3H2D,3WROYBU7BMMRCGD41SQ9YGNC31GDCO,Preseidential Election Sentiment Analysis,Analyze the sentiment of the provided Facebook...,sentiment election Donald Trump Hilary Clinton...,$0.02,Wed Nov 23 21:44:16 PST 2016,3,BatchId:2606965;,172800,...,,3,100% (349/349),100% (349/349),100% (349/349),13652355666_10154036300320667,Thousands of demonstrators filled the streets ...,Positive,,
2,301KG0KX9CFC0MHLW3ECT80VTS3H2D,3WROYBU7BMMRCGD41SQ9YGNC31GDCO,Preseidential Election Sentiment Analysis,Analyze the sentiment of the provided Facebook...,sentiment election Donald Trump Hilary Clinton...,$0.02,Wed Nov 23 21:44:16 PST 2016,3,BatchId:2606965;,172800,...,,7,100% (132/132),100% (132/132),100% (132/132),13652355666_10154036300320667,Thousands of demonstrators filled the streets ...,Neutral,,
3,301KG0KX9CFC0MHLW3ECT80VTS42HZ,3WROYBU7BMMRCGD41SQ9YGNC31GDCO,Preseidential Election Sentiment Analysis,Analyze the sentiment of the provided Facebook...,sentiment election Donald Trump Hilary Clinton...,$0.02,Wed Nov 23 21:44:36 PST 2016,3,BatchId:2606965;,172800,...,,51,100% (147/147),100% (147/147),100% (147/147),131459315949_10154056577745950,From Barack Obama's 2003 Senate campaign to Hi...,Positive,,
4,301KG0KX9CFC0MHLW3ECT80VTS42HZ,3WROYBU7BMMRCGD41SQ9YGNC31GDCO,Preseidential Election Sentiment Analysis,Analyze the sentiment of the provided Facebook...,sentiment election Donald Trump Hilary Clinton...,$0.02,Wed Nov 23 21:44:36 PST 2016,3,BatchId:2606965;,172800,...,,3,100% (99/99),100% (99/99),100% (99/99),131459315949_10154056577745950,From Barack Obama's 2003 Senate campaign to Hi...,Neutral,,


In [3]:
# Keep only the relevant columns (post id, post text, worker sentiment) and
# give them short names for the rest of the notebook.
relevant_df = turk_df[['Input.status_id', 'Input.content', 'Answer.sentiment']].rename(
    columns={
        'Input.status_id': 'status_id',
        'Input.content': 'content',
        'Answer.sentiment': 'sentiment',
    }
)
relevant_df.head()

Unnamed: 0,status_id,content,sentiment
0,13652355666_10154036300320667,Thousands of demonstrators filled the streets ...,Positive
1,13652355666_10154036300320667,Thousands of demonstrators filled the streets ...,Positive
2,13652355666_10154036300320667,Thousands of demonstrators filled the streets ...,Neutral
3,131459315949_10154056577745950,From Barack Obama's 2003 Senate campaign to Hi...,Positive
4,131459315949_10154056577745950,From Barack Obama's 2003 Senate campaign to Hi...,Neutral


### Condense Turk Reviews

In [4]:
# Split into two frames: one with the post content and one with the sentiment reviews.
# This is necessary so the post content doesn't get stacked (concatenated) when
# the rows sharing a status_id are combined in the following cells.
content_df = relevant_df[['status_id','content']]
sentiment_df = relevant_df[['status_id','sentiment']]

In [5]:
#Blanks out non-ascii characters (the original author notes TextBlob cannot process them)
def remove_non_ascii(text):
    """Return `text` with every non-ASCII character replaced by one space.

    Each character outside the 0x00-0x7F range is swapped for a single space,
    one for one, so the number of characters in the string does not change.
    """
    non_ascii = re.compile(r'[^\x00-\x7F]')
    return non_ascii.sub(' ', text)

In [6]:
# Combine all rows that share a status_id into a single row indexed by status_id.
# Only the first post text (content column) of each group is kept; per the
# original note this is fine because all rows of one status_id hold the same post.
content_df = content_df.groupby(content_df.status_id).first()
content_df.head()

Unnamed: 0_level_0,content
status_id,Unnamed: 1_level_1
131459315949_10152751596575950,"One opinion: ""Hillary Clinton won because all ..."
131459315949_10152753624975950,Second fiddle to Hillary Clinton in the debate...
131459315949_10152754227800950,Donald J. Trump may not attend the next Republ...
131459315949_10152758428025950,Conservatives have criticized Donald J. Trump ...
131459315949_10152766742570950,Hillary Clinton has promised to take on the Na...


In [7]:
# Replace every non-ASCII character in the post text with a space.
content_df['content'] = content_df['content'].map(remove_non_ascii)
content_df.head()

Unnamed: 0_level_0,content
status_id,Unnamed: 1_level_1
131459315949_10152751596575950,"One opinion: ""Hillary Clinton won because all ..."
131459315949_10152753624975950,Second fiddle to Hillary Clinton in the debate...
131459315949_10152754227800950,Donald J. Trump may not attend the next Republ...
131459315949_10152758428025950,Conservatives have criticized Donald J. Trump ...
131459315949_10152766742570950,Hillary Clinton has promised to take on the Na...


In [8]:
#Chooses the official review from the combined Positive/Neutral/Negative reviews of one post
#A sentiment that appears 2 or more times wins (checked in the order Positive, Neutral, Negative)
#Anything else is Inconclusive
def choose_sentiment(label):
    """Return the sentiment that occurs at least twice in `label`.

    `label` is anything with a ``count`` method; in this notebook it is the
    concatenation of a post's reviews, e.g. 'NeutralNegativeNegative'.
    Returns 'Positive', 'Neutral', 'Negative' or 'Inconclusive'.
    """
    for candidate in ('Positive', 'Neutral', 'Negative'):
        if label.count(candidate) >= 2:
            return candidate
    return 'Inconclusive'

In [9]:
# Combine all rows that share a status_id.
# Summing the sentiment strings concatenates them, so each post ends up with a
# single string holding all of its reviews (e.g. 'NeutralNegativeNegative').
sentiment_df = sentiment_df.groupby(sentiment_df.status_id).sum()
sentiment_df.head()

Unnamed: 0_level_0,sentiment
status_id,Unnamed: 1_level_1
131459315949_10152751596575950,NeutralNegativeNegative
131459315949_10152753624975950,NeutralPositiveNegative
131459315949_10152754227800950,PositiveNeutralNeutral
131459315949_10152758428025950,NegativeNegativeNeutral
131459315949_10152766742570950,NeutralNegativeNeutral


In [10]:
# Reduce each post's concatenated reviews to one official sentiment.
sentiment_df['sentiment'] = sentiment_df['sentiment'].map(choose_sentiment)
sentiment_df.head()

Unnamed: 0_level_0,sentiment
status_id,Unnamed: 1_level_1
131459315949_10152751596575950,Negative
131459315949_10152753624975950,Inconclusive
131459315949_10152754227800950,Neutral
131459315949_10152758428025950,Negative
131459315949_10152766742570950,Neutral


In [11]:
# Keep only the posts whose reviews produced a conclusive sentiment.
sentiment_df = sentiment_df.loc[sentiment_df['sentiment'].ne('Inconclusive')]
sentiment_df.head()

Unnamed: 0_level_0,sentiment
status_id,Unnamed: 1_level_1
131459315949_10152751596575950,Negative
131459315949_10152754227800950,Neutral
131459315949_10152758428025950,Negative
131459315949_10152766742570950,Neutral
131459315949_10152791196665950,Positive


In [12]:
# Join the post text and the official sentiment on their shared status_id index.
# Note: this is a right merge, so only posts present in sentiment_df are kept.
merged_df = pd.merge(
    content_df,
    sentiment_df,
    how='right',
    left_index=True,
    right_index=True,
)
merged_df.head()

Unnamed: 0_level_0,content,sentiment
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1
131459315949_10152751596575950,"One opinion: ""Hillary Clinton won because all ...",Negative
131459315949_10152754227800950,Donald J. Trump may not attend the next Republ...,Neutral
131459315949_10152758428025950,Conservatives have criticized Donald J. Trump ...,Negative
131459315949_10152766742570950,Hillary Clinton has promised to take on the Na...,Neutral
131459315949_10152791196665950,A look at 5 crucial issues in the Canadian ele...,Positive


In [13]:
#Classifies a post as Trump, Clinton, or Other
#Rules:
#If a post talks about Trump, and not Clinton, it is Trump
#If a post talks about Clinton, and not Trump, it is Clinton
#Everything else (both candidates, or neither) is other
def classify_post (row):
    """Return 'trump', 'clinton' or 'other' for one row.

    `row` must support ``row['content']`` and yield the post text. Matching is
    a case-insensitive substring test for 'donald'/'trump' and
    'hillary'/'clinton', so longer words containing them also match.
    """
    text = row['content'].lower()
    mentions_trump = 'donald' in text or 'trump' in text
    mentions_clinton = 'hillary' in text or 'clinton' in text
    if mentions_trump == mentions_clinton:
        # Both candidates or neither one: not attributable to a single candidate.
        return 'other'
    return 'trump' if mentions_trump else 'clinton'

In [14]:
# Label every post as 'trump', 'clinton' or 'other'.
# Work on a copy: a plain `labeled_df = merged_df` only creates a second name
# for the same DataFrame, so adding the column would silently modify merged_df
# after it has already been displayed by the merge cell.
labeled_df = merged_df.copy()
labeled_df['candidate_label'] = labeled_df.apply(classify_post, axis=1)
labeled_df.head()

Unnamed: 0_level_0,content,sentiment,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
131459315949_10152751596575950,"One opinion: ""Hillary Clinton won because all ...",Negative,clinton
131459315949_10152754227800950,Donald J. Trump may not attend the next Republ...,Neutral,trump
131459315949_10152758428025950,Conservatives have criticized Donald J. Trump ...,Negative,trump
131459315949_10152766742570950,Hillary Clinton has promised to take on the Na...,Neutral,clinton
131459315949_10152791196665950,A look at 5 crucial issues in the Canadian ele...,Positive,other


In [15]:
#Function to read in stop words
def stopword_file_to_list(file_name, base_dir=None):
    """Read a stop word file and return its lines as a list of strings.

    Parameters
    ----------
    file_name : str
        Name of the file inside the stop word directory.
    base_dir : str, optional
        Directory that contains the file. When omitted, the notebook-level
        ``path + "StopWordLists/"`` is used, as before.

    Returns
    -------
    list of str
        One entry per line of the file, without line terminators.
    """
    if base_dir is None:
        base_dir = path + "StopWordLists/"
    # `with` closes the handle even if reading fails; it was never closed before.
    with open(os.path.join(base_dir, file_name)) as f:
        file_text = f.read()
    # splitlines() understands "\r", "\n" and "\r\n". Splitting on "\r" alone
    # returns the whole file as one entry whenever the newlines are (or have
    # been translated to) "\n", e.g. with Python 3's universal newlines.
    return file_text.splitlines()

In [16]:
# Read the three stop word lists into memory and preview the first ten entries
# of each. Only comprehensive_stopword_list is used by the stop word removal
# cell further down in this notebook.
comprehensive_stopword_list = stopword_file_to_list("comprehensive_stopwords.txt")
default_english_stopword_list = stopword_file_to_list("default_english_stopwords.txt")
mysql_stopword_list = stopword_file_to_list("mysql_stopwords.txt")
print comprehensive_stopword_list[:10]
print default_english_stopword_list[:10]
print mysql_stopword_list[:10]

['a', 'able', 'about', 'above', 'abst', 'accordance', 'according', 'accordingly', 'across', 'act']
['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and']
["a's", 'accordingly', 'again', 'allows', 'also', 'amongst', 'anybody', 'anyways', 'appropriate', 'aside']


In [17]:
#Remove words in list from string
def remove_words(s,words_to_remove):
    """Return `s` without the whitespace-separated words found in `words_to_remove`.

    Words are compared in lower case, so `words_to_remove` should hold
    lower-case entries. Punctuation attached to a word is part of the word
    ('opinion:' does not match 'opinion'). The surviving words are re-joined
    with single spaces, which also collapses runs of whitespace.

    Parameters
    ----------
    s : str
        Text to filter.
    words_to_remove : collection of str
        Words to drop (list, tuple, set, ...).

    Returns
    -------
    str
        The filtered text.
    """
    if isinstance(words_to_remove, (list, tuple)):
        # Hash the stop list once: checking every word against a list costs
        # len(list) comparisons per word.
        try:
            words_to_remove = frozenset(words_to_remove)
        except TypeError:
            # Unhashable entries: keep scanning the original sequence.
            pass
    return ' '.join(word for word in s.split() if word.lower() not in words_to_remove)

In [18]:
# Strip the comprehensive stop word list from every post.
# NOTE: this name is bound to the same DataFrame object as labeled_df (no copy
# is made), so the 'content' column seen through labeled_df is overwritten too.
labeled_df_without_stopwords = labeled_df
labeled_df_without_stopwords['content'] = merged_df['content'].map(
    lambda text: remove_words(text, comprehensive_stopword_list)
)
labeled_df_without_stopwords.head()

Unnamed: 0_level_0,content,sentiment,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
131459315949_10152751596575950,"opinion: ""Hillary Clinton won opponents terrib...",Negative,clinton
131459315949_10152754227800950,Donald J. Trump attend Republican presidential...,Neutral,trump
131459315949_10152758428025950,Conservatives criticized Donald J. Trump expre...,Negative,trump
131459315949_10152766742570950,Hillary Clinton promised National Rifle Associ...,Neutral,clinton
131459315949_10152791196665950,"5 crucial issues Canadian election, referendum...",Positive,other


In [19]:
# We're done making changes; bind our current work to the final name.
# NOTE: this is another reference to the same DataFrame object, not a copy.
final_df = labeled_df_without_stopwords

In [20]:
# Posts whose candidate_label is 'trump'.
trump_df = final_df.loc[final_df['candidate_label'] == 'trump']
trump_df.head()

Unnamed: 0_level_0,content,sentiment,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
131459315949_10152754227800950,Donald J. Trump attend Republican presidential...,Neutral,trump
131459315949_10152758428025950,Conservatives criticized Donald J. Trump expre...,Negative,trump
131459315949_10152871191975950,Donald J. Trump wasn't legality closing mosque...,Neutral,trump
131459315949_10152978451960950,Dr. Ben Carson lead Republican presidential ca...,Positive,trump
131459315949_10153076683820950,"fighting politically correct war, Donald J. Tr...",Neutral,trump


In [21]:
# Posts whose candidate_label is 'clinton'.
clinton_df = final_df.loc[final_df['candidate_label'] == 'clinton']
clinton_df.head()

Unnamed: 0_level_0,content,sentiment,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
131459315949_10152751596575950,"opinion: ""Hillary Clinton won opponents terrib...",Negative,clinton
131459315949_10152766742570950,Hillary Clinton promised National Rifle Associ...,Neutral,clinton
131459315949_10152957933900950,Katy Perry performed rally Hillary Clinton Sat...,Positive,clinton
131459315949_10153018635795950,"Politics Alert: poll, 62 percent Hillary Clint...",Positive,clinton
131459315949_10153173878100950,"voters mobbing Bernie Sanders events Iowa, Hil...",Neutral,clinton


In [22]:
# Posts whose candidate_label is 'other'.
other_df = final_df.loc[final_df['candidate_label'] == 'other']
other_df.head()

Unnamed: 0_level_0,content,sentiment,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
131459315949_10152791196665950,"5 crucial issues Canadian election, referendum...",Positive,other
131459315949_10152983500645950,election June 60 percent country vote oppositi...,Positive,other
131459315949_10153139941305950,Donald J. Trump referenced male anatomy descri...,Negative,other
131459315949_10153248450690950,"taste Hillary Clinton rally full 360 , Demi Lo...",Neutral,other
131459315949_10153257668255950,"""So stake year election? Well, things, fate pl...",Neutral,other


In [23]:
# Build (content, sentiment) tuples from all posts and split them by position:
# first half for training, second half for testing.
# NOTE(review): nothing is shuffled, so the split follows the row order of
# labeled_df - confirm this ordering does not bias the evaluation.
tuple_arr = list(zip(labeled_df.content,labeled_df.sentiment))
size = len(tuple_arr)
# Floor division keeps the slice bound an int; plain `/` gives a float under
# true division (Python 3 or `from __future__ import division`) and the slices
# below would then raise a TypeError.
halfway_pt = size // 2

print(size)

train = tuple_arr[:halfway_pt]
test = tuple_arr[halfway_pt:]
assert len(train) + len(test) == size
print(len(train))
print(len(test))

847
423
424


In [24]:
# Build (content, sentiment) tuples from the Trump posts and split them by
# position: first half for training, second half for testing.
# NOTE(review): nothing is shuffled, so the split follows the row order of
# trump_df - confirm this ordering does not bias the evaluation.
trump_tuple_arr = list(zip(trump_df.content,trump_df.sentiment))
size = len(trump_tuple_arr)
# Floor division keeps the slice bound an int; plain `/` gives a float under
# true division (Python 3 or `from __future__ import division`) and the slices
# below would then raise a TypeError.
halfway_pt = size // 2

print(size)

trump_train = trump_tuple_arr[:halfway_pt]
trump_test = trump_tuple_arr[halfway_pt:]
assert len(trump_train) + len(trump_test) == size
print(len(trump_train))
print(len(trump_test))

491
245
246


In [25]:
# Build (content, sentiment) tuples from the Clinton posts and split them by
# position: first half for training, second half for testing.
# NOTE(review): nothing is shuffled, so the split follows the row order of
# clinton_df - confirm this ordering does not bias the evaluation.
clinton_tuple_arr = list(zip(clinton_df.content,clinton_df.sentiment))
size = len(clinton_tuple_arr)
# Floor division keeps the slice bound an int; plain `/` gives a float under
# true division (Python 3 or `from __future__ import division`) and the slices
# below would then raise a TypeError.
halfway_pt = size // 2

print(size)

clinton_train = clinton_tuple_arr[:halfway_pt]
clinton_test = clinton_tuple_arr[halfway_pt:]
assert len(clinton_train) + len(clinton_test) == size
print(len(clinton_train))
print(len(clinton_test))

188
94
94


In [26]:
# Build (content, sentiment) tuples from the Other posts and split them by
# position: first half for training, second half for testing.
# NOTE(review): nothing is shuffled, so the split follows the row order of
# other_df - confirm this ordering does not bias the evaluation.
other_tuple_arr = list(zip(other_df.content,other_df.sentiment))
size = len(other_tuple_arr)
# Floor division keeps the slice bound an int; plain `/` gives a float under
# true division (Python 3 or `from __future__ import division`) and the slices
# below would then raise a TypeError.
halfway_pt = size // 2

print(size)

other_train = other_tuple_arr[:halfway_pt]
other_test = other_tuple_arr[halfway_pt:]
assert len(other_train) + len(other_test) == size
print(len(other_train))
print(len(other_test))

168
84
84


In [22]:
# Construct an NLTKClassifier from the Trump training tuples.
# NOTE(review): no accuracy is computed for this classifier later in the
# notebook (those lines are commented out) - confirm it is still needed.
trump_nltk_classifier = NLTKClassifier(trump_train)
print 'Done Training Trump NLTK Classifier!'

Done Training Trump NLTK Classifier!


In [23]:
# Construct an NLTKClassifier from the Clinton training tuples.
# NOTE(review): no accuracy is computed for this classifier later in the
# notebook (those lines are commented out) - confirm it is still needed.
clinton_nltk_classifier = NLTKClassifier(clinton_train)
print 'Done Training Clinton NLTK Classifier!'

Done Training Clinton NLTK Classifier!


In [24]:
# Construct an NLTKClassifier from the Other training tuples.
# NOTE(review): no accuracy is computed for this classifier later in the
# notebook (those lines are commented out) - confirm it is still needed.
other_nltk_classifier = NLTKClassifier(other_train)
print 'Done Training Other NLTK Classifier!'

Done Training Other NLTK Classifier!


In [25]:
# Construct a Naive Bayes classifier from the Trump (content, sentiment) training tuples.
trump_naivebayes_classifier = NaiveBayesClassifier(trump_train)
print 'Done Training Trump Naive Bayes Classifier!'

Done Training Trump Naive Bayes Classifier!


In [26]:
# Construct a Naive Bayes classifier from the Clinton (content, sentiment) training tuples.
clinton_naivebayes_classifier = NaiveBayesClassifier(clinton_train)
print 'Done Training Clinton Naive Bayes Classifier!'

Done Training Clinton Naive Bayes Classifier!


In [27]:
# Construct a Naive Bayes classifier from the Other (content, sentiment) training tuples.
other_naivebayes_classifier = NaiveBayesClassifier(other_train)
print 'Done Training Other Naive Bayes Classifier!'

Done Training Other Naive Bayes Classifier!


In [28]:
# Construct a decision tree classifier from the Trump (content, sentiment) training tuples.
trump_decisiontree_classifier = DecisionTreeClassifier(trump_train)
print 'Done Training Trump Decision Tree Classifier!'

Done Training Trump Decision Tree Classifier!


In [29]:
# Construct a decision tree classifier from the Clinton (content, sentiment) training tuples.
clinton_decisiontree_classifier = DecisionTreeClassifier(clinton_train)
print 'Done Training Clinton Decision Tree Classifier!'

Done Training Clinton Decision Tree Classifier!


In [30]:
# Construct a decision tree classifier from the Other (content, sentiment) training tuples.
other_decisiontree_classifier = DecisionTreeClassifier(other_train)
print 'Done Training Other Decision Tree Classifier!'

Done Training Other Decision Tree Classifier!


In [31]:
#positivenaivebayes_classifier = PositiveNaiveBayesClassifier(trump_train)
#print 'Done Training Positive Naive Bayes Classifier!'

In [32]:
# Construct a maximum entropy classifier from the Trump training tuples.
# NOTE(review): in the saved outputs the MaxEnt training log is printed by the
# accuracy cell rather than by this one, which suggests the model is fitted on
# first use and not here - confirm against the TextBlob documentation.
trump_maxent_classifier = MaxEntClassifier(trump_train)
print 'Done Training Trump Max Ent Classifier!'

Done Training Trump Max Ent Classifier!


In [33]:
# Construct a maximum entropy classifier from the Clinton training tuples.
# NOTE(review): in the saved outputs the MaxEnt training log is printed by the
# accuracy cell rather than by this one, which suggests the model is fitted on
# first use and not here - confirm against the TextBlob documentation.
clinton_maxent_classifier = MaxEntClassifier(clinton_train)
print 'Done Training Clinton Max Ent Classifier!'

Done Training Clinton Max Ent Classifier!


In [34]:
# Construct a maximum entropy classifier from the Other training tuples.
# NOTE(review): in the saved outputs the MaxEnt training log is printed by the
# accuracy cell rather than by this one, which suggests the model is fitted on
# first use and not here - confirm against the TextBlob documentation.
other_maxent_classifier = MaxEntClassifier(other_train)
print 'Done Training Other Max Ent Classifier!'

Done Training Other Max Ent Classifier!


In [35]:
# Score each per-candidate classifier on its held-out test split and keep the
# results in notebook-level variables for the reporting cell.
# No accuracy is computed for the NLTKClassifier instances, and
# PositiveNaiveBayesClassifier is not used (its construction is commented out).
# NOTE(review): the saved output of this cell contains MaxEnt training logs, so
# at least that classifier appears to be fitted here on first use - confirm.
# The same saved output shows a `nan` log likelihood for the Trump MaxEnt
# model, which is worth investigating before trusting its accuracy.
print "Calculating Trump Accuracy"
trump_naivebayes_classifier_accuracy = trump_naivebayes_classifier.accuracy(trump_test)
trump_decisiontree_classifier_accuracy = trump_decisiontree_classifier.accuracy(trump_test)
trump_maxent_classifier_accuracy = trump_maxent_classifier.accuracy(trump_test)

print "Calculating Clinton Accuracy"
clinton_naivebayes_classifier_accuracy = clinton_naivebayes_classifier.accuracy(clinton_test)
clinton_decisiontree_classifier_accuracy = clinton_decisiontree_classifier.accuracy(clinton_test)
clinton_maxent_classifier_accuracy = clinton_maxent_classifier.accuracy(clinton_test)

print "Calculating Other Accuracy"
other_naivebayes_classifier_accuracy = other_naivebayes_classifier.accuracy(other_test)
other_decisiontree_classifier_accuracy = other_decisiontree_classifier.accuracy(other_test)
other_maxent_classifier_accuracy = other_maxent_classifier.accuracy(other_test)

Calculating Trump Accuracy
  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.278
         Final               nan        0.278
Calculating Clinton Accuracy
  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.447
             2          -0.06331        0.989
             3          -0.01710        0.989
             4          -0.01179        0.989
             5          -0.01091        0.989
             6          -0.01074        0.989
             7          -0.01071        0.989
             8          -0.01070        0.989
             9          -0.01070        0.989
            10          -0.01070        0.989
            11          -0.01070        0.989
            12          -0.01070        0.989
            13          -0.01070        0.989
    

In [36]:
print "Trump Accuracy:"
#print "NLTK Accuracy: ", nltk_classifier_accuracy
print "Naive Bayes Accuracy: ", trump_naivebayes_classifier_accuracy
print "Decision Tree Accuracy: ", trump_decisiontree_classifier_accuracy
#print "Positive Naive Bayes Accuracy: ", positivenaivebayes_classifier_accuracy
print "Maxent Accuracy: ", trump_maxent_classifier_accuracy

print "Clinton Accuracy:"
#print "NLTK Accuracy: ", nltk_classifier_accuracy
print "Naive Bayes Accuracy: ", clinton_naivebayes_classifier_accuracy
print "Decision Tree Accuracy: ", clinton_decisiontree_classifier_accuracy
#print "Positive Naive Bayes Accuracy: ", positivenaivebayes_classifier_accuracy
print "Maxent Accuracy: ", clinton_maxent_classifier_accuracy

print "Other Accuracy:"
#print "NLTK Accuracy: ", nltk_classifier_accuracy
print "Naive Bayes Accuracy: ", other_naivebayes_classifier_accuracy
print "Decision Tree Accuracy: ", other_decisiontree_classifier_accuracy
#print "Positive Naive Bayes Accuracy: ", positivenaivebayes_classifier_accuracy
print "Maxent Accuracy: ", other_maxent_classifier_accuracy

Trump Accuracy:
Naive Bayes Accuracy:  0.467479674797
Decision Tree Accuracy:  0.382113821138
Maxent Accuracy:  0.284552845528
Clinton Accuracy:
Naive Bayes Accuracy:  0.5
Decision Tree Accuracy:  0.5
Maxent Accuracy:  0.468085106383
Other Accuracy:
Naive Bayes Accuracy:  0.559523809524
Decision Tree Accuracy:  0.5
Maxent Accuracy:  0.571428571429
