# Classifying with TextBlob

In [1]:
from textblob.classifiers import NLTKClassifier,NaiveBayesClassifier,DecisionTreeClassifier,PositiveNaiveBayesClassifier,MaxEntClassifier
import pandas as pd
import re

path = '/Users/Jared/Documents/CSCE489/489Project/'

### Grabbing the Data

In [None]:
def read_in_turk_csv(path):
    df = pd.read_csv(path)
    df_with_relevant_columns = df[['Input.status_id','Input.content','Answer.sentiment']]
    df_with_relevant_columns.columns = ['status_id','content','sentiment']
    return def_with_relevant_columns

In [2]:
#Load Turk Data
turk_df = pd.read_csv(path + 'Turk/Batch_2606965_batch_results.csv') 
turk_df.head()

Unnamed: 0,HITId,HITTypeId,Title,Description,Keywords,Reward,CreationTime,MaxAssignments,RequesterAnnotation,AssignmentDurationInSeconds,...,RequesterFeedback,WorkTimeInSeconds,LifetimeApprovalRate,Last30DaysApprovalRate,Last7DaysApprovalRate,Input.status_id,Input.content,Answer.sentiment,Approve,Reject
0,301KG0KX9CFC0MHLW3ECT80VTS3H2D,3WROYBU7BMMRCGD41SQ9YGNC31GDCO,Preseidential Election Sentiment Analysis,Analyze the sentiment of the provided Facebook...,sentiment election Donald Trump Hilary Clinton...,$0.02,Wed Nov 23 21:44:16 PST 2016,3,BatchId:2606965;,172800,...,,3,100% (143/143),100% (143/143),100% (143/143),13652355666_10154036300320667,Thousands of demonstrators filled the streets ...,Positive,,
1,301KG0KX9CFC0MHLW3ECT80VTS3H2D,3WROYBU7BMMRCGD41SQ9YGNC31GDCO,Preseidential Election Sentiment Analysis,Analyze the sentiment of the provided Facebook...,sentiment election Donald Trump Hilary Clinton...,$0.02,Wed Nov 23 21:44:16 PST 2016,3,BatchId:2606965;,172800,...,,3,100% (349/349),100% (349/349),100% (349/349),13652355666_10154036300320667,Thousands of demonstrators filled the streets ...,Positive,,
2,301KG0KX9CFC0MHLW3ECT80VTS3H2D,3WROYBU7BMMRCGD41SQ9YGNC31GDCO,Preseidential Election Sentiment Analysis,Analyze the sentiment of the provided Facebook...,sentiment election Donald Trump Hilary Clinton...,$0.02,Wed Nov 23 21:44:16 PST 2016,3,BatchId:2606965;,172800,...,,7,100% (132/132),100% (132/132),100% (132/132),13652355666_10154036300320667,Thousands of demonstrators filled the streets ...,Neutral,,
3,301KG0KX9CFC0MHLW3ECT80VTS42HZ,3WROYBU7BMMRCGD41SQ9YGNC31GDCO,Preseidential Election Sentiment Analysis,Analyze the sentiment of the provided Facebook...,sentiment election Donald Trump Hilary Clinton...,$0.02,Wed Nov 23 21:44:36 PST 2016,3,BatchId:2606965;,172800,...,,51,100% (147/147),100% (147/147),100% (147/147),131459315949_10154056577745950,From Barack Obama's 2003 Senate campaign to Hi...,Positive,,
4,301KG0KX9CFC0MHLW3ECT80VTS42HZ,3WROYBU7BMMRCGD41SQ9YGNC31GDCO,Preseidential Election Sentiment Analysis,Analyze the sentiment of the provided Facebook...,sentiment election Donald Trump Hilary Clinton...,$0.02,Wed Nov 23 21:44:36 PST 2016,3,BatchId:2606965;,172800,...,,3,100% (99/99),100% (99/99),100% (99/99),131459315949_10154056577745950,From Barack Obama's 2003 Senate campaign to Hi...,Neutral,,


In [3]:
#Narrow columns down to relative information
relevant_df = turk_df[['Input.status_id','Input.content','Answer.sentiment']]
relevant_df.columns = ['status_id','content','sentiment']
relevant_df.head()

Unnamed: 0,status_id,content,sentiment
0,13652355666_10154036300320667,Thousands of demonstrators filled the streets ...,Positive
1,13652355666_10154036300320667,Thousands of demonstrators filled the streets ...,Positive
2,13652355666_10154036300320667,Thousands of demonstrators filled the streets ...,Neutral
3,131459315949_10154056577745950,From Barack Obama's 2003 Senate campaign to Hi...,Positive
4,131459315949_10154056577745950,From Barack Obama's 2003 Senate campaign to Hi...,Neutral


### Condense Turk Reviews

In [None]:
#Function that counts the number of Positive, Neutral, and Negative sentiment reviews and chooses an official review
#2 or more of review = that review
#Anything else is Inconclusive
def choose_sentiment(label):
    if label.count('Positive') >= 2:
        return 'Positive'
    elif label.count('Neutral') >= 2:
        return 'Neutral'
    elif label.count('Negative') >= 2:
        return 'Negative'
    else:
        return 'Inconclusive'

def condense_and_label_rows(df):
    #Split into two, one with post content and one with sentiment review. 
    #This is necessary so post content doesn't get stacked when combining rows
    content_df = df[['status_id','content']]
    sentiment_df = df[['status_id','sentiment']]
    
    #Combines all rows with the same status_id. 
    #It only keeps the same post (content column) its finds which is ok because they are all the same.
    content_df = content_df.groupby(content_df.status_id).first()
    #content_df.head()
    
    #Combines all rows with the same statuse id
    #Combines all sentiment reviews as well
    sentiment_df = sentiment_df.groupby(sentiment_df.status_id).sum()
    #sentiment_df.head()
    
    #Chooses an offical sentiment review based on the three aggregated ones
    sentiment_df['sentiment'] = sentiment_df.sentiment.apply(choose_sentiment)
    #sentiment_df.head()
    
    #Merged the sentiment Dataframe and post Dataframe by combining like status_ids
    #Note: It uses a right merge, so only items in the sentiment dataframe are kept.
    merged_df = content_df.merge(sentiment_df,how='right',left_index=True,right_index=True)
    merged_df.head()
    

In [4]:
#Split into two, one with post content and one with sentiment review. 
#This is necessary so post content doesn't get stacked when combining rows
content_df = relevant_df[['status_id','content']]
sentiment_df = relevant_df[['status_id','sentiment']]

In [6]:
#Combines all rows with the same status_id. 
#It only keeps the same post (content column) its finds which is ok because they are all the same.
content_df = content_df.groupby(content_df.status_id).first()
content_df.head()

Unnamed: 0_level_0,content
status_id,Unnamed: 1_level_1
131459315949_10152751596575950,"One opinion: ""Hillary Clinton won because all ..."
131459315949_10152753624975950,Second fiddle to Hillary Clinton in the debate...
131459315949_10152754227800950,Donald J. Trump may not attend the next Republ...
131459315949_10152758428025950,Conservatives have criticized Donald J. Trump ...
131459315949_10152766742570950,Hillary Clinton has promised to take on the Na...


In [5]:
#Removes any non-ascii characters. TextBlob cannot process them
def remove_non_ascii(text):
    return re.sub(r'[^\x00-\x7F]',' ', text)

In [7]:
#Removes all non-ascii characters from the content column
content_df['content'] = content_df.content.apply(remove_non_ascii)
content_df.head()

Unnamed: 0_level_0,content
status_id,Unnamed: 1_level_1
131459315949_10152751596575950,"One opinion: ""Hillary Clinton won because all ..."
131459315949_10152753624975950,Second fiddle to Hillary Clinton in the debate...
131459315949_10152754227800950,Donald J. Trump may not attend the next Republ...
131459315949_10152758428025950,Conservatives have criticized Donald J. Trump ...
131459315949_10152766742570950,Hillary Clinton has promised to take on the Na...


In [8]:
#Function that counts the number of Positive, Neutral, and Negative sentiment reviews and chooses an official review
#2 or more of review = that review
#Anything else is Inconclusive
def choose_sentiment(label):
    if label.count('Positive') >= 2:
        return 'Positive'
    elif label.count('Neutral') >= 2:
        return 'Neutral'
    elif label.count('Negative') >= 2:
        return 'Negative'
    else:
        return 'Inconclusive'

In [9]:
#Combines all rows with the same statuse id
#Combines all sentiment reviews as well
sentiment_df = sentiment_df.groupby(sentiment_df.status_id).sum()
sentiment_df.head()

Unnamed: 0_level_0,sentiment
status_id,Unnamed: 1_level_1
131459315949_10152751596575950,NeutralNegativeNegative
131459315949_10152753624975950,NeutralPositiveNegative
131459315949_10152754227800950,PositiveNeutralNeutral
131459315949_10152758428025950,NegativeNegativeNeutral
131459315949_10152766742570950,NeutralNegativeNeutral


In [10]:
#Chooses an offical sentiment review based on the three aggregated ones
sentiment_df['sentiment'] = sentiment_df.sentiment.apply(choose_sentiment)
sentiment_df.head()

Unnamed: 0_level_0,sentiment
status_id,Unnamed: 1_level_1
131459315949_10152751596575950,Negative
131459315949_10152753624975950,Inconclusive
131459315949_10152754227800950,Neutral
131459315949_10152758428025950,Negative
131459315949_10152766742570950,Neutral


In [11]:
#Discards all Inconclusive sentiment reviews
sentiment_df = sentiment_df[sentiment_df.sentiment != 'Inconclusive']
sentiment_df.head()

Unnamed: 0_level_0,sentiment
status_id,Unnamed: 1_level_1
131459315949_10152751596575950,Negative
131459315949_10152754227800950,Neutral
131459315949_10152758428025950,Negative
131459315949_10152766742570950,Neutral
131459315949_10152791196665950,Positive


In [12]:
#Merged the sentiment Dataframe and post Dataframe by combining like status_ids
#Note: It uses a right merge, so only items in the sentiment dataframe are kept.
merged_df = content_df.merge(sentiment_df,how='right',left_index=True,right_index=True)
merged_df.head()

Unnamed: 0_level_0,content,sentiment
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1
131459315949_10152751596575950,"One opinion: ""Hillary Clinton won because all ...",Negative
131459315949_10152754227800950,Donald J. Trump may not attend the next Republ...,Neutral
131459315949_10152758428025950,Conservatives have criticized Donald J. Trump ...,Negative
131459315949_10152766742570950,Hillary Clinton has promised to take on the Na...,Neutral
131459315949_10152791196665950,A look at 5 crucial issues in the Canadian ele...,Positive


In [13]:
#Classifies a post as Trump, Clinton, or Other
#Rules:
#If a review talks about only Trump, and not Clinotn, it is trump
#If a review talks about only Clinton, and not Trump, it is clinton
#Everything else is other 
def label_post (content):
   status = content.lower()
   if (('donald' in status) or ('trump' in status)) and (('hillary' in status) or ('clinton' in status)) :
      return 'other'
   elif ('donald' in status) or ('trump' in status) :
      return 'trump'
   elif ('hillary' in status) or ('clinton' in status) :
      return 'clinton'
   else:
      return 'other'

def label_df(df):
    df['candidate_label'] = df.content.apply(classify_post)
    return df
    

In [14]:
#Labels all posts as other, Clinton, or other
labeled_df = merged_df
labeled_df['candidate_label'] = merged_df.content.apply(classify_post)
labeled_df.head()

Unnamed: 0_level_0,content,sentiment,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
131459315949_10152751596575950,"One opinion: ""Hillary Clinton won because all ...",Negative,clinton
131459315949_10152754227800950,Donald J. Trump may not attend the next Republ...,Neutral,trump
131459315949_10152758428025950,Conservatives have criticized Donald J. Trump ...,Negative,trump
131459315949_10152766742570950,Hillary Clinton has promised to take on the Na...,Neutral,clinton
131459315949_10152791196665950,A look at 5 crucial issues in the Canadian ele...,Positive,other


In [70]:
#Function to remove special characters
def remove_special_characters(s):
    return ''.join(e for e in s if (e.isalpha( ) or (' ' in e)))

#Function to read in stop words
def stopword_file_to_list(file_name):
    f = open(path + "StopWordLists/" + file_name)
    file_text = f.read()
    return file_text.split("\r")

#Function to read in stop words
def stopword_file_to_list(file_name):
    f = open(path + "StopWordLists/" + file_name)
    file_text = f.read()
    return file_text.split("\r")

#Remove words in list from string
def remove_words(s,words_to_remove):
    word_list = s.split()
    resultwords  = [word for word in word_list if word.lower() not in words_to_remove]
    result = ' '.join(resultwords)
    return result

def clean_df(df,stop_word_file_path="comprehensive_stopwords.txt",column_to_clean="content"):
    #Remove all special characters
    labeled_df_without_special_chars = df
    #''.join(e for e in string if (e.isalpha()) ' ' in e)
    labeled_df_without_special_chars[column_to_clean] = labeled_df_without_special_chars.content.apply(remove_special_characters)
    labeled_df_without_special_chars.head()
    
    #Get Stop Word List
    stopword_list = stopword_file_to_list(stop_word_file_path)
    
    labeled_df_without_stopwords = labeled_df_without_special_chars
    labeled_df_without_stopwords[column_to_clean] = merged_df.apply (lambda row: remove_words(row[column_to_clean],comprehensive_stopword_list),axis=1)
    #labeled_df_without_stopwords.head()
    
    #Remove candidate names
    labeled_df_without_candidate_name = labeled_df_without_stopwords
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Hillary',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('hillary',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Clinton',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('clinton',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Donald',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('donald',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Trump',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('trump',''))
    #labeled_df_without_candidate_name.head()
    
    return labeled_df_without_candidate_name
    

In [15]:
#Function to remove special characters
def remove_special_characters(s):
    return ''.join(e for e in s if (e.isalpha( ) or (' ' in e)))

In [16]:
#Remove all special characters
labeled_df_without_special_chars = labeled_df
#''.join(e for e in string if (e.isalpha()) ' ' in e)
labeled_df_without_special_chars['content'] = labeled_df_without_special_chars.content.apply(remove_special_characters)
labeled_df_without_special_chars.head()

Unnamed: 0_level_0,content,sentiment,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
131459315949_10152751596575950,One opinion Hillary Clinton won because all of...,Negative,clinton
131459315949_10152754227800950,Donald J Trump may not attend the next Republi...,Neutral,trump
131459315949_10152758428025950,Conservatives have criticized Donald J Trump f...,Negative,trump
131459315949_10152766742570950,Hillary Clinton has promised to take on the Na...,Neutral,clinton
131459315949_10152791196665950,A look at crucial issues in the Canadian elec...,Positive,other


In [17]:
#Function to read in stop words
def stopword_file_to_list(file_name):
    f = open(path + "StopWordLists/" + file_name)
    file_text = f.read()
    return file_text.split("\r")

In [18]:
#Read Stop Word List into Memory
comprehensive_stopword_list = stopword_file_to_list("comprehensive_stopwords.txt")
default_english_stopword_list = stopword_file_to_list("default_english_stopwords.txt")
mysql_stopword_list = stopword_file_to_list("mysql_stopwords.txt")
print comprehensive_stopword_list[:10]
print default_english_stopword_list[:10]
print mysql_stopword_list[:10]

['a', 'able', 'about', 'above', 'abst', 'accordance', 'according', 'accordingly', 'across', 'act']
['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and']
["a's", 'accordingly', 'again', 'allows', 'also', 'amongst', 'anybody', 'anyways', 'appropriate', 'aside']


In [19]:
#Remove words in list from string
def remove_words(s,words_to_remove):
    word_list = s.split()
    resultwords  = [word for word in word_list if word.lower() not in words_to_remove]
    result = ' '.join(resultwords)
    return result

In [20]:
labeled_df_without_stopwords = labeled_df_without_special_chars
labeled_df_without_stopwords['content'] = merged_df.apply (lambda row: remove_words(row['content'],comprehensive_stopword_list),axis=1)
labeled_df_without_stopwords.head()

Unnamed: 0_level_0,content,sentiment,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
131459315949_10152751596575950,opinion Hillary Clinton won opponents terrible...,Negative,clinton
131459315949_10152754227800950,Donald Trump attend Republican presidential de...,Neutral,trump
131459315949_10152758428025950,Conservatives criticized Donald Trump expressi...,Negative,trump
131459315949_10152766742570950,Hillary Clinton promised National Rifle Associ...,Neutral,clinton
131459315949_10152791196665950,crucial issues Canadian election referendum Pr...,Positive,other


In [21]:
labeled_df_without_candidate_name = labeled_df_without_stopwords
#labeled_df_without_candidate_name['content'] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Hillary',''))
#labeled_df_without_candidate_name['content'] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('hillary',''))
#labeled_df_without_candidate_name['content'] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Clinton',''))
#labeled_df_without_candidate_name['content'] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('clinton',''))
#labeled_df_without_candidate_name['content'] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Donald',''))
#labeled_df_without_candidate_name['content'] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('donald',''))
#labeled_df_without_candidate_name['content'] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Trump',''))
#labeled_df_without_candidate_name['content'] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('trump',''))
labeled_df_without_candidate_name.head()

Unnamed: 0_level_0,content,sentiment,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
131459315949_10152751596575950,opinion Hillary Clinton won opponents terrible...,Negative,clinton
131459315949_10152754227800950,Donald Trump attend Republican presidential de...,Neutral,trump
131459315949_10152758428025950,Conservatives criticized Donald Trump expressi...,Negative,trump
131459315949_10152766742570950,Hillary Clinton promised National Rifle Associ...,Neutral,clinton
131459315949_10152791196665950,crucial issues Canadian election referendum Pr...,Positive,other


In [22]:
#We're done making changes, assign out current work to the final df
final_df = labeled_df_without_candidate_name
final_df.head()

Unnamed: 0_level_0,content,sentiment,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
131459315949_10152751596575950,opinion Hillary Clinton won opponents terrible...,Negative,clinton
131459315949_10152754227800950,Donald Trump attend Republican presidential de...,Neutral,trump
131459315949_10152758428025950,Conservatives criticized Donald Trump expressi...,Negative,trump
131459315949_10152766742570950,Hillary Clinton promised National Rifle Associ...,Neutral,clinton
131459315949_10152791196665950,crucial issues Canadian election referendum Pr...,Positive,other


In [23]:
trump_df = final_df[final_df.candidate_label == 'trump']
trump_df.head()

Unnamed: 0_level_0,content,sentiment,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
131459315949_10152754227800950,Donald Trump attend Republican presidential de...,Neutral,trump
131459315949_10152758428025950,Conservatives criticized Donald Trump expressi...,Negative,trump
131459315949_10152871191975950,Donald Trump legality closing mosques looked,Neutral,trump
131459315949_10152978451960950,Dr Ben Carson lead Republican presidential cam...,Positive,trump
131459315949_10153076683820950,fighting politically correct war Donald Trump ...,Neutral,trump


In [24]:
clinton_df = final_df[final_df.candidate_label == 'clinton']
clinton_df.head()

Unnamed: 0_level_0,content,sentiment,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
131459315949_10152751596575950,opinion Hillary Clinton won opponents terrible...,Negative,clinton
131459315949_10152766742570950,Hillary Clinton promised National Rifle Associ...,Neutral,clinton
131459315949_10152957933900950,Katy Perry performed rally Hillary Clinton Sat...,Positive,clinton
131459315949_10153018635795950,Politics Alert poll percent Hillary Clinton br...,Positive,clinton
131459315949_10153173878100950,voters mobbing Bernie Sanders events Iowa Hill...,Neutral,clinton


In [25]:
other_df = final_df[final_df.candidate_label == 'other']
other_df.head()

Unnamed: 0_level_0,content,sentiment,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
131459315949_10152791196665950,crucial issues Canadian election referendum Pr...,Positive,other
131459315949_10152983500645950,election June percent country vote opposition ...,Positive,other
131459315949_10153139941305950,Donald Trump referenced male anatomy describe ...,Negative,other
131459315949_10153248450690950,taste Hillary Clinton rally full Demi Lovato l...,Neutral,other
131459315949_10153257668255950,stake year election Well things fate planet Re...,Neutral,other


In [26]:
#Creating Tuple Arr from all Posts
tuple_arr = list(zip(final_df.content,final_df.sentiment))
size = len(tuple_arr)
halfway_pt = size/2

print size

train = tuple_arr[:halfway_pt]
test = tuple_arr[halfway_pt:]
print len(train)
print len(test)

847
423
424


In [27]:
#Creating Tuple Arr from trump Posts
trump_tuple_arr = list(zip(trump_df.content,trump_df.sentiment))
size = len(trump_tuple_arr)
trump_halfway_pt = size/2

print size

trump_train = trump_tuple_arr[:trump_halfway_pt]
trump_test = trump_tuple_arr[trump_halfway_pt:]
print len(trump_train)
print len(trump_test)

491
245
246


In [28]:
#Creating Tuple Arr from Clinton Posts
clinton_tuple_arr = list(zip(clinton_df.content,clinton_df.sentiment))
size = len(clinton_tuple_arr)
clinton_halfway_pt = size/2

print size

clinton_train = clinton_tuple_arr[:clinton_halfway_pt]
clinton_test = clinton_tuple_arr[clinton_halfway_pt:]
print len(clinton_train)
print len(clinton_test)

188
94
94


In [29]:
#Creating Tuple Arr from Other Posts
other_tuple_arr = list(zip(other_df.content,other_df.sentiment))
size = len(other_tuple_arr)
other_halfway_pt = size/2

print size

other_train = other_tuple_arr[:other_halfway_pt]
other_test = other_tuple_arr[other_halfway_pt:]
print len(other_train)
print len(other_test)

168
84
84


## Train the Classifiers

In [30]:
all_naivebayes_classifier = NaiveBayesClassifier(train)
print 'Done Training All Post Naive Bayes Classifier!'

Done Training All Post Naive Bayes Classifier!


In [31]:
all_decisiontree_classifier = DecisionTreeClassifier(train)
print 'Done Training ALL post Decision Tree Classifier!'

Done Training ALL post Decision Tree Classifier!


In [32]:
all_maxent_classifier = MaxEntClassifier(train)
print 'Done Training ALL Max Ent Classifier!'

Done Training ALL Max Ent Classifier!


In [33]:
trump_naivebayes_classifier = NaiveBayesClassifier(trump_train)
print 'Done Training other Naive Bayes Classifier!'

Done Training other Naive Bayes Classifier!


In [34]:
clinton_naivebayes_classifier = NaiveBayesClassifier(clinton_train)
print 'Done Training Clinton Naive Bayes Classifier!'

Done Training Clinton Naive Bayes Classifier!


In [35]:
other_naivebayes_classifier = NaiveBayesClassifier(other_train)
print 'Done Training Other Naive Bayes Classifier!'

Done Training Other Naive Bayes Classifier!


In [36]:
trump_decisiontree_classifier = DecisionTreeClassifier(trump_train)
print 'Done Training other Decision Tree Classifier!'

Done Training other Decision Tree Classifier!


In [37]:
clinton_decisiontree_classifier = DecisionTreeClassifier(clinton_train)
print 'Done Training Clinton Decision Tree Classifier!'

Done Training Clinton Decision Tree Classifier!


In [38]:
other_decisiontree_classifier = DecisionTreeClassifier(other_train)
print 'Done Training Other Decision Tree Classifier!'

Done Training Other Decision Tree Classifier!


In [39]:
#positivenaivebayes_classifier = PositiveNaiveBayesClassifier(other_train)
#print 'Done Training Positive Naive Bayes Classifier!'

In [40]:
trump_maxent_classifier = MaxEntClassifier(trump_train)
print 'Done Training other Max Ent Classifier!'

Done Training other Max Ent Classifier!


In [41]:
clinton_maxent_classifier = MaxEntClassifier(clinton_train)
print 'Done Training Clinton Max Ent Classifier!'

Done Training Clinton Max Ent Classifier!


In [42]:
other_maxent_classifier = MaxEntClassifier(other_train)
print 'Done Training Other Max Ent Classifier!'

Done Training Other Max Ent Classifier!


## Classify the Dataframe

In [43]:
#Classify the final DataFrame
final_df["naive_bayes_classification"] = final_df.content.apply(lambda row: all_naivebayes_classifier.classify(row))
final_df["decisiontree_classification"] = final_df.content.apply(lambda row: all_decisiontree_classifier.classify(row))
final_df["maxent_classification"] = final_df.content.apply(lambda row: all_maxent_classifier.classify(row))
print "Done Classifying the final Dataframe"

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.314
         Final               nan        0.314
Done Classifying the final Dataframe


In [44]:
#Classify the Trump DataFrame
trump_df["naive_bayes_classification"] = trump_df.content.apply(lambda row: trump_naivebayes_classifier.classify(row))
trump_df["decisiontree_classification"] = trump_df.content.apply(lambda row: trump_decisiontree_classifier.classify(row))
trump_df["maxent_classification"] = trump_df.content.apply(lambda row: trump_maxent_classifier.classify(row))
print "Done Classifying the Trump Dataframe"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.278
         Final               nan        0.278
Done Classifying the Trump Dataframe


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [45]:
#Classify the Clinton DataFrame with Naive Bayes
clinton_df["naive_bayes_classification"] = clinton_df.content.apply(lambda row: clinton_naivebayes_classifier.classify(row))
clinton_df["decisiontree_classification"] = clinton_df.content.apply(lambda row: clinton_decisiontree_classifier.classify(row))
clinton_df["maxent_classifier"] = clinton_df.content.apply(lambda row: clinton_maxent_classifier.classify(row))
clinton_df.head()
print "Done Classifying the Clinton Dataframe"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.447
             2          -0.15938        0.989
             3          -0.04926        0.989
             4          -0.02533        0.989
             5          -0.01771        0.989
             6          -0.01461        0.989
             7          -0.01308        0.989
             8          -0.01222        0.989
             9          -0.01170        0.989
            10          -0.01137        0.989
            11          -0.01115        0.989
            12          -0.01100        0.989
            13          -0.01090        0.989
            14          -0.01083        0.989
            15          -0.01079        0.989
            16          -0.01076        0.989
            17          -0.01074        0.989
            18          -0.01072        0.989
            19          -0.01072        0.989
 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [46]:
#Classify the Other DataFrame
other_df["naive_bayes_classification"] = other_df.content.apply(lambda row: other_naivebayes_classifier.classify(row))
other_df["decisiontree_classification"] = other_df.content.apply(lambda row: other_decisiontree_classifier.classify(row))
other_df["maxent_classifier"] = other_df.content.apply(lambda row: other_maxent_classifier.classify(row))
other_df.head()
print "Done Classifying the Other Dataframe"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.286
             2          -0.26371        1.000
             3          -0.09363        1.000
             4          -0.04424        1.000
             5          -0.02544        1.000
             6          -0.01707        1.000
             7          -0.01293        1.000
             8          -0.01069        1.000
             9          -0.00940        1.000
            10          -0.00861        1.000
            11          -0.00811        1.000
            12          -0.00777        1.000
            13          -0.00754        1.000
            14          -0.00738        1.000
            15          -0.00726        1.000
            16          -0.00718        1.000
            17          -0.00712        1.000
            18          -0.00708        1.000
            19          -0.00705        1.000
 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Running the Multi-Model Classifier

In [47]:
final_df["multi_model_classification"] = final_df.naive_bayes_classification+final_df.decisiontree_classification+final_df.maxent_classification
final_df["multi_model_classification"] = final_df.multi_model_classification.apply(choose_sentiment)
final_df["multi_model_classification"] = final_df.apply(lambda x: x["naive_bayes_classification"] if x["multi_model_classification"] == "Inconclusive" else x["multi_model_classification"],axis=1)
final_df.head()

Unnamed: 0_level_0,content,sentiment,candidate_label,naive_bayes_classification,decisiontree_classification,maxent_classification,multi_model_classification
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
131459315949_10152751596575950,opinion Hillary Clinton won opponents terrible...,Negative,clinton,Negative,Negative,Positive,Negative
131459315949_10152754227800950,Donald Trump attend Republican presidential de...,Neutral,trump,Neutral,Neutral,Positive,Neutral
131459315949_10152758428025950,Conservatives criticized Donald Trump expressi...,Negative,trump,Negative,Negative,Positive,Negative
131459315949_10152766742570950,Hillary Clinton promised National Rifle Associ...,Neutral,clinton,Neutral,Neutral,Positive,Neutral
131459315949_10152791196665950,crucial issues Canadian election referendum Pr...,Positive,other,Positive,Positive,Positive,Positive


In [48]:
trump_df["multi_model_classification"] = trump_df.naive_bayes_classification+trump_df.decisiontree_classification+trump_df.maxent_classification
trump_df["multi_model_classification"] = trump_df.multi_model_classification.apply(choose_sentiment)
trump_df["multi_model_classification"] = trump_df.apply(lambda x: x["naive_bayes_classification"] if x["multi_model_classification"] == "Inconclusive" else x["multi_model_classification"],axis=1)
trump_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0_level_0,content,sentiment,candidate_label,naive_bayes_classification,decisiontree_classification,maxent_classification,multi_model_classification
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
131459315949_10152754227800950,Donald Trump attend Republican presidential de...,Neutral,trump,Neutral,Neutral,Positive,Neutral
131459315949_10152758428025950,Conservatives criticized Donald Trump expressi...,Negative,trump,Negative,Negative,Positive,Negative
131459315949_10152871191975950,Donald Trump legality closing mosques looked,Neutral,trump,Neutral,Neutral,Positive,Neutral
131459315949_10152978451960950,Dr Ben Carson lead Republican presidential cam...,Positive,trump,Positive,Negative,Positive,Positive
131459315949_10153076683820950,fighting politically correct war Donald Trump ...,Neutral,trump,Neutral,Neutral,Positive,Neutral


In [49]:
clinton_df["multi_model_classification"] = clinton_df.naive_bayes_classification+clinton_df.decisiontree_classification+clinton_df.maxent_classifier
clinton_df["multi_model_classification"] = clinton_df.multi_model_classification.apply(choose_sentiment)
clinton_df["multi_model_classification"] = clinton_df.apply(lambda x: x["naive_bayes_classification"] if x["multi_model_classification"] == "Inconclusive" else x["multi_model_classification"],axis=1)
clinton_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0_level_0,content,sentiment,candidate_label,naive_bayes_classification,decisiontree_classification,maxent_classifier,multi_model_classification
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
131459315949_10152751596575950,opinion Hillary Clinton won opponents terrible...,Negative,clinton,Neutral,Negative,Negative,Negative
131459315949_10152766742570950,Hillary Clinton promised National Rifle Associ...,Neutral,clinton,Neutral,Neutral,Neutral,Neutral
131459315949_10152957933900950,Katy Perry performed rally Hillary Clinton Sat...,Positive,clinton,Positive,Positive,Positive,Positive
131459315949_10153018635795950,Politics Alert poll percent Hillary Clinton br...,Positive,clinton,Positive,Positive,Positive,Positive
131459315949_10153173878100950,voters mobbing Bernie Sanders events Iowa Hill...,Neutral,clinton,Neutral,Neutral,Neutral,Neutral


In [50]:
other_df["multi_model_classification"] = other_df.naive_bayes_classification+other_df.decisiontree_classification+other_df.maxent_classifier
other_df["multi_model_classification"] = other_df.multi_model_classification.apply(choose_sentiment)
other_df["multi_model_classification"] = other_df.apply(lambda x: x["naive_bayes_classification"] if x["multi_model_classification"] == "Inconclusive" else x["multi_model_classification"],axis=1)
other_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0_level_0,content,sentiment,candidate_label,naive_bayes_classification,decisiontree_classification,maxent_classifier,multi_model_classification
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
131459315949_10152791196665950,crucial issues Canadian election referendum Pr...,Positive,other,Positive,Positive,Positive,Positive
131459315949_10152983500645950,election June percent country vote opposition ...,Positive,other,Positive,Positive,Positive,Positive
131459315949_10153139941305950,Donald Trump referenced male anatomy describe ...,Negative,other,Negative,Negative,Negative,Negative
131459315949_10153248450690950,taste Hillary Clinton rally full Demi Lovato l...,Neutral,other,Neutral,Neutral,Neutral,Neutral
131459315949_10153257668255950,stake year election Well things fate planet Re...,Neutral,other,Neutral,Neutral,Neutral,Neutral


## Testing Accuracy

In [71]:
def judge_accuracy(df,column_to_judge,correct_column):
    num_of_correct = 0
    for index,row in df.iterrows():
        if row[column_to_judge] == row[correct_column]:
            num_of_correct+=1
    return float(num_of_correct)/len(df.index)

In [72]:
def judge_poor_prediction_accuracy(df,column_to_judge,correct_column):
    num_of_poor_predictions = 0
    for index,row in df.iterrows():
        if ((row[column_to_judge] == 'Positive') and (row[correct_column] == 'Negative')) or ((row[column_to_judge] == 'Negative') and (row[correct_column] == 'Positive')):
            num_of_poor_predictions+=1
    return float(num_of_poor_predictions)/len(df.index)

In [53]:
print "Final Accuracy, "
print "Naive Bayes Accuracy: ", judge_accuracy(final_df.tail(halfway_pt),'naive_bayes_classification','sentiment')
print "Decision Accuracy: ", judge_accuracy(final_df.tail(halfway_pt),'decisiontree_classification','sentiment')
print "Maxent Accuracy: ", judge_accuracy(final_df.tail(halfway_pt),'maxent_classification','sentiment')
print "Multi Model Accuracy: ", judge_accuracy(final_df.tail(halfway_pt),'multi_model_classification','sentiment')

Final Accuracy, 
Naive Bayes Accuracy:  0.463356973995
Decision Accuracy:  0.446808510638
Maxent Accuracy:  0.302600472813
Multi Model Accuracy:  0.45390070922


In [54]:
print "Final Poor Prediction Percent, "
print "Naive Bayes: ", judge_poor_prediction_accuracy(final_df.tail(halfway_pt),'naive_bayes_classification','sentiment')
print "Decision: ", judge_poor_prediction_accuracy(final_df.tail(halfway_pt),'decisiontree_classification','sentiment')
print "Maxent: ", judge_poor_prediction_accuracy(final_df.tail(halfway_pt),'maxent_classification','sentiment')
print "Multi Model: ", judge_poor_prediction_accuracy(final_df.tail(halfway_pt),'multi_model_classification','sentiment')

Final Poor Prediction Percent, 
Naive Bayes:  0.0780141843972
Decision:  0.080378250591
Maxent:  0.278959810875
Multi Model:  0.096926713948


In [55]:
print "Trump Accuracy, "
print "Naive Bayes Accuracy: ", judge_accuracy(trump_df.tail(trump_halfway_pt),'naive_bayes_classification','sentiment')
print "Decision Accuracy: ", judge_accuracy(trump_df.tail(trump_halfway_pt),'decisiontree_classification','sentiment')
print "Maxent Accuracy: ", judge_accuracy(trump_df.tail(trump_halfway_pt),'maxent_classification','sentiment')
print "Multi Model Accuracy: ", judge_accuracy(trump_df.tail(trump_halfway_pt),'multi_model_classification','sentiment')

Trump Accuracy, 
Naive Bayes Accuracy:  0.428571428571
Decision Accuracy:  0.424489795918
Maxent Accuracy:  0.285714285714
Multi Model Accuracy:  0.416326530612


In [56]:
print "Trump Poor Prediction Percent, "
print "Naive Bayes: ", judge_poor_prediction_accuracy(trump_df.tail(trump_halfway_pt),'naive_bayes_classification','sentiment')
print "Decision: ", judge_poor_prediction_accuracy(trump_df.tail(trump_halfway_pt),'decisiontree_classification','sentiment')
print "Maxent: ", judge_poor_prediction_accuracy(trump_df.tail(trump_halfway_pt),'maxent_classification','sentiment')
print "Multi Model: ", judge_poor_prediction_accuracy(trump_df.tail(trump_halfway_pt),'multi_model_classification','sentiment')

Trump Poor Prediction Percent, 
Naive Bayes:  0.118367346939
Decision:  0.0897959183673
Maxent:  0.34693877551
Multi Model:  0.118367346939


In [57]:
print "Clinton Accuracy"
print "Naive Bayes Accuracy: ", judge_accuracy(clinton_df.tail(clinton_halfway_pt),'naive_bayes_classification','sentiment')
print "Decision Accuracy: ", judge_accuracy(clinton_df.tail(clinton_halfway_pt),'decisiontree_classification','sentiment')
print "Maxent Accuracy: ", judge_accuracy(clinton_df.tail(clinton_halfway_pt),'maxent_classifier','sentiment')
print "Multi Model Accuracy: ", judge_accuracy(clinton_df.tail(clinton_halfway_pt),'multi_model_classification','sentiment')

Clinton Accuracy
Naive Bayes Accuracy:  0.553191489362
Decision Accuracy:  0.5
Maxent Accuracy:  0.5
Multi Model Accuracy:  0.531914893617


In [58]:
print "Clinton Poor Prediction Percent"
print "Naive Bayes: ", judge_poor_prediction_accuracy(clinton_df.tail(clinton_halfway_pt),'naive_bayes_classification','sentiment')
print "Decision: ", judge_poor_prediction_accuracy(clinton_df.tail(clinton_halfway_pt),'decisiontree_classification','sentiment')
print "Maxent: ", judge_poor_prediction_accuracy(clinton_df.tail(clinton_halfway_pt),'maxent_classifier','sentiment')
print "Multi Model: ", judge_poor_prediction_accuracy(clinton_df.tail(clinton_halfway_pt),'multi_model_classification','sentiment')

Clinton Poor Prediction Percent
Naive Bayes:  0.0851063829787
Decision:  0.127659574468
Maxent:  0.0957446808511
Multi Model:  0.0851063829787


In [59]:
print "Other Accuracy"
print "Naive Bayes Accuracy: ", judge_accuracy(other_df.tail(other_halfway_pt),'naive_bayes_classification','sentiment')
print "Decision Accuracy: ", judge_accuracy(other_df.tail(other_halfway_pt),'decisiontree_classification','sentiment')
print "Maxent Accuracy: ", judge_accuracy(other_df.tail(other_halfway_pt),'maxent_classifier','sentiment')
print "Multi Model Accuracy: ", judge_accuracy(other_df.tail(other_halfway_pt),'multi_model_classification','sentiment')

Other Accuracy
Naive Bayes Accuracy:  0.559523809524
Decision Accuracy:  0.511904761905
Maxent Accuracy:  0.535714285714
Multi Model Accuracy:  0.547619047619


In [60]:
print "Other Poor Prediction Percent"
print "Naive Bayes: ", judge_poor_prediction_accuracy(other_df.tail(other_halfway_pt),'naive_bayes_classification','sentiment')
print "Decision: ", judge_poor_prediction_accuracy(other_df.tail(other_halfway_pt),'decisiontree_classification','sentiment')
print "Maxent: ", judge_poor_prediction_accuracy(other_df.tail(other_halfway_pt),'maxent_classifier','sentiment')
print "Multi Model: ", judge_poor_prediction_accuracy(other_df.tail(other_halfway_pt),'multi_model_classification','sentiment')

Other Poor Prediction Percent
Naive Bayes:  0.0357142857143
Decision:  0.0595238095238
Maxent:  0.0833333333333
Multi Model:  0.047619047619


## Classifying Posts

In [73]:
def classify_posts(df,classifier,classification_column):
    df[classification_column] = df.content.apply(lambda x: classifier.classify(x))
    

def judge_accuracy(df,column_to_judge,correct_column):
    num_of_correct = 0
    for index,row in df.iterrows():
        if row[column_to_judge] == row[correct_column]:
            num_of_correct+=1
    return float(num_of_correct)/len(df.index)
    
    

In [74]:
cbs_df = pd.read_csv('../Facebook_RAW/CBSNews_facebook_statuses.csv')
cnn_df = pd.read_csv('../Facebook_RAW/cnn_facebook_statuses.csv')
fox_df = pd.read_csv('../Facebook_RAW/FoxNews_facebook_statuses.csv')
msnbc_df = pd.read_csv('../Facebook_RAW/msnbc_facebook_statuses.csv')
nyt_df = pd.read_csv('../Facebook_RAW/nytimes_facebook_statuses.csv')
usatoday_df = pd.read_csv('../Facebook_RAW/usatoday_facebook_statuses.csv')
wsj_df = pd.read_csv('../Facebook_RAW/wsj_facebook_statuses.csv')

In [75]:
cbs_df.head()

Unnamed: 0,status_id,status_message,link_name,status_type,status_link,status_published,num_likes,num_comments,num_shares
0,131459315949_10154079219290950,Management of the complex on the Upper West Si...,Three New York buildings to ditch Trump name,link,http://cbsn.ws/2f2NFK3,2016-11-15 22:18:03,625,178,102
1,131459315949_10154079127695950,Her company says they are “still making adjust...,Ivanka Trump criticized for promoting diamond ...,link,http://cbsn.ws/2fUeDRS,2016-11-15 21:33:03,399,820,187
2,131459315949_10154079007415950,"In the days following the election, the ACLU N...","ACLU, Planned Parenthood are receiving ""unprec...",video,https://www.facebook.com/CBSNews/videos/101540...,2016-11-15 20:37:47,494,151,114
3,131459315949_10154078949670950,"In this city, someone spray painted ""black liv...",More than 200 hate crimes reported since the e...,link,http://cbsn.ws/2fTvJz9,2016-11-15 20:01:06,269,391,427
4,131459315949_10154078822465950,The latest on the Trump transition team:\n- Tr...,LIVE BLOG: Trump transition to the White House,link,http://cbsn.ws/2fTYJHl,2016-11-15 19:01:02,171,322,85


In [69]:
new_cbs_df = clean_df(cbs_df)
new_cbs_df.head()

Unnamed: 0_level_0,content,sentiment,candidate_label,naive_bayes_classification,decisiontree_classification,maxent_classification,multi_model_classification
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
131459315949_10152751596575950,opinion won opponents terrible Ryan Lizza Yorker,Negative,clinton,Negative,Negative,Positive,Negative
131459315949_10152754227800950,attend Republican presidential debate criteria...,Neutral,trump,Neutral,Neutral,Positive,Neutral
131459315949_10152758428025950,Conservatives criticized expressing liberal vi...,Negative,trump,Negative,Negative,Positive,Negative
131459315949_10152766742570950,promised National Rifle Association fight,Neutral,clinton,Neutral,Neutral,Positive,Neutral
131459315949_10152791196665950,crucial issues Canadian election referendum Pr...,Positive,other,Positive,Positive,Positive,Positive


## Cleaned Up Run 

## TextBlob Accuracy Test 

In [43]:
print "Calculating other Accuracy"
other_naivebayes_classifier_accuracy = other_naivebayes_classifier.accuracy(other_test)
other_decisiontree_classifier_accuracy = other_decisiontree_classifier.accuracy(other_test)
#positivenaivebayes_classifier_accuracy = positivenaivebayes_classifier.accuracy(test)
other_maxent_classifier_accuracy = other_maxent_classifier.accuracy(other_test)
print "Done Calculating other Accuracy"

print "Calculating Clinton Accuracy"
#nltk_classifier_accuracy = nltk_classifier.accuracy(test)
clinton_naivebayes_classifier_accuracy = clinton_naivebayes_classifier.accuracy(clinton_test)
clinton_decisiontree_classifier_accuracy = clinton_decisiontree_classifier.accuracy(clinton_test)
#positivenaivebayes_classifier_accuracy = positivenaivebayes_classifier.accuracy(test)
clinton_maxent_classifier_accuracy = clinton_maxent_classifier.accuracy(clinton_test)
print "Done Calculating Clinton Accuracy"

print "Calculating Other Accuracy"
#nltk_classifier_accuracy = nltk_classifier.accuracy(test)
other_naivebayes_classifier_accuracy = other_naivebayes_classifier.accuracy(other_test)
other_decisiontree_classifier_accuracy = other_decisiontree_classifier.accuracy(other_test)
#positivenaivebayes_classifier_accuracy = positivenaivebayes_classifier.accuracy(test)
other_maxent_classifier_accuracy = other_maxent_classifier.accuracy(other_test)
print "Done Calculating Other Accuracy"

Calculating Trump Accuracy
  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.278
         Final               nan        0.278
Done Calculating Trump Accuracy
Calculating Clinton Accuracy
  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.447
             2          -0.16223        0.989
             3          -0.05147        0.989
             4          -0.02733        0.989
             5          -0.01937        0.989
             6          -0.01587        0.989
             7          -0.01400        0.989
             8          -0.01288        0.989
             9          -0.01216        0.989
            10          -0.01168        0.989
            11          -0.01137        0.989
            12          -0.01115        0.989
            13    

In [44]:
print "other Accuracy:"
print "Naive Bayes Accuracy: ", other_naivebayes_classifier_accuracy
print "Decision Tree Accuracy: ", other_decisiontree_classifier_accuracy
print "Maxent Accuracy: ", other_maxent_classifier_accuracy

print "Clinton Accuracy:"
print "Naive Bayes Accuracy: ", clinton_naivebayes_classifier_accuracy
print "Decision Tree Accuracy: ", clinton_decisiontree_classifier_accuracy
print "Maxent Accuracy: ", clinton_maxent_classifier_accuracy

print "Other Accuracy:"
print "Naive Bayes Accuracy: ", other_naivebayes_classifier_accuracy
print "Decision Tree Accuracy: ", other_decisiontree_classifier_accuracy
print "Maxent Accuracy: ", other_maxent_classifier_accuracy

Trump Accuracy:
Naive Bayes Accuracy:  0.443089430894
Decision Tree Accuracy:  0.422764227642
Maxent Accuracy:  0.284552845528
Clinton Accuracy:
Naive Bayes Accuracy:  0.553191489362
Decision Tree Accuracy:  0.489361702128
Maxent Accuracy:  0.521276595745
Other Accuracy:
Naive Bayes Accuracy:  0.535714285714
Decision Tree Accuracy:  0.511904761905
Maxent Accuracy:  0.535714285714
