In [1]:
from textblob.classifiers import NLTKClassifier,NaiveBayesClassifier,DecisionTreeClassifier,PositiveNaiveBayesClassifier,MaxEntClassifier
import pandas as pd
import re

In [2]:
def read_in_turk_csv(path):
    df = pd.read_csv(path)
    df_with_relevant_columns = df[['WorkerId','Input.statis_id','Input.content','Input.source','Answer.sentiment']]
    df_with_relevant_columns.columns = ['worker_id','status_id','content','source','sentiment']
    return df_with_relevant_columns

In [3]:
#Function that counts the number of Positive, Neutral, and Negative sentiment reviews and chooses an official review
#2 or more of review = that review
#Anything else is Inconclusive
def choose_sentiment(label):
    if label.count('Positive') >= 2:
        return 'Positive'
    elif label.count('Neutral') >= 2:
        return 'Neutral'
    elif label.count('Negative') >= 2:
        return 'Negative'
    else:
        return 'Inconclusive'
    
#Function that determines how well 
def agreement_count(label):
    if (label.count('Positive') == 3) or (label.count('Neutral') == 3) or (label.count('Negative') == 3):
        return 'All'
    elif (label.count('Positive') == 2) or (label.count('Neutral') == 2) or (label.count('Negative') == 2):
        return 'Two'
    else:
        return 'None'
    
#Classifies a post as Trump, Clinton, or Other
#Rules:
#If a review talks about only Trump, and not Clinotn, it is trump
#If a review talks about only Clinton, and not Trump, it is clinton
#Everything else is other 
def label_post (content):
   status = content.lower()
   if (('donald' in status) or ('trump' in status)) and (('hillary' in status) or ('clinton' in status)) :
      return 'other'
   elif ('donald' in status) or ('trump' in status) :
      return 'trump'
   elif ('hillary' in status) or ('clinton' in status) :
      return 'clinton'
   else:
      return 'other'

def label_df(df):
    df['candidate_label'] = df.content.apply(label_post)
    return df

def condense_and_label_rows(df):
    #Split into two, one with post content and one with sentiment review. 
    #This is necessary so post content doesn't get stacked when combining rows
    content_df = df[['status_id','content','worker_id','source']]
    other_df = df[['status_id','sentiment']]
    
    #Combines all rows with the same status_id. 
    #It only keeps the same post (content column) its finds which is ok because they are all the same.
    content_df = content_df.groupby(content_df.status_id).first()
    
    #Combines all rows with the same status id
    #Combines all sentiment reviews as well
    other_df = other_df.groupby(other_df.status_id).sum()
    
    #Determins the Turk agreement percentage
    other_df['turk_agreement'] = other_df.sentiment.apply(agreement_count)
    
    #Chooses an offical sentiment review based on the three aggregated ones
    other_df['sentiment'] = other_df.sentiment.apply(choose_sentiment)
    
    #Merged the sentiment Dataframe and post Dataframe by combining like status_ids
    #Note: It uses a right merge, so only items in the sentiment dataframe are kept.
    merged_df = content_df.merge(other_df,how='right',left_index=True,right_index=True)
    
    labeled_df = label_df(merged_df)
    
    return merged_df
    

In [4]:
#Removes any non-ascii characters. TextBlob cannot process them
def remove_non_ascii(text):
    return re.sub(r'[^\x00-\x7F]',' ', text)

#Function to remove special characters
def remove_special_characters(s):
    return ''.join(e for e in s if (e.isalpha( ) or (' ' in e)))

#Function to read in stop words
def stopword_file_to_list(file_name):
    f = open("../StopWordLists/" + file_name)
    file_text = f.read()
    return file_text.split("\r")

#Function to read in stop words
def stopword_file_to_list(file_name):
    f = open("../StopWordLists/" + file_name)
    file_text = f.read()
    return file_text.split("\r")

#Remove words in list from string
def remove_words(s,words_to_remove):
    word_list = s.split()
    resultwords  = [word for word in word_list if word.lower() not in words_to_remove]
    result = ' '.join(resultwords)
    return result

def clean_df(df,stop_word_file_path="comprehensive_stopwords.txt",column_to_clean="content"):
    #Removes all non-ascii characters
    df[column_to_clean] = df.content.apply(remove_non_ascii)
    
    #Remove all special characters
    labeled_df_without_special_chars = df
    #''.join(e for e in string if (e.isalpha()) ' ' in e)
    labeled_df_without_special_chars[column_to_clean] = labeled_df_without_special_chars.content.apply(remove_special_characters)
    labeled_df_without_special_chars.head()
    
    #Get Stop Word List
    stopword_list = stopword_file_to_list(stop_word_file_path)
    
    labeled_df_without_stopwords = labeled_df_without_special_chars
    labeled_df_without_stopwords[column_to_clean] = labeled_df_without_stopwords.apply (lambda row: remove_words(row[column_to_clean],stopword_list),axis=1)
    #labeled_df_without_stopwords.head()
    
    #Remove candidate names
    labeled_df_without_candidate_name = labeled_df_without_stopwords
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Hillary',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('hillary',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Clinton',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('clinton',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Donald',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('donald',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Trump',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('trump',''))
    #labeled_df_without_candidate_name.head()
    
    return labeled_df_without_candidate_name

In [5]:
def read_clean_and_label(path):
    turk_df = read_in_turk_csv("../Turk/6source_results_filtered.csv")
    turk_df = condense_and_label_rows(turk_df)
    turk_df = clean_df(turk_df)
    return turk_df

In [6]:
#Grabs specific news source
def get_news_source_df(df,news_source):
    df = df[df.source == news_source]
    return df

In [48]:
def agreement_number(df):
    all_number = df['turk_agreement'].value_counts()['All']
    two_number = df['turk_agreement'].value_counts()['Two']
    none_number = df['turk_agreement'].value_counts()['None']
    
    total_rows = all_number+two_number+none_number
    all_percentage = all_number
    two_percentage = two_number
    none_percentage = none_number
    
    return all_number,two_number,none_number

In [8]:
def train_classifiers(df):
    trump_df = df[df.candidate_label == 'trump']
    clinton_df = df[df.candidate_label == 'clinton']
    other_df = df[df.candidate_label == 'other']
    
    trump_train = list(zip(trump_df.content,trump_df.sentiment))
    clinton_train = list(zip(clinton_df.content,clinton_df.sentiment))
    other_train = list(zip(other_df.content,other_df.sentiment))
    
    trump_cl = NaiveBayesClassifier(trump_train)
    clinton_cl = NaiveBayesClassifier(clinton_train)
    other_cl = NaiveBayesClassifier(other_train)
    
    return trump_cl,clinton_cl,other_cl

In [50]:
df = read_clean_and_label("../Turk/6source_results_filtered.csv")
df.head()

Unnamed: 0_level_0,content,worker_id,source,sentiment,turk_agreement,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
131459315949_10152776690010950,WATCH kid sad running president cant thinks ...,A34M93NJC830DP,cbs,Neutral,Two,clinton
131459315949_10152912366555950,snubs Obama trade deal,A2XFO0X6RCS98M,cbs,Negative,Two,clinton
131459315949_10152983500645950,Greeces bailout stump Gov Walker race DNA fac...,A2XFO0X6RCS98M,cbs,Neutral,All,trump
131459315949_10153008113730950,Three people die shooting rampage Lafayette La...,A9KPCMO1J1LQF,cbs,Negative,Two,clinton
131459315949_10153013600775950,announces climate change renewable energy agenda,A2HVG934DIDSXQ,cbs,Neutral,Two,clinton


In [51]:
all_number,two_number,none_number = agreement_number(df)

print all_number,two_number,none_number

594 809 97


In [41]:
cbs_df = get_news_source_df(df,'fox')
cbs_df = cbs_df[cbs_df.sentiment != 'Inconclusive']
cbs_df

Unnamed: 0_level_0,content,worker_id,source,sentiment,turk_agreement,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
15704546335_10153334647066336,Accusing dividing Americans political panderin...,AEVU71Z2FDTUX,fox,Neutral,Two,clinton
15704546335_10153385284896336,jobs countries making product money fact jobsL...,AGNJKSH5LFKXZ,fox,Negative,Two,trump
15704546335_10153408332966336,Breaking News NBC business relationship dero...,A2JPOXYZM5AJZZ,fox,Negative,All,trump
15704546335_10153516587241336,negotiating deal Ive piece paper thrown face ...,AEVU71Z2FDTUX,fox,Negative,Two,trump
15704546335_10153565420886336,dont anybodys money dont money pointed Jeb Bu...,A1T79J0XQXDDGC,fox,Neutral,Two,trump
15704546335_10153586397196336,Heidis Heidi Klum posted Instagram video sup...,A1G187YBG0DVMQ,fox,Neutral,Two,trump
15704546335_10153587865001336,start process country country going hell told...,A3QZMGTVA4VO44,fox,Neutral,Two,trump
15704546335_10153590617041336,court jury duty fill form state professionI ...,A1VWJ6LH1E3GLN,fox,Neutral,Two,trump
15704546335_10153593169941336,graced cover newest TIME magazine headline ...,AXLHKLLODD9C,fox,Positive,Two,trump
15704546335_10153607579941336,economy closely tied Chinas dragging,AURYD2FH3FUOQ,fox,Neutral,Two,trump


In [42]:
trump_cl,clinton_cl,other_cl = train_classifiers(cbs_df)

In [43]:
raw_df = pd.read_csv('../Facebook_RAW/FoxNews_facebook_statuses.csv')
raw_df['content'] = raw_df['status_message']
raw_df = label_df(raw_df)
raw_df = clean_df(raw_df)
raw_df

Unnamed: 0,status_id,status_message,link_name,status_type,status_link,status_published,num_likes,num_comments,num_shares,content,candidate_label
0,15704546335_10154797081866336,"""We, your fellow residents of Century Hall, re...",Report: Students Demand Classmates Remove Trum...,link,http://insider.foxnews.com/2016/11/15/report-s...,2016-11-15 21:30:03,9372,19996,11474,fellow residents Century Hall request remove f...,other
1,15704546335_10154797323166336,"""These mayors - all uber-liberals - believe th...",,video,https://www.facebook.com/FoxNews/videos/101547...,2016-11-15 20:17:07,10435,1391,2743,mayors uberliberals will harvest votes showing...,trump
2,15704546335_10154796532151336,Tampa Bay Buccaneers wide receiver Mike Evans ...,Bucs Wide Receiver Apologizes for Protesting T...,link,http://insider.foxnews.com/2016/11/15/mike-eva...,2016-11-15 15:35:25,7865,3697,842,Tampa Bay Buccaneers wide receiver Mike Evans ...,trump
3,15704546335_10154796679621336,President-elect Donald J. Trump's granddaughte...,Watch President-Elect Donald Trump’s Granddaug...,link,http://insider.foxnews.com/2016/11/15/video-tr...,2016-11-15 15:00:03,34270,1029,2551,Presidentelect s granddaughter impressive,trump
4,15704546335_10154796562271336,JUST IN: Speaker Paul Ryan unanimously nominat...,Timeline Photos,photo,https://www.facebook.com/FoxNews/photos/a.1840...,2016-11-15 14:09:35,15623,2618,816,Speaker Paul Ryan unanimously nominated fellow...,other
5,15704546335_10154796329281336,"""We have been listening to what he has said in...",Mexican City Rescinds Decree Proclaiming Donal...,link,http://latino.foxnews.com/latino/news/2016/11/...,2016-11-15 13:06:23,8554,1134,1098,listening days rhetoric radical Mexican Mayor ...,trump
6,15704546335_10154796288396336,"""There are going to be a lot more things made ...",,video,https://www.facebook.com/FoxNews/videos/101547...,2016-11-15 12:29:51,24720,1006,4458,going lot things USACongressman Steve Scalise ...,trump
7,15704546335_10154796140646336,Breaking News: Dr. Ben Carson has turned down ...,,video,https://www.facebook.com/FoxNews/videos/101547...,2016-11-15 11:26:52,4280,3701,1808,Breaking News Dr Ben Carson turned considerati...,trump
8,15704546335_10154796119516336,“I think he’s going to be a very successful pr...,,video,https://www.facebook.com/FoxNews/videos/101547...,2016-11-15 11:16:34,26620,2615,3837,going successful president Moments ago House S...,trump
9,15704546335_10154795994621336,Laura Ingraham said on Tucker Carlson Tonight ...,Here's the Latest Buzz on Pres-Elect Trump's P...,link,http://insider.foxnews.com/2016/11/15/latest-b...,2016-11-15 10:24:50,6452,911,356,Laura Ingraham Tucker Carlson Tonight seriousl...,trump


In [44]:
def classify_posts(df,trump_classifier,clinton_classifier,other_classifier):
    trump_df = df[df.candidate_label == 'trump']
    clinton_df = df[df.candidate_label == 'clinton']
    other_df = df[df.candidate_label == 'other']
    
    trump_df['classification'] = trump_df.content.apply(lambda x: trump_classifier.classify(x))
    clinton_df['classification'] = clinton_df.content.apply(lambda x: clinton_classifier.classify(x))
    other_df['classification'] = other_df.content.apply(lambda x: other_classifier.classify(x))
    
    return pd.concat([trump_df,clinton_df,other_df])

In [45]:
classified_df = classify_posts(raw_df,trump_cl,clinton_cl,other_cl)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [46]:
classified_df

Unnamed: 0,status_id,status_message,link_name,status_type,status_link,status_published,num_likes,num_comments,num_shares,content,candidate_label,classification
1,15704546335_10154797323166336,"""These mayors - all uber-liberals - believe th...",,video,https://www.facebook.com/FoxNews/videos/101547...,2016-11-15 20:17:07,10435,1391,2743,mayors uberliberals will harvest votes showing...,trump,Neutral
2,15704546335_10154796532151336,Tampa Bay Buccaneers wide receiver Mike Evans ...,Bucs Wide Receiver Apologizes for Protesting T...,link,http://insider.foxnews.com/2016/11/15/mike-eva...,2016-11-15 15:35:25,7865,3697,842,Tampa Bay Buccaneers wide receiver Mike Evans ...,trump,Neutral
3,15704546335_10154796679621336,President-elect Donald J. Trump's granddaughte...,Watch President-Elect Donald Trump’s Granddaug...,link,http://insider.foxnews.com/2016/11/15/video-tr...,2016-11-15 15:00:03,34270,1029,2551,Presidentelect s granddaughter impressive,trump,Neutral
5,15704546335_10154796329281336,"""We have been listening to what he has said in...",Mexican City Rescinds Decree Proclaiming Donal...,link,http://latino.foxnews.com/latino/news/2016/11/...,2016-11-15 13:06:23,8554,1134,1098,listening days rhetoric radical Mexican Mayor ...,trump,Negative
6,15704546335_10154796288396336,"""There are going to be a lot more things made ...",,video,https://www.facebook.com/FoxNews/videos/101547...,2016-11-15 12:29:51,24720,1006,4458,going lot things USACongressman Steve Scalise ...,trump,Neutral
7,15704546335_10154796140646336,Breaking News: Dr. Ben Carson has turned down ...,,video,https://www.facebook.com/FoxNews/videos/101547...,2016-11-15 11:26:52,4280,3701,1808,Breaking News Dr Ben Carson turned considerati...,trump,Neutral
8,15704546335_10154796119516336,“I think he’s going to be a very successful pr...,,video,https://www.facebook.com/FoxNews/videos/101547...,2016-11-15 11:16:34,26620,2615,3837,going successful president Moments ago House S...,trump,Neutral
9,15704546335_10154795994621336,Laura Ingraham said on Tucker Carlson Tonight ...,Here's the Latest Buzz on Pres-Elect Trump's P...,link,http://insider.foxnews.com/2016/11/15/latest-b...,2016-11-15 10:24:50,6452,911,356,Laura Ingraham Tucker Carlson Tonight seriousl...,trump,Neutral
10,15704546335_10154794322026336,"""President Obama surrounded himself with peopl...",Cain: Obama Is Only Being Told What He Wants t...,link,http://insider.foxnews.com/2016/11/14/herman-c...,2016-11-15 07:37:15,27066,859,2535,President Obama surrounded people ideologues t...,trump,Neutral
12,15704546335_10154794405636336,President Obama commented on Donald J. Trump's...,Timeline Photos,photo,https://www.facebook.com/FoxNews/photos/a.1840...,2016-11-15 01:00:00,44135,1687,10803,President Obama commented s election victory ...,trump,Neutral


In [47]:
classified_df.to_csv('../facebook_raw_classified/classified_fox_posts.csv')