In [2]:
from textblob.classifiers import NLTKClassifier,NaiveBayesClassifier,DecisionTreeClassifier,PositiveNaiveBayesClassifier,MaxEntClassifier
import pandas as pd
import re

In [3]:
def read_in_turk_csv(path):
    df = pd.read_csv(path)
    df_with_relevant_columns = df[['WorkerId','Input.statis_id','Input.content','Input.source','Answer.sentiment']]
    df_with_relevant_columns.columns = ['worker_id','status_id','content','source','sentiment']
    return df_with_relevant_columns

In [22]:
#Function that counts the number of Positive, Neutral, and Negative sentiment reviews and chooses an official review
#2 or more of review = that review
#Anything else is Inconclusive
def choose_sentiment(label):
    if label.count('Positive') >= 2:
        return 'Positive'
    elif label.count('Neutral') >= 2:
        return 'Neutral'
    elif label.count('Negative') >= 2:
        return 'Negative'
    else:
        return 'Inconclusive'
    
#Function that determines how well 
def agreement_count(label):
    if (label.count('Positive') == 3) or (label.count('Neutral') == 3) or (label.count('Negative') == 3):
        return 'All'
    elif (label.count('Positive') == 2) or (label.count('Neutral') == 2) or (label.count('Negative') == 2):
        return 'Two'
    else:
        return 'None'
    
#Classifies a post as Trump, Clinton, or Other
#Rules:
#If a review talks about only Trump, and not Clinotn, it is trump
#If a review talks about only Clinton, and not Trump, it is clinton
#Everything else is other 
def label_post (content):
   status = content.lower()
   if (('donald' in status) or ('trump' in status)) and (('hillary' in status) or ('clinton' in status)) :
      return 'other'
   elif ('donald' in status) or ('trump' in status) :
      return 'trump'
   elif ('hillary' in status) or ('clinton' in status) :
      return 'clinton'
   else:
      return 'other'

def label_df(df):
    df['candidate_label'] = df.content.apply(label_post)
    return df

def condense_and_label_rows(df):
    #Split into two, one with post content and one with sentiment review. 
    #This is necessary so post content doesn't get stacked when combining rows
    content_df = df[['status_id','content','worker_id','source']]
    other_df = df[['status_id','sentiment']]
    
    #Combines all rows with the same status_id. 
    #It only keeps the same post (content column) its finds which is ok because they are all the same.
    content_df = content_df.groupby(content_df.status_id).first()
    
    #Combines all rows with the same status id
    #Combines all sentiment reviews as well
    other_df = other_df.groupby(other_df.status_id).sum()
    
    #Determins the Turk agreement percentage
    other_df['turk_agreement'] = other_df.sentiment.apply(agreement_count)
    
    #Chooses an offical sentiment review based on the three aggregated ones
    other_df['sentiment'] = other_df.sentiment.apply(choose_sentiment)
    
    #Merged the sentiment Dataframe and post Dataframe by combining like status_ids
    #Note: It uses a right merge, so only items in the sentiment dataframe are kept.
    merged_df = content_df.merge(other_df,how='right',left_index=True,right_index=True)
    
    labeled_df = label_df(merged_df)
    
    return merged_df
    

In [5]:
#Removes any non-ascii characters. TextBlob cannot process them
def remove_non_ascii(text):
    return re.sub(r'[^\x00-\x7F]',' ', text)

#Function to remove special characters
def remove_special_characters(s):
    return ''.join(e for e in s if (e.isalpha( ) or (' ' in e)))

#Function to read in stop words
def stopword_file_to_list(file_name):
    f = open("../StopWordLists/" + file_name)
    file_text = f.read()
    return file_text.split("\r")

#Function to read in stop words
def stopword_file_to_list(file_name):
    f = open("../StopWordLists/" + file_name)
    file_text = f.read()
    return file_text.split("\r")

#Remove words in list from string
def remove_words(s,words_to_remove):
    word_list = s.split()
    resultwords  = [word for word in word_list if word.lower() not in words_to_remove]
    result = ' '.join(resultwords)
    return result

def clean_df(df,stop_word_file_path="comprehensive_stopwords.txt",column_to_clean="content"):
    #Removes all non-ascii characters
    df[column_to_clean] = df.content.apply(remove_non_ascii)
    
    #Remove all special characters
    labeled_df_without_special_chars = df
    #''.join(e for e in string if (e.isalpha()) ' ' in e)
    labeled_df_without_special_chars[column_to_clean] = labeled_df_without_special_chars.content.apply(remove_special_characters)
    labeled_df_without_special_chars.head()
    
    #Get Stop Word List
    stopword_list = stopword_file_to_list(stop_word_file_path)
    
    labeled_df_without_stopwords = labeled_df_without_special_chars
    labeled_df_without_stopwords[column_to_clean] = labeled_df_without_stopwords.apply (lambda row: remove_words(row[column_to_clean],stopword_list),axis=1)
    #labeled_df_without_stopwords.head()
    
    #Remove candidate names
    labeled_df_without_candidate_name = labeled_df_without_stopwords
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Hillary',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('hillary',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Clinton',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('clinton',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Donald',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('donald',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('Trump',''))
    labeled_df_without_candidate_name[column_to_clean] = labeled_df_without_stopwords.content.apply(lambda row: row.replace('trump',''))
    #labeled_df_without_candidate_name.head()
    
    return labeled_df_without_candidate_name

In [19]:
def read_clean_and_label(path):
    turk_df = read_in_turk_csv("../Turk/6source_results_filtered.csv")
    turk_df = condense_and_label_rows(turk_df)
    turk_df = clean_df(turk_df)
    return turk_df

In [21]:
#Grabs specific news source
def get_news_source_df(df,news_source):
    df = df[df.source == news_source]
    return df

In [36]:
def agreement_percentage(df):
    all_number = df['turk_agreement'].value_counts()['All']
    two_number = df['turk_agreement'].value_counts()['Two']
    none_number = df['turk_agreement'].value_counts()['None']
    
    total_rows = all_number+two_number+none_number
    all_percentage = float(all_number)/total_rows
    two_percentage = float(two_number)/total_rows
    none_percentage = float(none_number)/total_rows
    
    return all_percentage,two_percentage,none_percentage

In [65]:
def train_classifiers(df):
    trump_df = df[df.candidate_label == 'trump']
    clinton_df = df[df.candidate_label == 'clinton']
    other_df = df[df.candidate_label == 'other']
    
    trump_train = list(zip(trump_df.content,trump_df.sentiment))
    clinton_train = list(zip(clinton_df.content,clinton_df.sentiment))
    other_train = list(zip(other_df.content,other_df.sentiment))
    
    trump_cl = NaiveBayesClassifier(trump_train)
    clinton_cl = NaiveBayesClassifier(clinton_train)
    other_cl = NaiveBayesClassifier(other_train)
    
    return trump_cl,clinton_cl,other_cl

In [75]:
df = read_clean_and_label("../Turk/6source_results_filtered.csv")
df.head()

Unnamed: 0_level_0,content,worker_id,source,sentiment,turk_agreement,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
131459315949_10152776690010950,WATCH kid sad running president cant thinks ...,A34M93NJC830DP,cbs,Neutral,Two,clinton
131459315949_10152912366555950,snubs Obama trade deal,A2XFO0X6RCS98M,cbs,Negative,Two,clinton
131459315949_10152983500645950,Greeces bailout stump Gov Walker race DNA fac...,A2XFO0X6RCS98M,cbs,Neutral,All,trump
131459315949_10153008113730950,Three people die shooting rampage Lafayette La...,A9KPCMO1J1LQF,cbs,Negative,Two,clinton
131459315949_10153013600775950,announces climate change renewable energy agenda,A2HVG934DIDSXQ,cbs,Neutral,Two,clinton


In [76]:
all_percentage,two_percentage,none_percentage = agreement_percentage(df)

print all_percentage,two_percentage,none_percentage

0.396 0.539333333333 0.0646666666667


In [90]:
cbs_df = get_news_source_df(df,'wsj')
cbs_df = cbs_df[cbs_df.sentiment != 'Inconclusive']
cbs_df

Unnamed: 0_level_0,content,worker_id,source,sentiment,turk_agreement,candidate_label
status_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8304333127_10153399376053128,FIFA Germany Anguilla apart teams vote ultrade...,A1HSHK6YKJUSL1,wsj,Neutral,Two,other
8304333127_10153433564933128,Sepp Blatter struck defiant tone tumultuous we...,A2XFO0X6RCS98M,wsj,Negative,Two,other
8304333127_10153465651693128,will hard beat Charts,A2XFO0X6RCS98M,wsj,Positive,Two,clinton
8304333127_10153468146798128,GOP candidates referred rally Saturday born ...,A1GKTC682SWY23,wsj,Neutral,Two,clinton
8304333127_10153476888968128,presidential race brings swarm logos candidate...,AEVU71Z2FDTUX,wsj,Neutral,All,clinton
8304333127_10153545455133128,Myanmar poised hold fairest general election d...,A1T79J0XQXDDGC,wsj,Positive,Two,other
8304333127_10153561104483128,Todays top stories Gerard Baker housing market...,A1HSHK6YKJUSL1,wsj,Neutral,All,clinton
8304333127_10153566042733128,Update QA ended participating real chances J...,A2XFO0X6RCS98M,wsj,Neutral,Two,trump
8304333127_10153570678333128,internal government review Secretary State s ...,A314XJY8V1YL12,wsj,Negative,All,clinton
8304333127_10153593502743128,Democrats alternative Joe Biden viable worse...,A2MBRCF27IRWPA,wsj,Neutral,Two,clinton


In [91]:
trump_cl,clinton_cl,other_cl = train_classifiers(cbs_df)

In [93]:
raw_df = pd.read_csv('../Facebook_RAW/wsj_facebook_statuses.csv')
raw_df['content'] = raw_df['status_message']
raw_df = label_df(raw_df)
raw_df = clean_df(raw_df)
raw_df

Unnamed: 0,status_id,status_message,link_name,status_type,status_link,status_published,num_likes,num_comments,num_shares,content,candidate_label
0,8304333127_10154947294848128,Democrats’ poor showing in last week’s electio...,House Democrats Pressure Nancy Pelosi After Tr...,link,http://on.wsj.com/2eYQBUI,2016-11-15 23:45:10,283,115,48,Democrats poor showing week elections begun sh...,other
1,8304333127_10154946953528128,"In a letter to the president-elect, Sen. Eliza...",Elizabeth Warren Sends Donald Trump Letter Cri...,link,http://on.wsj.com/2fDXdK7,2016-11-15 22:20:10,1175,491,208,letter presidentelect Sen Elizabeth Warren thr...,trump
2,8304333127_10154946552533128,House Republicans sent a letter on Tuesday to ...,House Republicans Ask Agencies to Freeze All R...,link,http://on.wsj.com/2eYbGP9,2016-11-15 20:30:07,1073,598,667,House Republicans letter Tuesday government ag...,trump
3,8304333127_10154946080113128,The American Civil Liberties Union says Donald...,ACLU Says It’s Gotten a Deluge of Donations Si...,link,http://on.wsj.com/2eXXuFP,2016-11-15 18:30:21,634,73,36,American Civil Liberties Union s presidential...,trump
4,8304333127_10154945940438128,Ukraine’s President Petro Poroshenko appealed ...,Ukraine President Asks Trump to Counter ‘Russi...,link,http://on.wsj.com/2eXPmFB,2016-11-15 17:30:10,227,54,49,Ukraine President Petro Poroshenko appealed Pr...,trump
5,8304333127_10154945769573128,"""This was a conscious effort by a nation state...",NSA Chief on WikiLeaks,video,https://www.facebook.com/wsj/videos/1015494576...,2016-11-15 16:48:10,219,82,296,conscious effort nation state attempt achieve ...,other
6,8304333127_10154945524868128,House Republicans voted Tuesday to nominate Re...,House Republicans Nominate Paul Ryan for Speaker,link,http://on.wsj.com/2fuZqtp,2016-11-15 15:10:06,363,94,41,House Republicans voted Tuesday nominate Rep P...,other
7,8304333127_10154945361663128,Ford Motor Co. Chief Executive Mark Fields war...,Ford CEO Looking Forward to Working With Presi...,link,http://on.wsj.com/2fCPn37,2016-11-15 14:10:06,290,118,55,Ford Motor Chief Executive Mark Fields warned ...,trump
8,8304333127_10154945246768128,Former House Intelligence Committee Chairman M...,Intelligence Expert Mike Rogers Leaves Trump T...,link,http://on.wsj.com/2eXd2d7,2016-11-15 13:30:16,252,96,109,House Intelligence Committee Chairman Mike Rog...,trump
9,8304333127_10154945050053128,Donald Trump praised the U.S. electoral colleg...,"Behind in Popular Vote, Donald Trump Praises E...",link,http://on.wsj.com/2gdQF8e,2016-11-15 12:43:51,445,381,82,praised electoral college system delivered v...,other


In [94]:
def classify_posts(df,trump_classifier,clinton_classifier,other_classifier):
    trump_df = df[df.candidate_label == 'trump']
    clinton_df = df[df.candidate_label == 'clinton']
    other_df = df[df.candidate_label == 'other']
    
    trump_df['classification'] = trump_df.content.apply(lambda x: trump_classifier.classify(x))
    clinton_df['classification'] = clinton_df.content.apply(lambda x: clinton_classifier.classify(x))
    other_df['classification'] = other_df.content.apply(lambda x: other_classifier.classify(x))
    
    return pd.concat([trump_df,clinton_df,other_df])

In [95]:
classified_df = classify_posts(raw_df,trump_cl,clinton_cl,other_cl)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [96]:
classified_df

Unnamed: 0,status_id,status_message,link_name,status_type,status_link,status_published,num_likes,num_comments,num_shares,content,candidate_label,classification
1,8304333127_10154946953528128,"In a letter to the president-elect, Sen. Eliza...",Elizabeth Warren Sends Donald Trump Letter Cri...,link,http://on.wsj.com/2fDXdK7,2016-11-15 22:20:10,1175,491,208,letter presidentelect Sen Elizabeth Warren thr...,trump,Neutral
2,8304333127_10154946552533128,House Republicans sent a letter on Tuesday to ...,House Republicans Ask Agencies to Freeze All R...,link,http://on.wsj.com/2eYbGP9,2016-11-15 20:30:07,1073,598,667,House Republicans letter Tuesday government ag...,trump,Neutral
3,8304333127_10154946080113128,The American Civil Liberties Union says Donald...,ACLU Says It’s Gotten a Deluge of Donations Si...,link,http://on.wsj.com/2eXXuFP,2016-11-15 18:30:21,634,73,36,American Civil Liberties Union s presidential...,trump,Neutral
4,8304333127_10154945940438128,Ukraine’s President Petro Poroshenko appealed ...,Ukraine President Asks Trump to Counter ‘Russi...,link,http://on.wsj.com/2eXPmFB,2016-11-15 17:30:10,227,54,49,Ukraine President Petro Poroshenko appealed Pr...,trump,Neutral
7,8304333127_10154945361663128,Ford Motor Co. Chief Executive Mark Fields war...,Ford CEO Looking Forward to Working With Presi...,link,http://on.wsj.com/2fCPn37,2016-11-15 14:10:06,290,118,55,Ford Motor Chief Executive Mark Fields warned ...,trump,Negative
8,8304333127_10154945246768128,Former House Intelligence Committee Chairman M...,Intelligence Expert Mike Rogers Leaves Trump T...,link,http://on.wsj.com/2eXd2d7,2016-11-15 13:30:16,252,96,109,House Intelligence Committee Chairman Mike Rog...,trump,Neutral
11,8304333127_10154944687148128,"""People didn't vote for Donald Trump so that h...",,video,https://www.facebook.com/wsj/videos/1015494468...,2016-11-15 11:44:39,1904,925,870,People didnt vote bring white supremacist Wh...,trump,Neutral
12,8304333127_10154944414333128,America’s European allies are insisting that t...,"NATO, Europe Urge No Compromise With Russia Ov...",link,http://on.wsj.com/2fC9AGi,2016-11-15 10:20:08,182,40,32,America European allies insisting West comprom...,trump,Neutral
13,8304333127_10154944332373128,Donald Trump will enter the White House with m...,Donald Trump’s Complex Businesses Bring Potent...,link,http://on.wsj.com/2fubTOd,2016-11-15 10:00:09,245,131,111,will enter White House potential conflicts i...,trump,Negative
14,8304333127_10154944212133128,"Kellyanne Conway, one of President-elect Donal...",Donald Trump Adviser Kellyanne Conway Enjoys a...,link,http://on.wsj.com/2fSzchl,2016-11-15 09:20:08,628,111,72,Kellyanne Conway Presidentelect top advisers...,trump,Neutral


In [97]:
classified_df.to_csv('../facebook_raw_classified/classified_wsj_posts.csv')