# Ideology Dictionary Creator

### Scraping political subreddits for key terms signifying ideology

In [1]:
import praw
import pandas as pd
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# local library
from preproc import *

### Parameters

In [51]:
#Assign subreddits
moderate_left = 'democrat'
moderate_right = 'republican'
far_left = 'AntifascistsofReddit'
far_right = 'trump'

#Ideologiy dictionary parameters
donotwant=['CC','CD','DT','EX','IN','LS','MD','PDT','SYM','TO','UH','PRP','PRP$'] #parts of speech we don't want included
size = 500 #size of dictionary

### Subreddit Scrape

In [3]:
reddit = praw.Reddit(client_id='5-gvrruIFLx5bQ', client_secret='kQSwIc9kLS-GWwLnkEs4jRTid0g', user_agent='political sentiment')

In [4]:
mod_left = []
ml_subreddit = reddit.subreddit(moderate_left)
for post in ml_subreddit.top(limit=100):
    mod_left.append([post.id, post.title, post.score, post.subreddit, post.url, post.num_comments])
ml_posts = pd.DataFrame(mod_left,columns=['post_id','title', 'score', 'subreddit', 'url', 'num_comments'])

In [5]:
mod_right = []
mr_subreddit = reddit.subreddit(moderate_right)
for post in mr_subreddit.top(limit=100):
    mod_right.append([post.id, post.title, post.score, post.subreddit, post.url, post.num_comments])
mr_posts = pd.DataFrame(mod_right,columns=['post_id','title', 'score', 'subreddit', 'url', 'num_comments'])

In [52]:
far_left = []
fl_subreddit = reddit.subreddit(far_left)
for post in fl_subreddit.top(limit=100):
    far_left.append([post.id, post.title, post.score, post.subreddit, post.url, post.num_comments])
fl_posts = pd.DataFrame(far_left,columns=['post_id','title', 'score', 'subreddit', 'url', 'num_comments'])

In [123]:
far_right = []
fr_subreddit = reddit.subreddit(far_right)
for post in fr_subreddit.top(limit=100):
    far_right.append([post.id, post.title, post.score, post.subreddit, post.url, post.num_comments])
fr_posts = pd.DataFrame(far_right,columns=['post_id','title', 'score', 'subreddit', 'url', 'num_comments'])

In [6]:
def get_comments(df):
    commentlist = pd.DataFrame(columns = ['post_id', 'comment'])
    ids = df['post_id']
    for i in ids:
        submission = reddit.submission(id = i)
        submission.comments.replace_more(limit=0)
        for comment in submission.comments.list():
            commentlist = commentlist.append({'post_id': i, 'comment': comment.body}, ignore_index=True)
    return commentlist

In [7]:
ml_comments = get_comments(ml_posts)

In [8]:
ml_comments.to_csv('ml_comments2.csv')

In [9]:
mr_comments = get_comments(mr_posts)

In [10]:
mr_comments.to_csv('mr_comments2.csv')

In [53]:
fl_comments = get_comments(fl_posts)

In [54]:
fl_comments.to_csv('fl_comments2.csv')

In [124]:
fr_comments = get_comments(fr_posts)

In [125]:
fr_comments.to_csv('fr_comments.csv')

### Read in comments from files once created

In [55]:
ml_comments = pd.read_csv('ml_comments2.csv')
mr_comments = pd.read_csv('mr_comments2.csv')
fl_comments = pd.read_csv('fl_comments2.csv')
fr_comments = pd.read_csv('fr_comments.csv')

In [56]:
fl_comments

Unnamed: 0.1,Unnamed: 0,post_id,comment
0,0,gv53cu,Hes heading straight in the direction of dicta...
1,1,gv53cu,Antifa has killed fewer people during the last...
2,2,gv53cu,We need to clarify to people: \n\n1. Antifa is...
3,3,gv53cu,What's disturbing is that he was actually enfo...
4,4,gv53cu,Trump just keeps marching along to where he wa...
...,...,...,...
15561,15561,gwtfq5,"Wait, is that right wing Antifa? Last week I w..."
15562,15562,gwtfq5,Yo whattt iron front is a movement again
15563,15563,gwtfq5,It’s anti-authoritarian unity antifa like how ...
15564,15564,gwtfq5,Iron front was the SDP version of antifa


### TFIDF Analysis

In [21]:
#Create vectorizers
vectorizer_ml = TfidfVectorizer(stop_words='english')
vectorizer_mr = TfidfVectorizer(stop_words='english')
vectorizer_fl = TfidfVectorizer(stop_words='english')
vectorizer_fr = TfidfVectorizer(stop_words='english')

In [22]:
tfidf_ml = vectorizer_ml.fit_transform(ml_comments.comment)
tokens_ml = vectorizer_ml.get_feature_names()

In [23]:
tfidf_mr = vectorizer_mr.fit_transform(mr_comments.comment)
tokens_mr = vectorizer_mr.get_feature_names()

In [57]:
tfidf_fl = vectorizer_fl.fit_transform(fl_comments.comment)
tokens_fl = vectorizer_fl.get_feature_names()

In [25]:
tfidf_fr = vectorizer_fr.fit_transform(fr_comments.comment)
tokens_fr = vectorizer_fr.get_feature_names()

### Determine keywords for each ideology

In [31]:
def create_dict(tfidf, index, columns, donotwant=donotwant, size=size):
    tfidfvect = pd.DataFrame(data = tfidf.toarray(),index = index, columns = columns)
    vals = tfidfvect.sum()
    
    vals=pd.DataFrame(vals).reset_index().rename(columns={"index": "term_str", 0: "tfidf"})
    
    vals['tag'] = nltk.pos_tag(vals.term_str)
    vals['pos'] = vals.tag.apply(lambda x: x[1])
    vals = vals.drop(['tag'], axis=1)
    vals = vals[~vals.pos.isin(donotwant)]

    
    vals=vals.sort_values(by=['tfidf'], ascending=False).iloc[:size]
    
    
    return vals

In [32]:
ml_keywords = create_dict(tfidf_ml, ml_comments.comment, tokens_ml)

In [33]:
mr_keywords = create_dict(tfidf_mr, mr_comments.comment, tokens_mr)

In [58]:
fl_keywords = create_dict(tfidf_fl, fl_comments.comment, tokens_fl)

In [35]:
fr_keywords = create_dict(tfidf_fr, fr_comments.comment, tokens_fr)

In [59]:
allwords = ml_keywords['term_str'].append(mr_keywords['term_str']).append(fl_keywords['term_str']).append(fr_keywords['term_str'])

In [60]:
ideology_dict = pd.DataFrame(allwords).drop_duplicates()

In [61]:
ideology_dict['far_left'] = np.where(ideology_dict['term_str'].isin(fl_keywords.term_str), 1, 0)
ideology_dict['mod_left'] = np.where(ideology_dict['term_str'].isin(ml_keywords.term_str), 1, 0)
ideology_dict['mod_right'] = np.where(ideology_dict['term_str'].isin(mr_keywords.term_str), 1, 0)
ideology_dict['far_right'] = np.where(ideology_dict['term_str'].isin(fr_keywords.term_str), 1, 0)
ideology_dict['sum'] = ideology_dict.sum(axis=1)
ideology_dict = ideology_dict[(ideology_dict['sum'] < 4)] #words that are returned in all 4 ideologies are no help

In [62]:
ideology_dict

Unnamed: 0,term_str,far_left,mod_left,mod_right,far_right,sum
1083,democrat,0,1,1,1,3
3150,republican,0,1,1,1,3
1199,dnc,0,1,0,0,1
1084,democratic,0,1,1,1,3
1817,healthcare,0,1,0,0,1
...,...,...,...,...,...,...
18928,review,0,0,0,1,1
24689,yea,0,0,0,1,1
4881,clarification,0,0,0,1,1
16503,pedophile,0,0,0,1,1


In [63]:
ideology_dict.to_csv('ideology_dict.csv')

### Analyze speeches for ideologies

In [64]:
ideology_dict = pd.read_csv('ideology_dict.csv', usecols=['term_str','mod_left', 'mod_right','far_left','far_right']).set_index('term_str')

In [45]:
emo = 'polarity'

In [46]:
#pd.set_option('display.max_rows', 211)
ideology_dict.query('far_left == 0 & mod_left == 0 & mod_right == 1 & far_right == 1')

Unnamed: 0_level_0,far_left,mod_left,mod_right,far_right
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
blm,0,0,1,1
peaceful,0,0,1,1
crime,0,0,1,1
shot,0,0,1,1
antifa,0,0,1,1
facts,0,0,1,1
sides,0,0,1,1
meme,0,0,1,1
conservatives,0,0,1,1
evidence,0,0,1,1
