In [12]:
import pickle
import numpy as np
import pandas as pd
from collections import Counter

In [7]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as loading finishes (or fails) instead of being left to the garbage
    collector.

    NOTE(security): ``pickle.load`` can execute arbitrary code embedded in
    the file -- only use this on pickles produced by a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


pro_sentences_fuzzy_matched = _load_pickle('fuzzy_matched_pro_sents.pkl')
anti_sentences_fuzzy_matched = _load_pickle('fuzzy_matched_anti_sents.pkl')

In [8]:
# Preview the first five entries of anti_sentences_fuzzy_matched; the output
# below shows each entry as a list of token strings.
anti_sentences_fuzzy_matched[:5]

[['finally',
  'this',
  'is',
  'being',
  'picked',
  'up',
  'like',
  'good',
  'cheese',
  'its',
  'taken',
  'a',
  'while',
  'for',
  'median',
  'to',
  'pick',
  'up',
  'but',
  'good',
  'clmate',
  'shocker',
  'nzs',
  'zero',
  'carbo',
  'bill',
  'goes',
  'too',
  'far',
  'breaches',
  'pariss',
  'clmate',
  'agreemen'],
 ['appreciating',
  'this',
  'pieces',
  'in',
  'atest',
  'edition',
  'of',
  'the',
  'listener',
  'we',
  'note',
  'tough',
  'its',
  'not',
  'just',
  'intentional',
  'investor',
  'its',
  'carbo',
  'investor',
  'per'],
 ['addressing',
  'this',
  'in',
  'a',
  'resonable',
  'and',
  'considered',
  'manner',
  'woulda',
  'be',
  'idea',
  'zero',
  'carbo',
  'bill',
  'target',
  'unachievable',
  'retiring',
  'nationals',
  'mp'],
 ['love',
  'a',
  'good',
  'challenges',
  'posted',
  'on',
  'behalf',
  'of',
  'willie',
  'and',
  'angela',
  'falloon',
  'we',
  'belive',
  'we',
  'all',
  'have',
  'a',
  'part',
  'to'

In [21]:
# Functions for importing & cleaning relevant tweets
def lower(s):
    """Return ``s`` lowercased by calling its own ``lower()`` method."""
    lowered = s.lower()
    return lowered

def tweet_imports(filename):
    """Load a pickled tweet DataFrame and add a normalised text column.

    Parameters
    ----------
    filename : str or path-like
        Path handed to ``pd.read_pickle``. The stored frame must contain a
        ``tweet`` column and a ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The frame with duplicate rows dropped, plus:

        * ``tweet_clean`` -- ``tweet`` with links removed, every character
          other than ASCII letters, digits and spaces removed, and the
          result lowercased;
        * ``date`` -- converted with ``pd.to_datetime``.
    """
    imp = pd.read_pickle(filename).drop_duplicates()

    # The pattern is a raw string so ``\S`` is not an invalid string escape,
    # and the dots are escaped so they only match a literal '.'.
    # regex=True is required: since pandas 2.0 ``Series.str.replace``
    # defaults to literal matching, which would silently leave every link
    # in place.
    link_pattern = r'http\S+|www\.\S+|pic\.twitter\.com\S+'

    imp['tweet_clean'] = (
        imp['tweet']
        .str.replace(link_pattern, '', case=False, regex=True)
        .str.replace(r'[^A-Za-z0-9 ]+', '', regex=True)
        # Vectorised lowercase; unlike a per-element .lower() call this
        # leaves missing values as missing instead of raising.
        .str.lower()
    )
    imp['date'] = pd.to_datetime(imp['date'])
    return imp

In [24]:
from nltk.tokenize import MWETokenizer

# Multi-word expressions registered with the tokenizer, supplied up front
# to the constructor rather than added one call at a time.
tk = MWETokenizer([
    ('climate', 'change'),
    ('global', 'warming'),
    ('one', 'world'),
    ('new', 'jobs'),
    ('carbon', 'tax'),
    ('carbon', 'neutral'),
])

In [22]:
# Run both tweet pickles through tweet_imports, affirm file first, then deny.
cleaned_affirm_tweets, cleaned_deny_tweets = (
    tweet_imports(pickle_path)
    for pickle_path in ('all_affirm_tweets.pkl', 'all_deny_tweets.pkl')
)

In [29]:
def _mwe_tokenize_all(texts):
    """Lowercase and whitespace-split each text, then pass the words through ``tk``.

    Returns one token list per input text, in input order.
    """
    tokenized = []
    for text in texts:
        words = text.lower().split()
        tokenized.append(tk.tokenize(words))
    return tokenized


pro_sentences = _mwe_tokenize_all(cleaned_affirm_tweets['tweet_clean'])
anti_sentences = _mwe_tokenize_all(cleaned_deny_tweets['tweet_clean'])

In [None]:
# TO DO: fuzzy matching after MWE tokenization

In [14]:
def log_odds(l1,l2):
    """Compute the log odds ratio of every token across two token lists.

    Tokens are counted in each list. A token that occurs in only one list is
    given a pseudo-count of 0.5 in the other, so both sides end up with the
    same vocabulary. For each token the odds ``f / (1 - f)`` of its relative
    frequency ``f`` are computed per list, and the natural log of
    ``odds in l1 / odds in l2`` is returned.

    Parameters
    ----------
    l1, l2 : iterables of hashable tokens

    Returns
    -------
    dict
        token -> log odds ratio. Positive values mean higher odds in ``l1``,
        negative values higher odds in ``l2``. Keys follow first-seen order
        in ``l1``, followed by tokens that only occur in ``l2``.

    Raises
    ------
    ZeroDivisionError
        If the combined vocabulary holds exactly one distinct token (its
        relative frequency is then 1 and the odds denominator is 0).
    """
    def _odds(count, total):
        # Relative frequency first, then odds -- same arithmetic order as
        # computing the frequency table and the odds table separately.
        share = count*1./total
        return share/(1-share)

    tallies_a = Counter(l1)
    tallies_b = Counter(l2)

    # Smoothing: pseudo-count 0.5 for tokens unseen on one side. The first
    # top-up runs before the second so the second sees the full vocabulary.
    only_in_b = [tok for tok in tallies_b if tok not in tallies_a]
    tallies_a.update(dict.fromkeys(only_in_b, 0.5))
    only_in_a = [tok for tok in tallies_a if tok not in tallies_b]
    tallies_b.update(dict.fromkeys(only_in_a, 0.5))

    total_a = sum(tallies_a.values())
    total_b = sum(tallies_b.values())

    scores = {}
    for tok in tallies_a:
        ratio = _odds(tallies_a[tok], total_a)/_odds(tallies_b[tok], total_b)
        scores[tok] = np.log(ratio)
    return scores

In [41]:
def _flatten(sentences):
    """Concatenate the items of every sub-list into one flat list, in order."""
    flat = []
    for sentence in sentences:
        flat.extend(sentence)
    return flat


fuzzy_pro_toks = _flatten(pro_sentences_fuzzy_matched)
fuzzy_anti_toks = _flatten(anti_sentences_fuzzy_matched)
#pro_toks = [item for sublist in pro_sentences for item in sublist if len(item) < 15]
#anti_toks = [item for sublist in anti_sentences for item in sublist if len(item) < 15]

In [36]:
# Preview the first five token lists produced by the MWE tokenisation of the
# cleaned affirm tweets.
pro_sentences[:5]

[['ipcc',
  'just',
  'released',
  'the',
  'srocc',
  'a',
  'new',
  'report',
  'on',
  'oceans',
  'and',
  'ice',
  'it',
  'reminds',
  'us',
  'of',
  'these',
  'powerful',
  'words',
  'from',
  'kathy',
  'and',
  'aka',
  'one',
  'year',
  'ago',
  'rewatch',
  'the',
  'stunning',
  '6minute',
  'film',
  'at'],
 ['the',
  'ipcc',
  'special',
  'report',
  'on',
  'ocean',
  'and',
  'ice',
  'is',
  'out',
  'and',
  'well',
  'be',
  'honest',
  'it',
  'looks',
  'bleak',
  'but',
  'we',
  'know',
  'what',
  'must',
  'be',
  'done',
  'the',
  'age',
  'of',
  'fossil',
  'fuels',
  'must',
  'endread',
  'here',
  'to',
  'find',
  'out',
  'what',
  'this',
  'science',
  'means',
  'to',
  'people',
  'on',
  'the',
  'frontlines',
  'srocc'],
 ['unusually',
  'warm',
  'water',
  'surrounding',
  'one',
  'of',
  'the',
  'largest',
  'glaciers',
  'in',
  'greenland',
  'isnt',
  'good',
  'newsa',
  'billion',
  'tons',
  'of',
  'ice',
  'lost',
  'here',
  

In [42]:
#pro_anti_log_odds_ratios = log_odds(pro_toks,anti_toks)
# Score every token in the fuzzy-matched lists. log_odds divides the odds of
# its first argument by the odds of its second, so positive scores lean
# towards the "pro" tokens and negative scores towards the "anti" tokens.
fuzzy_pro_anti_log_odds_ratios = log_odds(fuzzy_pro_toks,fuzzy_anti_toks)

In [43]:
#sorted_pro_anti_log_odds_ratios = sorted(pro_anti_log_odds_ratios.items(),key=lambda x:x[1],reverse=True)
# Rank (token, score) pairs from the highest log odds ratio to the lowest.
fuzzy_sorted_pro_anti_log_odds_ratios = sorted(
    fuzzy_pro_anti_log_odds_ratios.items(),
    key=lambda token_score: token_score[1],
    reverse=True,
)

In [39]:
# Show the first 50 entries of the sorted non-fuzzy (token, score) list.
# NOTE(review): the assignment that creates sorted_pro_anti_log_odds_ratios is
# commented out in this notebook, so on a fresh kernel this cell raises
# NameError unless that assignment (and the pro_toks / anti_toks lists it
# needs) is restored.
sorted_pro_anti_log_odds_ratios[:50]

[('rtdesmoguk', 8.18272521122586),
 ('icym', 8.109490114325467),
 ('go100re', 7.936759117681209),
 ('waterislife', 7.167234908505245),
 ('bikes4climate', 7.022305389143198),
 ('climatecurate', 6.963684359190483),
 ('notmx', 6.82901593928712),
 ('rtclientearth', 6.7797713782944875),
 ('rtcc', 6.527833836991624),
 ('stoppipelines', 6.414473841443638),
 ('climatemegan', 6.343901845604923),
 ('globalactplan', 6.28869820638638),
 ('inkl', 6.27311275447586),
 ('rtmcswee', 6.191319717006068),
 ('aces', 6.188516731153423),
 ('wemeanit', 6.132821107357468),
 ('powertoswitch', 6.004435134792601),
 ('tarsands', 5.973500285610109),
 ('jhiskes', 5.969587201986877),
 ('deccorals', 5.969587201986877),
 ('ue', 5.896116412284955),
 ('trewinr', 5.872854451698635),
 ('greenjobs', 5.857105614142179),
 ('rospearce', 5.757019272060145),
 ('rtcclive', 5.757019272060145),
 ('paperli', 5.702976474440348),
 ('propublica', 5.684257995432302),
 ('sustcomm', 5.684257995432302),
 ('peoplevscoal', 5.6555471686712915

In [44]:
# The list is sorted by descending score, so this shows the 50 fuzzy-matched
# tokens with the highest log odds ratios.
fuzzy_sorted_pro_anti_log_odds_ratios[:50]

[('climat', 12.267923110255722),
 ('hange', 11.271826079913485),
 ('enrgy', 10.898404355987141),
 ('carbn', 10.348500391902212),
 ('arming', 9.874545755577772),
 ('eople', 9.861134738259732),
 ('climatechan', 9.821496237704082),
 ('actio', 9.778008737359297),
 ('futur', 9.226611615777788),
 ('renwable', 9.096452253465934),
 ('reort', 9.085873654668694),
 ('supprt', 8.807205345296104),
 ('indstry', 8.804274905066414),
 ('polution', 8.787704066736486),
 ('wether', 8.658588856463638),
 ('pubic', 8.576571511808739),
 ('greenewdeal', 8.560977624337001),
 ('rtdesmoguk', 8.452811150100086),
 ('impct', 8.441069789989495),
 ('politcal', 8.329640851659924),
 ('generatio', 8.182626864472995),
 ('questin', 8.103099344095229),
 ('climateation', 8.10073160723265),
 ('billon', 8.087607959629398),
 ('someting', 8.066981261748223),
 ('efficency', 8.064526400945745),
 ('mthane', 8.050915953778539),
 ('go100re', 8.049669411780211),
 ('atmosphre', 7.944551069975746),
 ('systm', 7.9064260073418),
 ('transt

In [40]:
# Show the last 50 entries of the sorted non-fuzzy (token, score) list.
# NOTE(review): the assignment that creates sorted_pro_anti_log_odds_ratios is
# commented out in this notebook, so on a fresh kernel this cell raises
# NameError unless that assignment is restored.
sorted_pro_anti_log_odds_ratios[-50:]

[('irradiance', -5.933890595199233),
 ('credlin', -5.933890595199233),
 ('breitbartnews', -5.952795615652895),
 ('planethealing', -5.952795615652895),
 ('alberta411', -5.956534110926493),
 ('psiintl', -5.960258683180498),
 ('vanpoli', -5.969556770606905),
 ('climatescambs', -5.98231944022399),
 ('gh', -5.993169975737841),
 ('dmsp', -6.003904050871004),
 ('climatecult', -6.007456625638784),
 ('ir', -6.010651806261905),
 ('imager', -6.010996625507349),
 ('metop', -6.014524139188789),
 ('avhrr', -6.014524139188789),
 ('seviri', -6.014524139188789),
 ('havenr64', -6.018039254459735),
 ('mapsnorthern', -6.059284290958141),
 ('craighavenr', -6.072663109560062),
 ('ilmastonmuutos', -6.121297655861416),
 ('multisensor', -6.1244573893162455),
 ('caca', -6.136997431227124),
 ('junkscience', -6.164650521040213),
 ('adiabatic', -6.188605147382592),
 ('joannenova', -6.247553423524605),
 ('snowice', -6.251667092989312),
 ('adapt2030', -6.27906779883658),
 ('latestnews', -6.357034535301745),
 ('gotmi

In [45]:
# The list is sorted by descending score, so this shows the 50 fuzzy-matched
# tokens with the lowest (most negative) log odds ratios.
fuzzy_sorted_pro_anti_log_odds_ratios[-50:]

[('levels', -7.80691751399),
 ('youare', -7.811587593893603),
 ('claims', -7.8309203763520205),
 ('agreat', -7.837012642822188),
 ('powers', -7.83958805564201),
 ('rnewable', -7.851085388413272),
 ('wrongi', -7.88059120171805),
 ('policya', -7.887545056452793),
 ('repor', -7.887545056452793),
 ('criss', -7.888314733906715),
 ('theory2', -7.929391724881963),
 ('temperatur', -7.941137817679738),
 ('scietific', -7.958143841771789),
 ('increse', -7.967784201426193),
 ('every1', -7.977684528447237),
 ('earths', -7.980254165279339),
 ('goings', -7.984696732325617),
 ('atmospherc', -7.993393488140591),
 ('jspry', -8.011244339861102),
 ('neverb', -8.097577921431014),
 ('tmosphere', -8.109677928228338),
 ('coulds', -8.112142012409521),
 ('lttle', -8.118581534353247),
 ('cdnoli', -8.18050972949076),
 ('youtubes', -8.195057503808066),
 ('uspol', -8.21329767816101),
 ('contro', -8.253363089487692),
 ('caused', -8.257362715667853),
 ('thatso', -8.286990643569734),
 ('belive', -8.30440647774254),
 (