In [1]:
from imblearn.under_sampling import RandomUnderSampler
from collections import defaultdict
from collections import Counter
import requests
import fasttext
import pandas as pd
import json

In [2]:
# download 1.2M tweets from US politicians and their Twitter account info from:
#
#   https://www.reddit.com/r/datasets/comments/6fniik/over_one_million_tweets_collected_from_us/
#
# extract and set path below:

data_path = '/Users/mtrencseni/Downloads/US_PoliticalTweets/'

In [3]:
users = []
with open(data_path + 'users.json') as f:
    while True:
        line = f.readline().strip()
        if not line or len(line) == 0:
            break
        user = json.loads(line)
        users.append({'name': user['name'], 'screen_name': user['screen_name']})
print('Total politicans:', len(users), '\n')
print(users[:10], '...')

Total politicans: 548 

[{'name': 'Governor Bill Walker', 'screen_name': 'AkGovBillWalker'}, {'name': 'Amy Klobuchar', 'screen_name': 'amyklobuchar'}, {'name': 'Anthony G. Brown', 'screen_name': 'AnthonyBrownMD4'}, {'name': 'Gov. Asa Hutchinson', 'screen_name': 'AsaHutchinson'}, {'name': 'Rep. Austin Scott', 'screen_name': 'AustinScottGA08'}, {'name': 'RepBThompson', 'screen_name': 'BennieGThompson'}, {'name': 'Bill Cassidy', 'screen_name': 'BillCassidy'}, {'name': 'Gov. Bill Haslam', 'screen_name': 'BillHaslam'}, {'name': 'U.S. Rep. Bob Latta', 'screen_name': 'boblatta'}, {'name': 'Rep. Brad Sherman', 'screen_name': 'BradSherman'}] ...


In [4]:
parties = ['Independent', 'Libertarian', 'Democratic', 'Republican']

def normalize_name(name):
    prefixes = ['governor', 'gov.', 'rep.', 'u.s. rep.', 'gov', 'rep', 'captain', 'cong.']
    for prefix in prefixes:
        if name.lower().startswith(prefix):
            return name[len(prefix):].strip()
    return name.strip()

def fetch_politician_info(name):
    info = {}
    info['name'] = name
    info['wiki_page'] = ''
    info['party'] = ''
    try:
        normalized_name = normalize_name(name)
        search_url = 'https://en.wikipedia.org/w/api.php?action=query&list=search&format=json&srsearch=' + normalized_name + ' american politician'
        wiki_page = requests.get(search_url).json()['query']['search'][0]['title']
        info['wiki_page'] = 'https://en.wikipedia.org/wiki/' + wiki_page.replace(' ', '_')
        wiki_info_url = 'https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&rvsection=0&redirects=1&format=json&titles=' + wiki_page
        result = requests.get(wiki_info_url).json()
        pages = result['query']['pages']
        for _, v in pages.items():
            lines = v['revisions'][0]['*'].split('\n')
            for line in lines:
                if line.startswith('| party') or line.startswith('|party'):
                    for party in parties:
                        if party.lower() in line.lower():
                            info['party'] = party
                            return info
    except:
        pass
    finally:
        return info

In [5]:
fetch_politician_info('Adam Schiff')

{'name': 'Adam Schiff',
 'wiki_page': 'https://en.wikipedia.org/wiki/Adam_Schiff',
 'party': 'Democratic'}

In [6]:
politicians = {}

In [7]:
for i, user in enumerate(users):
    if user['screen_name'] in politicians:
        if politicians[user['screen_name']]['party'] != '':
            continue
    info = fetch_politician_info(user['name'])
    info['screen_name'] = user['screen_name']
    politicians[info['screen_name']] = info
    print(i, ':', info['name'], '->', info['party'], ' '*32, end='\r')

547 : Yvette Clarke -> Democratic                                        

In [8]:
print('Missing party:')
num_missing = 0
for k, v in politicians.items():
    if v['party'] == '':
        print(k, v['name'])
        num_missing += 1
print('Total:', num_missing)

Missing party:
BennieGThompson RepBThompson
GOPLeader Kevin McCarthy
GovChristie Governor Christie
GregHarper Greg Harper
LamarSmithTX21 Lamar Smith
MacTXPress Mac Thornberry Press
RepDonBacon Rep. Don Bacon
RepMeehan Patrick Meehan
RepPeteKing Rep. Pete King
RepWalterJones Rep. Walter Jones
SenSanders Bernie Sanders
SenSchumer Chuck Schumer
tiberipress tiberipress
USRepKCastor US Rep Kathy Castor
Total: 14


In [13]:
politicians['BennieGThompson']['party'] = 'Democratic'
politicians['DrNealDunnFL2']['party']   = 'Republican'
politicians['GregHarper']['party']      = 'Republican'
politicians['justinamash']['party']     = 'Libertarian'
politicians['MacTXPress']['party']      = 'Republican'
politicians['RepBillJohnson']['party']  = 'Republican'
politicians['RepMeehan']['party']       = 'Republican'
politicians['RepPeteKing']['party']     = 'Republican'
politicians['RepWalterJones']['party']  = 'Republican'
politicians['SenSanders']['party']      = 'Independent'
politicians['tiberipress']['party']     = 'Republican'
politicians['GOPLeader']['party']       = 'Republican'
politicians['GovChristie']['party']     = 'Republican'
politicians['LamarSmithTX21']['party']  = 'Republican'
politicians['RepDonBacon']['party']     = 'Republican'
politicians['SenSchumer']['party']      = 'Democratic'
politicians['USRepKCastor']['party']    = 'Democratic'

In [15]:
print('Missing party:')
num_missing = 0
for k, v in politicians.items():
    if v['party'] == '':
        print(k, v['name'])
        num_missing += 1
print('Total:', num_missing)

Missing party:
Total: 0


In [16]:
Counter([v['party'] for k, v in politicians.items()])

Counter({'Independent': 4,
         'Democratic': 242,
         'Republican': 301,
         'Libertarian': 1})

In [17]:
# open tweets and format for fasttext supervised learning

ts, screen_names, txt = [], [], []
with open(data_path + 'tweets.json') as f:
    for i, line in enumerate(f):
        print(i, end='\r')
        line = line.strip()
        if len(line) == 0:
            continue
        tweet = json.loads(line)
        ts.append(tweet['created_at'])
        screen_names.append(tweet['screen_name'])
        txt.append(tweet['text'])
df = pd.DataFrame([ts, screen_names, txt]).T
df.columns = ['ts', 'screen_name', 'text']
df.sort_values(by='ts')
df.tail(10)

1243369

Unnamed: 0,ts,screen_name,text
1243360,1496769064,RepPaulTonko,@urbaninstitute @BrookingsInst @TaxPolicyCente...
1243361,1496769154,SenJeffMerkley,@BetsyDeVosED #Questions4Betsy: How in the nam...
1243362,1496769197,GovChrisSununu,"Welcome to the State House, Moharimet Elementa..."
1243363,1496769215,SenMarkey,@BetsyDeVosED How can you protect American stu...
1243364,1496769236,RepPaulTonko,@urbaninstitute @BrookingsInst @TaxPolicyCente...
1243365,1496769301,PramilaJayapal,Dismantling #DoddFrank returns us to the days ...
1243366,1496769303,RepSarbanes,"In the shadows of the #ComeyHearing, @HouseGOP..."
1243367,1496769317,SenMarkey,@BetsyDeVosED How does a budget that cuts inve...
1243368,1496769357,MarioDB,Thank you @POTUS @NikkiHaley for strong stance...
1243369,1496769360,PramilaJayapal,#WrongCHOICEAct will eliminate consumer protec...


In [18]:
df['party'] = df.apply(lambda x: politicians[x['screen_name']]['party'], axis=1)

In [19]:
df_dems_reps = df.loc[df['party'].isin(['Democratic', 'Republican'])]
df_dems_reps.tail(10)

Unnamed: 0,ts,screen_name,text,party
1243360,1496769064,RepPaulTonko,@urbaninstitute @BrookingsInst @TaxPolicyCente...,Democratic
1243361,1496769154,SenJeffMerkley,@BetsyDeVosED #Questions4Betsy: How in the nam...,Democratic
1243362,1496769197,GovChrisSununu,"Welcome to the State House, Moharimet Elementa...",Republican
1243363,1496769215,SenMarkey,@BetsyDeVosED How can you protect American stu...,Democratic
1243364,1496769236,RepPaulTonko,@urbaninstitute @BrookingsInst @TaxPolicyCente...,Democratic
1243365,1496769301,PramilaJayapal,Dismantling #DoddFrank returns us to the days ...,Democratic
1243366,1496769303,RepSarbanes,"In the shadows of the #ComeyHearing, @HouseGOP...",Democratic
1243367,1496769317,SenMarkey,@BetsyDeVosED How does a budget that cuts inve...,Democratic
1243368,1496769357,MarioDB,Thank you @POTUS @NikkiHaley for strong stance...,Republican
1243369,1496769360,PramilaJayapal,#WrongCHOICEAct will eliminate consumer protec...,Democratic


In [20]:
# split data into train and test
# train is chronologically before test

train_size = 1000*1000
df_train = df_dems_reps.head(train_size)
df_test = df_dems_reps.tail(-train_size)
print(len(df_train), len(df_test))

# balance both train and test between D and R tweets
rus = RandomUnderSampler(sampling_strategy='not minority')
df_train, _ = rus.fit_resample(df_train, df_train['party'])
df_test, _ = rus.fit_resample(df_test, df_test['party'])
print(len(df_train), len(df_test))

1000000 232248
881460 195804


In [21]:
# verify that train and test are after each other
print(df_train[['ts']].agg(['min', 'max']))
print(df_test[['ts']].agg(['min', 'max']))

             ts
min  1217870931
max  1482173307
             ts
min  1482173324
max  1496769360


In [31]:
def remove_atmentions(s):
    words = s.split()
    words = [word for word in words if not word.startswith('@')]
    return ' '.join(words)

def remove_urls(s):
    words = s.split()
    words = [word for word in words if not word.lower().startswith('http')]
    return ' '.join(words)

def write_df(df, filename, label, text_fun=lambda s: s):
    with open(filename, "w") as f:
        f.writelines([text_fun(x['text']) + ' ' + '__label__' + x[label] + '\n' for _, x in df.iterrows()])

In [25]:
write_df(df_train, data_path + 'train_party_full.fts', 'party')
write_df(df_test, data_path + 'test_party_full.fts', 'party')
write_df(df_train, data_path + 'train_party_trunc.fts', 'party', lambda s: remove_urls(remove_atmentions(s)))
write_df(df_test, data_path + 'test_party_trunc.fts', 'party', lambda s: remove_urls(remove_atmentions(s)))

In [26]:
model_party_full    = fasttext.train_supervised(data_path + 'train_party_full.fts')
model_party_trunc = fasttext.train_supervised(data_path + 'train_party_trunc.fts')
# 
# or
#
# model = fasttext.train_supervised(..., wordNgrams=3)
#
# to have fasttext also use 2-grams and 3-grams, but it doesn't help

In [27]:
print('Accuracy on full tweets predicting binary party:')
accuracy = model_party_full.test(data_path + 'train_party_full.fts')[1]
print('Train accuracy:', accuracy)
accuracy = model_party_full.test(data_path + 'test_party_full.fts')[1]
print('Test accuracy: ', accuracy)
print()
print('Accuracy on truncated tweets predicting binary party:')
accuracy = model_party_trunc.test(data_path + 'train_party_trunc.fts')[1]
print('Train accuracy:', accuracy)
accuracy = model_party_trunc.test(data_path + 'test_party_trunc.fts')[1]
print('Test accuracy: ', accuracy)

Accuracy on full tweets predicting binary party:
Train accuracy: 0.9295759308420121
Test accuracy:  0.7467671753386039

Accuracy on truncated tweets predicting binary party:
Train accuracy: 0.85225194563565
Test accuracy:  0.7188412902698617


In [28]:
# sample tweet from Trump
txt = """
Mike Pence didn’t have the courage to do what should have been done
to protect our Country and our Constitution, giving States a chance to
certify a corrected set of facts, not the fraudulent or inaccurate ones
which they were asked to previously certify. USA demands the truth!
""".replace('\n', ' ')
model_party_trunc.predict(txt)

(('__label__Republican',), array([0.79000771]))

In [29]:
# now let's try to predict the user from the tweet
# no need to limit to just D and R, no need to re-sample

df_train = df.head(train_size)
df_test = df.tail(-train_size) 
print(len(df_train), len(df_test))

1000000 243370


In [32]:
write_df(df_train, data_path + 'train_author_full.fts', 'screen_name')
write_df(df_test, data_path + 'test_author_full.fts', 'screen_name')
write_df(df_train, data_path + 'train_author_trunc.fts', 'screen_name', lambda s: remove_urls(remove_atmentions(s)))
write_df(df_test, data_path + 'test_author_trunc.fts', 'screen_name', lambda s: remove_urls(remove_atmentions(s)))

In [33]:
model_author_full    = fasttext.train_supervised(data_path + 'train_author_full.fts')
model_author_trunc   = fasttext.train_supervised(data_path + 'train_author_trunc.fts')

In [34]:
print('Accuracy on full tweets predicting author:')
accuracy = model_author_full.test(data_path + 'train_author_full.fts')[1]
print('Train accuracy:', accuracy)
accuracy = model_author_full.test(data_path + 'test_author_full.fts')[1]
print('Test accuracy: ', accuracy)
print()
print('Accuracy on truncated tweets predicting author:')
accuracy = model_author_trunc.test(data_path + 'train_author_trunc.fts')[1]
print('Train accuracy:', accuracy)
accuracy = model_author_trunc.test(data_path + 'test_author_trunc.fts')[1]
print('Test accuracy: ', accuracy)

Accuracy on full tweets predicting author:
Train accuracy: 0.512116
Test accuracy:  0.22982925637542978

Accuracy on truncated tweets predicting author:
Train accuracy: 0.411818
Test accuracy:  0.18464876301645575
