In [39]:
import os
import sys
import re
import json
import time
import emoji
import random
import numpy as np
import pandas as pd
from pandarallel import pandarallel
from copy import deepcopy
from datetime import date, datetime
from collections import Counter, OrderedDict

from IPython.display import clear_output

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
# t0 = time.time()

In [40]:
cfg = dict()
cfg['input dir']   = 'input'
cfg['sample_size'] = 10000

In [41]:
inFile = os.path.join(cfg['input dir'], 'Tweets_US.csv')
tweets_df = pd.read_csv(inFile, lineterminator='\n')

columns_to_drop = ['user_verified', 'truncated', 'is_retweet', 'coordinates', 'has_media_type', 
                   'in_reply_to_status_id', 'source']
tweets_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')
tweets_df['date'] = tweets_df['timestamp'].apply(lambda x: x[:x.find(' ')])

print(f'Number of tweets: {len(tweets_df) :,}')

Number of tweets: 645,253


In [42]:
tweets_df.head(2)

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,timestamp,text,hashtags,retweet_count,favorite_count,date
0,1323052187539353600,Julie Frein,"Headingley, Leeds, UK","BA(Hons) Art Vis Comm. Expat American living in UK, Divorced, walking a positive path. PLEASE, no DMS if only wanting a 'date'. #Resistance #lovepeople",2009-02-19 02:14:14,1222,1282,17866,2020-11-01 23:59:57,"@realDonaldTrump Dumpy Donnie is the candidate of rioters, looters, arsonists, gun-toting Billy-Bubbas, terrorists, lobbyists and special interests. Biden is the candidate of farmers, factory workers, police officers, and hard-working, law-abiding patriots of every race, religion and creed #BIDEN",['BIDEN'],0,0,2020-11-01
1,1323052185270235136,Bev 🏃‍♀️,Canada,🇨🇦 🍁 Outdoor & animal enthusiast! Love my family my dog & Passion Fruit tea! #cdnpoli #onpoli #blacklivesmatter #humanrights #lgbtq 🏳️‍🌈,2012-10-20 15:40:25,3232,4991,64742,2020-11-01 23:59:56,The Time has Come! #VOTE #Election2020 #RaiseYourVoice #BLM #BidenHarris2020ToSaveAmerica #coronavirus #VoteBlueToEndTheNightmare #Trump #RepublicansForBiden https://t.co/H0YpFeNBtG,"['VOTE', 'Election2020', 'RaiseYourVoice', 'BLM', 'BidenHarris2020ToSaveAmerica', 'coronavirus', 'VoteBlueToEndTheNightmare', 'Trump', 'RepublicansForBiden']",0,0,2020-11-01


#### Location

In [43]:
places_df = pd.read_csv(os.path.join(cfg['input dir'], 'City_Country.csv'))
places_df = places_df.applymap(lambda x: str(x).lower())

In [44]:
countries = set(places_df.country.unique())
print(f"countries has {len(countries) :,} entries")

countries has 244 entries


In [45]:
city_to_subcountry = places_df[['city','subcountry']].set_index('city').T.to_dict('records')
city_to_subcountry = city_to_subcountry[0]

  """Entry point for launching an IPython kernel.


In [46]:
city_to_country = places_df[['city','country']].set_index('city').T.to_dict('records')
city_to_country = city_to_country[0]

#Append special cases:
additonals = {
    'uk' : 'united kingdom',
    'us' : 'united states',
    'usa': 'united states',
    '美國': 'united states',
    '香港': 'hong kong',
    '日本': 'japan',
    '東京': 'japan',
    '台灣': 'taiwan',
    '台北': 'taiwan',
    '北京': 'china',
    '上海': 'china',
    '中华人民共和国': 'china'
}

city_to_country.update(additonals)
print(f"city_to_country has {len(city_to_country) :,} entries")

  """Entry point for launching an IPython kernel.


city_to_country has 21,952 entries


In [47]:
subcountry_to_country = places_df[['subcountry', 'country']].set_index('subcountry').T.to_dict('records')
subcountry_to_country = subcountry_to_country[0]
print(f"subcountry_to_country has {len(subcountry_to_country) :,} entries")

  """Entry point for launching an IPython kernel.


subcountry_to_country has 2,592 entries


In [48]:
def find_countries(text):        
    
    if not isinstance(text, str):
        return np.nan
    
    #------------------------------------#
    def lookup(term):        
        if term in countries:
            return term
        elif term in city_to_country:            
            return city_to_country[term]
        elif term in subcountry_to_country:
            return subcountry_to_country[term]
        else:
            return ''
    #------------------------------------#
    
    Y = set()
    terms = text.split()
        
    if len(text) == 0:
        return Y
    
    elif len(terms) == 1:
        result = lookup(terms[0])
        if len(result) > 0:
            Y.add(result)                          
    else:   
        check_single_term = True  
        
        for t1, t2 in zip(terms[:-1], terms[1:]):                      
            result = lookup(f'{t1} {t2}')            
            if len(result) > 0:
                Y.add(result)
                check_single_term = False        
                
            elif check_single_term:
                result = lookup(t1)
                if len(result) > 0:
                    Y.add(result)                    
            else:
                check_single_term = True
          
        if check_single_term:
            result = lookup(terms[-1])
            if len(result) > 0:
                Y.add(result)    

    return Y 
        

In [49]:
def clean(text):
    if isinstance(text, str):
        text = emoji.demojize(text)    
        text = re.sub(r'([^\w]|\_)', ' ', text) 
        return " ".join(text.lower().split())
    else:
        return text

In [50]:
%%time
pandarallel.initialize()

#--------------------------------------------#
tweets_df['user_location'] = tweets_df['user_location'].parallel_apply(lambda x: clean(x))
tweets_df['countries'] = tweets_df['user_location'].parallel_apply(lambda x: find_countries(x))


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 1.91 s, sys: 1.31 s, total: 3.22 s
Wall time: 12.2 s


#### Find States within US

In [51]:
states_df = pd.read_csv(os.path.join(cfg['input dir'], 'USA_States_code.csv'))

states = set(state.lower() for state in states_df.State.values)

code_lookup, abbrev_lookup = dict(), dict()
for state, abbrev, code in states_df.values:
    code_lookup[code.lower()] = state.lower()
    abbrev_lookup[abbrev.lower()] = state.lower()

In [52]:
%%time

def get_us_states(df):
    
    #----------------------------------------------#
    def lookup(term):
        if term in states:
            return term
        elif term in code_lookup:
            return code_lookup[term]
        elif term in abbrev_lookup:
            return abbrev_lookup[token]      
        return ''
    #----------------------------------------------#
    
    us_states = []
    for location, countries in df[['user_location', 'countries']].values:           

        Y = set()
        if pd.notnull(countries) and 'united states' in countries:
            tokens = location.lower().split()

            if len(tokens) == 1:
                result = lookup(tokens[0])
                if len(result) > 0:
                    Y.add(result)  

            else:   
                check_single_term = True 

                for t1, t2 in zip(tokens[:-1], tokens[1:]):                      
                    result = lookup(f'{t1} {t2}')            
                    if len(result) > 0:
                        Y.add(result)
                        check_single_term = False        

                    elif check_single_term:
                        result = lookup(t1)
                        if len(result) > 0:
                            Y.add(result)
                    else:
                        check_single_term = True

                if check_single_term:
                    result = lookup(tokens[-1])
                    if len(result) > 0:
                        Y.add(result)  
                        
        if len(Y) == 0:
            Y.add('usa')

        if 'washington' in Y and 'district of columbia' in Y:
            Y.remove('washington')
            
        us_states.append(list(Y))

    return us_states
#----------------------------------------------#

tweets_df['states'] = get_us_states(tweets_df)

CPU times: user 5.94 s, sys: 161 ms, total: 6.1 s
Wall time: 6.1 s


#### Test data set (include training and validation)

In [55]:
export_columns = ['id', 'user_name', 'user_description', 'text', 'states', 'date']

In [56]:
ids = []
for id_, state in tweets_df[['id', 'states']].values:
    if len(state) > 0:
        ids.append(id_)

ids = set(ids)
test_df = tweets_df[tweets_df.id.isin(ids)]

test_df['user_created datetime'] = test_df['user_created'].apply(lambda s: datetime.fromisoformat(s))
test_df = test_df[test_df['user_created datetime'] < datetime.fromisoformat('2020-09-01')]

print(f"test_df: {test_df.shape}")
test_df[export_columns].to_csv('test', index=False)

test_df: (621338, 17)


In [57]:
test_df.head(1)

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,timestamp,text,hashtags,retweet_count,favorite_count,date,countries,states,user_created datetime
0,1323052187539353600,Julie Frein,headingley leeds uk,"BA(Hons) Art Vis Comm. Expat American living in UK, Divorced, walking a positive path. PLEASE, no DMS if only wanting a 'date'. #Resistance #lovepeople",2009-02-19 02:14:14,1222,1282,17866,2020-11-01 23:59:57,"@realDonaldTrump Dumpy Donnie is the candidate of rioters, looters, arsonists, gun-toting Billy-Bubbas, terrorists, lobbyists and special interests. Biden is the candidate of farmers, factory workers, police officers, and hard-working, law-abiding patriots of every race, religion and creed #BIDEN",['BIDEN'],0,0,2020-11-01,{united kingdom},[usa],2009-02-19 02:14:14


#### Get sample for training

We assume certain hashtags in text reflect voting intention. This is not always correct, but spot checking samples of a few hundred suggests they are about 90% correct.

In [58]:
for_Trump_hashtags = ['trump2020landslide', 'trump2020landslidevictory', 'trump2020tosaveamerica', 
                      'bidencrimefamily']

for_Biden_hashtags = ['trumpcovid', 'trumpvirus', 'votehimout', 'voteblue', 'trumpisanationaldisgrace', 
                      'trumpmeltdown', 'trumpcrimefamily']

In [59]:
%%time

trump_ht = set(for_Trump_hashtags)
biden_ht = set(for_Biden_hashtags)

Y = []
c = 0
for hashtags in tweets_df.hashtags.values:    
    hashtags = [h.lower() for h in eval(hashtags)]
        
    biden, trump = False, False
    tmp = 'neutral'
    
    if len(set(hashtags).intersection(trump_ht)) > 0:
        tmp = 'trump'
        trump = True
        
    if len(set(hashtags).intersection(biden_ht)) > 0:
        tmp = 'biden'
        biden = True
    
    if biden and trump:
        tmp = 'neutral'
        c += 1 #count of both
        
    Y.append(tmp)

#--------------------------------#
tweets_df['support'] = Y


#--------------------------------#
print(f"All: {len(tweets_df) :,}")
print(f"Neither: {len(tweets_df[tweets_df['support']=='neutral']) :,}")
print(f"Trump: {len(tweets_df[tweets_df['support']=='trump']) :,}")
print(f"Biden: {len(tweets_df[tweets_df['support']=='biden']) :,}")
print(f"Both: {c :,}")

All: 645,253
Neither: 593,515
Trump: 9,223
Biden: 42,515
Both: 557
CPU times: user 10.5 s, sys: 71 ms, total: 10.6 s
Wall time: 10.6 s


In [60]:
sample = dict()
for candidate in ['trump', 'biden']:
    ind = tweets_df[tweets_df.support==candidate].index
    
    sample_size = min(len(ind), cfg['sample_size'])
    ind = random.sample(list(ind), sample_size)
    sample[candidate] = tweets_df[tweets_df.index.isin(ind)]
    sample[candidate].loc[:, 'support'] = candidate
    
# sample = dict()
# for candidate in ['trump', 'biden']:
#     sample[candidate] = tweets_df[tweets_df.support==candidate]
#     sample[candidate].loc[:, 'support'] = candidate

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [61]:
users_df = sample['trump'][['user_name', 'id']].groupby(['user_name']).count()
len(users_df)

5029

In [62]:
users_df = sample['biden'][['user_name', 'id']].groupby(['user_name']).count()
len(users_df)

5416

In [63]:
sample['trump'][['user_name', 'user_location', 'user_created', 'text', 'hashtags', 'support']].head(20)

Unnamed: 0,user_name,user_location,user_created,text,hashtags,support
8,Marios Gooner,london england,2009-12-15 09:23:17,Muslim in Trumps America on ITV. What's with the anti Trump propoganda? What are they scared of hey? #Trump #Trump2020Landslide #MAGA2020,"['Trump', 'Trump2020Landslide', 'MAGA2020']",trump
83,Clark,,2017-04-04 20:16:18,#TRUMP2020ToSaveAmerica #Trump #Trump2020\n\nJoe Biden is Evil.\n\nhttps://t.co/hsVQXIfbaG,"['TRUMP2020ToSaveAmerica', 'Trump', 'Trump2020']",trump
93,kstev99,atwood il,2016-08-23 17:35:24,"Liberals HATE America. It is a FACT. Don't let them take this country down! You may not agree with everything Trump says, but he LOVES America! VOTE !!\n#WalkAway #DemsAreDestroyingAmerica #BidenHarris2020 #Trump2020Landslide #VoteRedToSaveAmerica #Corruption #BLEXIT #Trump","['WalkAway', 'DemsAreDestroyingAmerica', 'BidenHarris2020', 'Trump2020Landslide', 'VoteRedToSaveAmerica', 'Corruption', 'BLEXIT', 'Trump']",trump
104,RoqnRobn,,2013-04-24 00:41:56,"""I'm on my Trump $hit today""\n- Kanye West\nWatch video below ⬇\n#Biden #JoeBiden\n#JOEBIDEN2020\n#KamalaHarris\n#ElectJustice #ElectionDay\n#Elections2020\n#Trump2020\n#Trump2020LandslideVictory\n#TrumpLandslide2020\n#VoteTrump2020\n\nhttps://t.co/dR9Z1Kgtfj","['Biden', 'JoeBiden', 'JOEBIDEN2020', 'KamalaHarris', 'ElectJustice', 'ElectionDay', 'Elections2020', 'Trump2020', 'Trump2020LandslideVictory', 'TrumpLandslide2020', 'VoteTrump2020']",trump
132,ً,cowboy hat face,2018-07-24 06:56:44,IF YOU SUPPORT TRUMP VOTE “ALEXIS” #TRUMP2020 #Trump2020Landslide #TrumpTraintexas #TrumpPence2020 #trump https://t.co/aHKZieLEJL,"['TRUMP2020', 'Trump2020Landslide', 'TrumpTraintexas', 'TrumpPence2020', 'trump']",trump
163,Broad Street Sports,philadelphia pa,2017-01-17 17:05:56,"Great #TrumpParade through Delco PA today, such an honor to be a part of it. Wow is all I can say!!! It was amazing!!!! #Trump2020Landslide #VOTE #VoteRepublican #KeepAmericaGreat #Trump","['TrumpParade', 'Trump2020Landslide', 'VOTE', 'VoteRepublican', 'KeepAmericaGreat', 'Trump']",trump
187,Rebecca Axsom,,2012-04-12 18:26:27,"I voted for #Trump in 2016, and will unashamedly vote #Trump again on #ElectionDay ! Because I want the greatest America possible for my kids and the entire next generation! #TRUMP2020ToSaveAmerica #Trump2020 #GREATESTPRESIDENTEVER45 #MAGA @realDonaldTrump @POTUS https://t.co/q7vVYZgeVE","['Trump', 'Trump', 'ElectionDay', 'TRUMP2020ToSaveAmerica', 'Trump2020', 'GREATESTPRESIDENTEVER45', 'MAGA']",trump
231,Belaaz Fan,,2019-12-08 07:21:07,Earlier this afternoon Governor Mario M. Cuomo Bridge in New York at a complete standstill For the trump car parade.\n\n#Trump #Trump2020 #Trump2020Landslide https://t.co/7JuTrO3J68,"['Trump', 'Trump2020', 'Trump2020Landslide']",trump
269,Belaaz Fan,,2019-12-08 07:21:07,Massive #JewsForTrump parade takes over the Tappan Zee Bridge.\n\n#Trump #Trump2020 #Trump2020Landslide https://t.co/UhHUaoW4iM,"['JewsForTrump', 'Trump', 'Trump2020', 'Trump2020Landslide']",trump
271,⁴MR. RUPERT MARCELLE HUCKS📖Iam I am I am📲&Ūr²;✈),port vue pa,2012-11-30 05:06:40,Happening now.\n@realDonaldTrump\n#Trump\nhttps://t.co/SArwj1IRjq\n#Hickory #NorthCarolina #Trump2020Landslide https://t.co/y0oBYNacTQ,"['Trump', 'Hickory', 'NorthCarolina', 'Trump2020Landslide']",trump


In [64]:
sample['biden'][['user_name', 'user_location', 'user_created', 'text', 'hashtags', 'support']].head(20)

Unnamed: 0,user_name,user_location,user_created,text,hashtags,support
87,archygirl,,2020-01-05 04:53:07,Thank you Lincoln Project. America Deserves Better. #LincolnProject #SenateBlueWave #TrumpHasNoPlan #Biden #BidenHarris #BlueWave #BidenHarris2020ToSaveAmerica #ObamaWasBetterAtEverything #TrumpDeathToll236K #TrumpCrimesAgainstHumanity #VoteBlue #NastyWoman https://t.co/vFNcg3meeM,"['LincolnProject', 'SenateBlueWave', 'TrumpHasNoPlan', 'Biden', 'BidenHarris', 'BlueWave', 'BidenHarris2020ToSaveAmerica', 'ObamaWasBetterAtEverything', 'TrumpDeathToll236K', 'TrumpCrimesAgainstHumanity', 'VoteBlue', 'NastyWoman']",biden
133,archygirl,,2020-01-05 04:53:07,Trump is a monster. #SenateBlueWave #TrumpHasNoPlan #Biden #BidenHarris #BlueWave #BidenHarris2020ToSaveAmerica #ObamaWasBetterAtEverything #TrumpDeathToll236K #TrumpCrimesAgainstHumanity #VoteBlue #NastyWoman https://t.co/YOTLtUZ5cP,"['SenateBlueWave', 'TrumpHasNoPlan', 'Biden', 'BidenHarris', 'BlueWave', 'BidenHarris2020ToSaveAmerica', 'ObamaWasBetterAtEverything', 'TrumpDeathToll236K', 'TrumpCrimesAgainstHumanity', 'VoteBlue', 'NastyWoman']",biden
239,gillian simpson,,2013-12-13 14:47:56,#Biden leads in national opinion polls though the race is seen as close in enough battleground states that #Trump could achieve the 270 votes needed to win in the #ElectoralCollege #vote #voteblue,"['Biden', 'Trump', 'ElectoralCollege', 'vote', 'voteblue']",biden
287,Gary McMurray,planet earth,2016-07-17 08:25:05,"@realDonaldTrump For decades, #Trump &amp; #TrumpCrimeFamily have ripped off #Trump2020 &amp; #TrumpTrain supporters, selling out the #USA to our enemies, funneling $ to his FAILING @Trump resorts, cheating on taxes so the rest of us pay more, pardoning crooks who work for him.\nhttps://t.co/FOqDKZPaG9","['Trump', 'TrumpCrimeFamily', 'Trump2020', 'TrumpTrain', 'USA']",biden
544,#Biden🗣️NoMalarkey,california usa,2016-05-28 05:44:54,"Donald #Trump trashes COVID-19 doctors, compares himself to Jesus\n\n#healthcare\n#healthcareworkers\n#VoteHimOut\n#coronavirus\n#AmericaOrTrump\n#RepublicansAgainstTrump\n\nhttps://t.co/s2Jgq5dyxJ","['Trump', 'healthcare', 'healthcareworkers', 'VoteHimOut', 'coronavirus', 'AmericaOrTrump', 'RepublicansAgainstTrump']",biden
562,The Ubiquitous J-Man,toronto canada,2009-05-24 10:52:00,@realDonaldTrump Tick Tock Donnie...\n🙄\n\n#USPoli \n#Trump \n#TrumpIsANationalDisgrace \n#TrumpIsUnwell #TrumpIsLosing \n#TrumpIsALoser \n#TrumpIsPathetic \n#EnoughIsEnough https://t.co/un2UVHRKrI,"['USPoli', 'Trump', 'TrumpIsANationalDisgrace', 'TrumpIsUnwell', 'TrumpIsLosing', 'TrumpIsALoser', 'TrumpIsPathetic', 'EnoughIsEnough']",biden
575,David Harpin,new haven connecticut,2013-06-27 09:19:02,How Will I Ever Look at America the Same Way Again? #Election2020 #TrumpMeltdown #Trump #nihilism #AmericaOrTrump #AmericasGreatestMistake https://t.co/z48qp07TRk,"['Election2020', 'TrumpMeltdown', 'Trump', 'nihilism', 'AmericaOrTrump', 'AmericasGreatestMistake']",biden
661,Rocky Vendetti,,2020-09-15 18:37:41,@DanCrenshawTX you have some dangerous associates GOP Rep Crenshaw. Men who like to terrorize tour buses and drive through innocent protestors. A man is judged by the company he keeps. Your turn is coming. \n#VoteBlue \n#VoteBlueToSaveAmerica \n#Biden \n#BidenHarris2020ToSaveAmerica https://t.co/e0EAHU3kcY,"['VoteBlue', 'VoteBlueToSaveAmerica', 'Biden', 'BidenHarris2020ToSaveAmerica']",biden
686,Valerie Martin,las vegas nv bwo detroit mi,2011-01-19 02:58:29,@realDonaldTrump #DonaldTrump\n&amp;\nthe #TrumpCrimeFamily\nare\nINCITING VIOLENCE.\n\nCHEERING as maga attempt\nto\nRUN A #Biden bus\nOFF THE FUCKIN' ROAD!\n&amp;\nJOKING they were\nPROTECTING the BUS?\nTHAT'S\nNOT FUCKIN' FUNNY\n\nWHERE IS THE @FBI?\nIT'S ATTEMPTED MURDER!\n#LockThemUp\n#VoteBlue\nhttps://t.co/SPAwHjJbeh,"['DonaldTrump', 'TrumpCrimeFamily', 'Biden', 'LockThemUp', 'VoteBlue']",biden
823,Chris,england,2016-12-19 13:52:49,**Breaking US Election News**\n\n#Trump caught snogging imaginary woman at super spreader rally\n\n#TrumpMeltdown https://t.co/xNgOOae3J5,"['Trump', 'TrumpMeltdown']",biden


In [65]:
training_df = pd.DataFrame()
training_df = pd.concat([training_df, sample['trump']], axis=0)
training_df = pd.concat([training_df, sample['biden']], axis=0)
training_df[export_columns + ['support']].to_csv('training', index=False)

In [66]:
len(training_df)

19223

#### Getting validation samples by using terms in user_name/description

In [67]:
for_trump = ['trump 88022', 'trump2020', 'trump 2020', 'donald trump', 'maga kag', 'trump maga', 
             'donald trump', 'text trump', 'president trump']

for_biden = ['bidenharris2020', 'dump trump', 'biden', 'vote blue', 'biden harris', 'trump bot', 
             'vote biden', 'trumpery resistance', 'joe biden', 'blue wave', 'harris 2020', '2020 blue', 
             'votebluetosaveamerica', 'vote blue', 'biden2020', 'anti trump', 'trump freakouts', 
             'water_wave water_wave']

In [68]:
regex = re.compile('\W')
url_regex = re.compile(r'(www|http)\S+')
        
def tokenize(text):
    if isinstance(text, str):
        text = url_regex.sub('', text) #remove link 
        text = emoji.demojize(text)
        return ' '.join([token for token in regex.split(text) if len(token)>0])
    else:
        return ''

In [69]:
%%time
pandarallel.initialize()
tweets_df['user_name_tokens'] = tweets_df['user_name'].parallel_apply(lambda x: tokenize(x))
tweets_df['user_description_tokens'] = tweets_df['user_description'].parallel_apply(lambda x: tokenize(x))

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 650 ms, sys: 1.44 s, total: 2.09 s
Wall time: 1min 37s


In [70]:
%%time
trump = set(for_trump)
biden = set(for_biden)

Y = []
c = 0
for name, description in zip(tweets_df.user_name_tokens.values, tweets_df.user_description_tokens.values):    
    name_description = name + ' ' + description

    biden, trump = False, False
    tmp = 'neutral'
    
    for term in for_biden:
        if name_description.find(term) >= 0:
            tmp = 'biden'
            biden = True
            break

    for term in for_trump:
        if name_description.find(term) >= 0:            
            tmp = 'trump'
            trump = True
            break
      
    if biden and trump:
        tmp = 'neutral'
        c += 1
        
    Y.append(tmp)
    
#--------------------------------#
tweets_df['support'] = Y


#--------------------------------#
print(f"All: {len(tweets_df) :,}")
print(f"Neither: {len(tweets_df[tweets_df['support']=='neutral']) :,}")
print(f"Trump: {len(tweets_df[tweets_df['support']=='trump']) :,}")
print(f"Biden: {len(tweets_df[tweets_df['support']=='biden']) :,}")
print(f"Both: {c :,}")

All: 645,253
Neither: 635,618
Trump: 774
Biden: 8,861
Both: 8
CPU times: user 5.74 s, sys: 250 ms, total: 5.99 s
Wall time: 6.02 s


In [71]:
trump_df = tweets_df[tweets_df['support']=='trump']
biden_df = tweets_df[tweets_df['support']=='biden']
biden_df = biden_df.groupby('user_name').head(1)

print(f"Trump: {len(trump_df) :,}")
print(f"Biden: {len(biden_df) :,}")

Trump: 774
Biden: 1,648


In [72]:
val_df = pd.DataFrame()
val_df = pd.concat([val_df, trump_df], axis=0)
val_df = pd.concat([val_df, biden_df], axis=0)
val_df[export_columns + ['support']].to_csv('validation', index=False)

In [73]:
biden_df[['user_name', 'user_description', 'user_location', 'user_created', 'text', 'hashtags', 'support']].head(20)

Unnamed: 0,user_name,user_description,user_location,user_created,text,hashtags,support
31,"Dr. Jimmy O’Shët, PsyD 🌊🌊🌊",I am a Republican against Trump! His brand of politics is counter to the core beliefs of America. I #Resist to restore decency to our country. 🌊🌊🌊 #GoJoe2020,,2011-02-15 18:09:45,@FalexGJ @TeamTrump @realDonaldTrump #trump would say speak English... https://t.co/L4hhtwWDcZ\n\n#TrumpIsARacist,"['trump', 'TrumpIsARacist']",biden
43,Blue Wave 🌊🌊🌊,I call out INJUSTICE where I see it 👀\nDigitalActorvist. 🔥\nI tweet A Lot because I can.\n\n#Decency #Democracy #BlueTsunami\n💙🔥🌎🌈☀🌊 🌊🌊\n#ActorVist #SAGAFTRA,united states of america,2020-03-15 23:43:08,"@realDonaldTrump is shaking in his shoes, and is terrified of @JoeBiden. He SHOULD be because #Biden is winning fair and square by doing half as much. \n#BidenHarrisToSaveAmerica https://t.co/Y5dofle1DV","['Biden', 'BidenHarrisToSaveAmerica']",biden
336,ᗷᖇIᗩᑎ ᖴᒪᗩKE🌊🌊🌊,"#LGBT, Extremely liberal, completely unfiltered. #Bots, #MAGAts, and #Trolls blocked immediately! #BidenHarris2020 \n#FBR\n#Resist \n#Resistance\n#RidinWithBiden",lincoln il,2017-11-06 23:32:31,This costume is hilarious.😂\n#BlueWave\n#BlueWave2018\n#Deplorables\n#MAGA\n#MAGAts\n#putinsbitch\n#Resist\n#TheResistance \n#Trump\n#Trumpkin\n#TrumpSupporter\n#TrumpTrain2018\n#TrumpTrainPortal\n#Vote\n#VoteBlueAndBringAFriend \n#VoteBlueNov6th\n#VoteBlueToEndThisNightmare https://t.co/ktjV2TvzVV,"['BlueWave', 'BlueWave2018', 'Deplorables', 'MAGA', 'MAGAts', 'putinsbitch', 'Resist', 'TheResistance', 'Trump', 'Trumpkin', 'TrumpSupporter', 'TrumpTrain2018', 'TrumpTrainPortal', 'Vote', 'VoteBlueAndBringAFriend', 'VoteBlueNov6th', 'VoteBlueToEndThisNightmare']",biden
447,🦄⚡️💎💙☯️ Empathetic Golgi Apparatus ☯️ 💙 💎⚡️🦄,#Resister #BidenHarris2020 🌊🌊 #FBR #BlackLivesMatter #ALLY🏳️‍🌈 #Skeptic #Humanist #Atheist #ThinkForYourself #CrohnsWarrior #MentalHealth #Education💙He/him,minnesota,2018-06-19 23:23:57,Listening to the AG of Pennsylvania makes me feel better... \nWho care WTF tRump says. The voice of the PEOPLE will be heard! #VoteBlue2020 #BidenHarris #Biden #VOTE,"['VoteBlue2020', 'BidenHarris', 'Biden', 'VOTE']",biden
471,James,Person. Woman. Man. Camera. TV🌊🌊🌊You can’t make a Tomelette without breaking some Greggs. Follow for follow back. #Resist 🌊 #Biden2020 #FBR #TheResistence,,2018-09-14 00:25:18,Businesses are bordering up! \n\nPutin is laughing at America. \n\nRemember that....and then remember how this orange fool conducted himself in Finland in 2018. 🤦‍♂️\n\nVOTE HIM OUT!\n\n#AmericaOrTrump #VoteHimOut #BidenHarris #Biden #TrumpMeltdown,"['AmericaOrTrump', 'VoteHimOut', 'BidenHarris', 'Biden', 'TrumpMeltdown']",biden
509,SoxOnTheBrain ⚾️,"Die-hard Red Sox fan and liberal Democrat, fighting for humanity in a world gone amok. Civil rights, social justice, politics, the Arts, sports, Law, LIFE. ⚖️🌊🌊",boston and cape cod,2011-06-02 04:59:28,"If u vote for #Trump: an evil,racist,homophobic, misogynistic,reckless,narcissistic,psychotic,sociopathic, lying,unethical, inept, traitorous,ignorant,arrogant, ill-mannered, fradulent blowhard, depraved rapist &amp; Fascist POS, u have SOLD your soul, and are hereby CANCELED. #Biden","['Trump', 'Biden']",biden
544,#Biden🗣️NoMalarkey,🌊https://t.co/TuEMDkhScx \n🌊Democrat #BlackLivesMatter #California #VoteBlue #FactsMatter \n🚫 conspiracy theories\n#ElectionSecurity \n#PuertoRico #BidenHarris2020✌🏾,california usa,2016-05-28 05:44:54,"Donald #Trump trashes COVID-19 doctors, compares himself to Jesus\n\n#healthcare\n#healthcareworkers\n#VoteHimOut\n#coronavirus\n#AmericaOrTrump\n#RepublicansAgainstTrump\n\nhttps://t.co/s2Jgq5dyxJ","['Trump', 'healthcare', 'healthcareworkers', 'VoteHimOut', 'coronavirus', 'AmericaOrTrump', 'RepublicansAgainstTrump']",biden
632,onlykamala_,Will always be for Kamala. #khiveforever. love politics. future politician. #kamala202x #perkinsforla #bidenharris basketball fan 🏀,,2019-03-19 22:55:29,This is what y’all have? Really. Bruh y’all desperate. #Trump https://t.co/EZNqq3Fm1E,['Trump'],biden
687,Tbone 🌊🌊,Proud member of the resistance 🌊🌊 #ImpeachedForever #resign #fucktrump. I Bleed green. Animal lover. #ByeDon2020 #votebluenomatterwho #BlackLivesMatters,eagles nation american football,2011-06-18 02:25:36,If you live in Pennsylvania and still have you're mail in ballot you can surrender it at the polls. Make sure you put it in the secret envelope. It won't count otherwise. Everyone if you're in line and the polls close you can still vote. Do not let them tell you otherwise. #biden,['biden'],biden
754,momof4 #resist 🌊🌊🌊🌊🌊#resister #wtp2020,,united states,2019-11-14 02:17:45,#trump cannot declare victory States are responsible to count votes https://t.co/ChLuolKnvo,['trump'],biden
