# Election Tweets

In [1]:
import os
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt

DATA_DIR = os.path.join(os.environ["HOME"],"Datasets","HTX_Hackathon")
TWEET_DIR = os.path.join(DATA_DIR,"Election_Tweets")

pd.set_option("display.max_columns",100)

# Read Tweets

In [2]:
tweets = pd.read_csv(os.path.join(TWEET_DIR, "hashtag_donaldtrump.csv"),
                     dtype = {"tweet_id":"str", "user_id":"str"},
                     parse_dates=["created_at","user_join_date","collected_at"],
                     lineterminator='\n')

display(tweets.shape)
display(tweets.head())

(725654, 21)

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,user_join_date,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at
0,2020-10-15 00:00:01,1.316529221557252e+18,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0.0,0.0,TweetDeck,360666534.0,El Sol Latino News,elsollatinonews,🌐 Noticias de interés para latinos de la costa...,2011-08-23 15:33:45,1860.0,"Philadelphia, PA / Miami, FL",25.77427,-80.19366,,United States of America,North America,Florida,FL,2020-10-21 00:00:00.000000000
1,2020-10-15 00:00:01,1.3165292227484303e+18,"Usa 2020, Trump contro Facebook e Twitter: cop...",26.0,9.0,Social Mediaset,331617619.0,Tgcom24,MediasetTgcom24,Profilo ufficiale di Tgcom24: tutte le notizie...,2011-07-08 13:12:20,1067661.0,,,,,,,,,2020-10-21 00:00:00.373216530
2,2020-10-15 00:00:02,1.316529228091847e+18,"#Trump: As a student I used to hear for years,...",2.0,1.0,Twitter Web App,8436472.0,snarke,snarke,"Will mock for food! Freelance writer, blogger,...",2007-08-26 05:56:11,1185.0,Portland,45.520247,-122.674195,Portland,United States of America,North America,Oregon,OR,2020-10-21 00:00:00.746433060
3,2020-10-15 00:00:02,1.316529227471237e+18,2 hours since last tweet from #Trump! Maybe he...,0.0,0.0,Trumpytweeter,8.28355589206057e+17,Trumpytweeter,trumpytweeter,"If he doesn't tweet for some time, should we b...",2017-02-05 21:32:17,32.0,,,,,,,,,2020-10-21 00:00:01.119649591
4,2020-10-15 00:00:08,1.3165292523014513e+18,You get a tie! And you get a tie! #Trump ‘s ra...,4.0,3.0,Twitter for iPhone,47413798.0,Rana Abtar - رنا أبتر,Ranaabtar,"Washington Correspondent, Lebanese-American ,c...",2009-06-15 19:05:35,5393.0,Washington DC,38.894992,-77.036558,Washington,United States of America,North America,District of Columbia,DC,2020-10-21 00:00:01.492866121


# Text Functions

In [3]:
import nltk
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

def clean_text_all(input_text):
    stemmer = nltk.stem.SnowballStemmer('english')
    # remove tags and mentions, '@'
    input_text = re.sub("@[A-Za-z0-9_]+","", input_text)
    # remove hashtags, '#'
    input_text = re.sub("#[A-Za-z0-9_]+","", input_text)
    # remove html, xml tags, URL
    input_text = re.sub(r"http\S+", "", input_text)
    input_text = re.sub(r"www.\S+", "", input_text)
    input_text = stemmer.stem(input_text)
    tokens = word_tokenize(input_text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # remove stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    cleaned_sentense = ' '.join(words)
    # the first 5 words are heading, remove them
    return cleaned_sentense

# Feature Engineering

In [4]:
tweets_df = tweets.copy()

tweets_df["TWEETS_CLEANED"] = tweets_df["tweet"].apply(clean_text_all)

display(tweets_df.shape)
display(tweets_df.head(10))

(725654, 22)

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,user_join_date,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at,TWEETS_CLEANED
0,2020-10-15 00:00:01,1.316529221557252e+18,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0.0,0.0,TweetDeck,360666534.0,El Sol Latino News,elsollatinonews,🌐 Noticias de interés para latinos de la costa...,2011-08-23 15:33:45,1860.0,"Philadelphia, PA / Miami, FL",25.77427,-80.19366,,United States of America,North America,Florida,FL,2020-10-21 00:00:00.000000000,en dice que solo se preocupa por él mismo el d...
1,2020-10-15 00:00:01,1.3165292227484303e+18,"Usa 2020, Trump contro Facebook e Twitter: cop...",26.0,9.0,Social Mediaset,331617619.0,Tgcom24,MediasetTgcom24,Profilo ufficiale di Tgcom24: tutte le notizie...,2011-07-08 13:12:20,1067661.0,,,,,,,,,2020-10-21 00:00:00.373216530,usa trump contro facebook e twitter coprono biden
2,2020-10-15 00:00:02,1.316529228091847e+18,"#Trump: As a student I used to hear for years,...",2.0,1.0,Twitter Web App,8436472.0,snarke,snarke,"Will mock for food! Freelance writer, blogger,...",2007-08-26 05:56:11,1185.0,Portland,45.520247,-122.674195,Portland,United States of America,North America,Oregon,OR,2020-10-21 00:00:00.746433060,student used hear years ten years heard china ...
3,2020-10-15 00:00:02,1.316529227471237e+18,2 hours since last tweet from #Trump! Maybe he...,0.0,0.0,Trumpytweeter,8.28355589206057e+17,Trumpytweeter,trumpytweeter,"If he doesn't tweet for some time, should we b...",2017-02-05 21:32:17,32.0,,,,,,,,,2020-10-21 00:00:01.119649591,hours since last tweet maybe busy tremendously...
4,2020-10-15 00:00:08,1.3165292523014513e+18,You get a tie! And you get a tie! #Trump ‘s ra...,4.0,3.0,Twitter for iPhone,47413798.0,Rana Abtar - رنا أبتر,Ranaabtar,"Washington Correspondent, Lebanese-American ,c...",2009-06-15 19:05:35,5393.0,Washington DC,38.894992,-77.036558,Washington,United States of America,North America,District of Columbia,DC,2020-10-21 00:00:01.492866121,get tie get tie rally
5,2020-10-15 00:00:17,1.316529291052675e+18,@CLady62 Her 15 minutes were over long time ag...,2.0,0.0,Twitter for Android,1138416104.0,Farris Flagg,FarrisFlagg,#BidenHarris2020 #JoeBiden2020 #KamalaHarrisFo...,2013-02-01 01:37:38,2363.0,"Perris,California",33.782519,-117.228648,,United States of America,North America,California,CA,2020-10-21 00:00:01.866082651,minutes long time ago omarosa never represente...
6,2020-10-15 00:00:17,1.316529289949569e+18,@richardmarx Glad u got out of the house! DICK...,0.0,0.0,Twitter for iPhone,7.674018410302095e+17,Michael Wilson,wilsonfire9,,2016-08-21 16:43:51,75.0,"Powell, TN",,,,,,,,2020-10-21 00:00:02.239299182,glad u got house dick
7,2020-10-15 00:00:18,1.3165292934979625e+18,@DeeviousDenise @realDonaldTrump @nypost There...,0.0,0.0,Twitter for iPhone,9.007610716314296e+17,Stacey Gulledge 🇺🇸 Patriot ♥️ KAG 🙏 👮‍♀️♥️,sm_gulledge,"Patriot, Wife, “Shaken not Stirred” Mom of two...",2017-08-24 16:45:49,766.0,"Ohio, USA",40.225357,-82.68814,,United States of America,North America,Ohio,OH,2020-10-21 00:00:02.612515712,wo nt many unless voting god prevails bo corru...
8,2020-10-15 00:00:20,1.3165293013329183e+18,One of the single most effective remedies to e...,0.0,0.0,Twitter Web App,540476889.0,Jamieo,jamieo33,"Don't know what I am. Can lean left and right,...",2012-03-30 00:30:54,151.0,"Pennsylvania, USA",40.969989,-77.727883,,United States of America,North America,Pennsylvania,PA,2020-10-21 00:00:02.985732243,one single effective remedies eradicate anothe...
9,2020-10-15 00:00:21,1.3165293085763092e+18,#Election2020 #Trump \n#FreedomOfSpeech https:...,0.0,0.0,Twitter Web App,1.305532976998969e+18,Johnny Quest,JohnnyQuest22,Independent\n\nWhat is your American Dream?\nT...,2020-09-14 15:45:18,8.0,,,,,,,,,2020-10-21 00:00:03.358948773,


## Filter United States

In [5]:
tweets_df["country"] = tweets_df["country"].str.upper()
tweets_df = tweets_df.loc[tweets_df["country"] == "UNITED STATES OF AMERICA"]

# capitalize state code
tweets_df["state_code"] = tweets_df["state_code"].str.upper()

display(tweets_df.shape)
display(tweets_df.head(10))

(147395, 22)

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,user_join_date,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at,TWEETS_CLEANED
0,2020-10-15 00:00:01,1.316529221557252e+18,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0.0,0.0,TweetDeck,360666534.0,El Sol Latino News,elsollatinonews,🌐 Noticias de interés para latinos de la costa...,2011-08-23 15:33:45,1860.0,"Philadelphia, PA / Miami, FL",25.77427,-80.19366,,UNITED STATES OF AMERICA,North America,Florida,FL,2020-10-21 00:00:00.000000000,en dice que solo se preocupa por él mismo el d...
2,2020-10-15 00:00:02,1.316529228091847e+18,"#Trump: As a student I used to hear for years,...",2.0,1.0,Twitter Web App,8436472.0,snarke,snarke,"Will mock for food! Freelance writer, blogger,...",2007-08-26 05:56:11,1185.0,Portland,45.520247,-122.674195,Portland,UNITED STATES OF AMERICA,North America,Oregon,OR,2020-10-21 00:00:00.746433060,student used hear years ten years heard china ...
4,2020-10-15 00:00:08,1.3165292523014513e+18,You get a tie! And you get a tie! #Trump ‘s ra...,4.0,3.0,Twitter for iPhone,47413798.0,Rana Abtar - رنا أبتر,Ranaabtar,"Washington Correspondent, Lebanese-American ,c...",2009-06-15 19:05:35,5393.0,Washington DC,38.894992,-77.036558,Washington,UNITED STATES OF AMERICA,North America,District of Columbia,DC,2020-10-21 00:00:01.492866121,get tie get tie rally
5,2020-10-15 00:00:17,1.316529291052675e+18,@CLady62 Her 15 minutes were over long time ag...,2.0,0.0,Twitter for Android,1138416104.0,Farris Flagg,FarrisFlagg,#BidenHarris2020 #JoeBiden2020 #KamalaHarrisFo...,2013-02-01 01:37:38,2363.0,"Perris,California",33.782519,-117.228648,,UNITED STATES OF AMERICA,North America,California,CA,2020-10-21 00:00:01.866082651,minutes long time ago omarosa never represente...
7,2020-10-15 00:00:18,1.3165292934979625e+18,@DeeviousDenise @realDonaldTrump @nypost There...,0.0,0.0,Twitter for iPhone,9.007610716314296e+17,Stacey Gulledge 🇺🇸 Patriot ♥️ KAG 🙏 👮‍♀️♥️,sm_gulledge,"Patriot, Wife, “Shaken not Stirred” Mom of two...",2017-08-24 16:45:49,766.0,"Ohio, USA",40.225357,-82.68814,,UNITED STATES OF AMERICA,North America,Ohio,OH,2020-10-21 00:00:02.612515712,wo nt many unless voting god prevails bo corru...
8,2020-10-15 00:00:20,1.3165293013329183e+18,One of the single most effective remedies to e...,0.0,0.0,Twitter Web App,540476889.0,Jamieo,jamieo33,"Don't know what I am. Can lean left and right,...",2012-03-30 00:30:54,151.0,"Pennsylvania, USA",40.969989,-77.727883,,UNITED STATES OF AMERICA,North America,Pennsylvania,PA,2020-10-21 00:00:02.985732243,one single effective remedies eradicate anothe...
11,2020-10-15 00:00:25,1.3165293244182405e+18,"In 2020, #NYPost is being #censorship #CENSORE...",0.0,0.0,Twitter for iPhone,19940334.0,Change Illinois | Biden will increase taxes by...,changeillinois,"Illinois, home of Lincoln and Reagan, used to ...",2009-02-02 23:08:28,1396.0,"Chicago, Illinois",41.875562,-87.624421,Chicago,UNITED STATES OF AMERICA,North America,Illinois,IL,2020-10-21 00:00:04.105381834,twitter manipulate us election favor ccp porn ...
12,2020-10-15 00:00:26,1.3165293286084813e+18,#Trump #PresidentTrump #Trump2020LandslideVict...,3.0,5.0,Twitter for Android,1.2433153463979663e+18,Ron Burgundy,Anchorman_USA,"I'm kind of a Big Deal, People know me! I driv...",2020-03-26 23:14:28,496.0,"San Diego, CA",32.717421,-117.162771,San Diego,UNITED STATES OF AMERICA,North America,California,CA,2020-10-21 00:00:04.478598364,
19,2020-10-15 00:01:08,1.3165295062790185e+18,"@cnnbrk #Trump owes #RicardoAguirre $730,000 t...",3.0,2.0,Twitter for iPhone,194650429.0,MoClarker,MoClarker,Media Maven/Scientist/Fan O Fauci,2010-09-24 17:15:43,101.0,Santa Monica Beach,47.005211,-88.96291,,UNITED STATES OF AMERICA,North America,Michigan,MI,2020-10-21 00:00:07.091114077,owes pay mass murder family
22,2020-10-15 00:01:14,1.3165295298971853e+18,"#Trump: Nobody likes to tell you this, but som...",1.0,1.0,Twitter Web App,8436472.0,snarke,snarke,"Will mock for food! Freelance writer, blogger,...",2007-08-26 05:56:11,1185.0,Portland,45.520247,-122.674195,Portland,UNITED STATES OF AMERICA,North America,Oregon,OR,2020-10-21 00:00:08.210763668,nobody likes tell farmers better way working a...


## Select Columns

In [6]:
SEL_COLS = ["created_at","source","tweet_id","TWEETS_CLEANED","user_followers_count",
            "state_code","likes","retweet_count"]

tweets_df = tweets_df[SEL_COLS]

display(tweets_df.shape)
display(tweets_df.head(10))

(147395, 8)

Unnamed: 0,created_at,source,tweet_id,TWEETS_CLEANED,user_followers_count,state_code,likes,retweet_count
0,2020-10-15 00:00:01,TweetDeck,1.316529221557252e+18,en dice que solo se preocupa por él mismo el d...,1860.0,FL,0.0,0.0
2,2020-10-15 00:00:02,Twitter Web App,1.316529228091847e+18,student used hear years ten years heard china ...,1185.0,OR,2.0,1.0
4,2020-10-15 00:00:08,Twitter for iPhone,1.3165292523014513e+18,get tie get tie rally,5393.0,DC,4.0,3.0
5,2020-10-15 00:00:17,Twitter for Android,1.316529291052675e+18,minutes long time ago omarosa never represente...,2363.0,CA,2.0,0.0
7,2020-10-15 00:00:18,Twitter for iPhone,1.3165292934979625e+18,wo nt many unless voting god prevails bo corru...,766.0,OH,0.0,0.0
8,2020-10-15 00:00:20,Twitter Web App,1.3165293013329183e+18,one single effective remedies eradicate anothe...,151.0,PA,0.0,0.0
11,2020-10-15 00:00:25,Twitter for iPhone,1.3165293244182405e+18,twitter manipulate us election favor ccp porn ...,1396.0,IL,0.0,0.0
12,2020-10-15 00:00:26,Twitter for Android,1.3165293286084813e+18,,496.0,CA,3.0,5.0
19,2020-10-15 00:01:08,Twitter for iPhone,1.3165295062790185e+18,owes pay mass murder family,101.0,MI,3.0,2.0
22,2020-10-15 00:01:14,Twitter Web App,1.3165295298971853e+18,nobody likes tell farmers better way working a...,1185.0,OR,1.0,1.0


Filter Tweets of more than 10 characters

In [7]:
tweets_df = tweets_df.loc[tweets_df["TWEETS_CLEANED"].str.len() > 10]
tweets_df = tweets_df.loc[tweets_df["likes"] ><10000]

display(tweets_df.shape)
display(tweets_df.head(10))

SyntaxError: invalid syntax (3539895494.py, line 2)

# Analysis of Likes

In [None]:
tweets_df["likes"].describe()

## Relabel LIKES as binary

In [None]:
def label_likes_cat(input_val):
    if input_val == 0.0:
        return "NONE"
    elif (input_val > 0.0) & (input_val <= 5):
        return "LOW"
    elif (input_val > 5):
        return "HIGH"
    else:
        return "NONE"

In [None]:
tweets_df["LIKES_CAT"] = tweets_df["likes"].apply(label_likes_cat)

display(tweets_df["LIKES_CAT"].value_counts())
display(tweets_df.shape)
display(tweets_df.head(10))

# Write CSV

In [None]:
tweets_df.to_csv(os.path.join(TWEET_DIR, "Election_Tweets_LikesCat3.csv"), index = False)

HISTOGRAM

In [None]:
plt.hist(x = np.array(tweets_df["likes"]), bins = 50)

In [None]:
tweets_df["likes"]