---
## CSE 482 - Tweet Loading and Processing
### Jack Nugent
Load Tweets related to Elon Musk and prepare the text for sentiment analysis.

---

In [1]:
import tweepy
from tweepy import OAuthHandler
from tweepy import API
import json
import pandas as pd
import re
import numpy as np

In [2]:
#!pip install emoji
import emoji

---
### Loading Tweets

In [3]:
# Accumulator for the parsed tweet objects. The loading loop appends to this
# list, so re-run this cell before re-loading or every tweet is added twice.
data_json = []

In [4]:
# Input dumps; every line of each file is parsed as one JSON document.
files = ['output_geotag_day1.json', 'output_geotag_day2.json',
         'output_geotag_day3.json', 'output_geotag_day4.json']

# Count discarded lines so data loss is visible instead of silent.
skipped_lines = 0

for file in files:
    # `with` closes the handle even if reading fails part-way through.
    with open(file) as f:
        for line in f:
            try:
                data_json.append(json.loads(line))
            except json.JSONDecodeError:
                # Discard lines that are not valid JSON (blank lines,
                # truncated records, invalid characters). Catching only the
                # parse error lets real problems such as KeyboardInterrupt
                # or MemoryError propagate.
                skipped_lines += 1

print("Skipped lines:", skipped_lines)

In [5]:
# Collect the rows in a plain list and build the frame once; appending with
# df.loc[len(df)] re-allocates the frame for every tweet (quadratic cost).
rows = []
for tweet in data_json:
    text = tweet['data']['text']
    # Handles Retweets that somehow slip through filter
    if 'RT @' in text:
        continue
    # 'created_at' is split on the 'T' separator into date and time parts.
    date_part, time_part = tweet['data']['created_at'].split('T')
    rows.append([
        text,
        tweet['includes']['places'][0]['full_name'],
        tweet['data']['id'],
        date_part,
        # Drop the last five characters of the time part
        # (presumably the '.000Z' suffix - TODO confirm against the raw data).
        time_part[:-5],
    ])

df = pd.DataFrame(rows, columns=['Text', 'Location', 'Tweet_id', 'Date', 'Time'])

print("Number of Tweets:",len(df))
df

Number of Tweets: 2008


Unnamed: 0,Text,Location,Tweet_id,Date,Time
0,.@EconUS/@YouGovAmerica:\n\nDo you approve or ...,"Bay Harbor Islands, FL",1595482958764425218,2022-11-23,18:22:34
1,Twitter and Elon has the same vibes when Trump...,"New York, USA",1595483029048369152,2022-11-23,18:22:51
2,@RonFilipkowski Then why do I see him so frequ...,"West Virginia, USA",1595483099735154688,2022-11-23,18:23:08
3,@timothy_wall @elonmusk This is hysterical. Th...,"Port Orange, FL",1595483141879353344,2022-11-23,18:23:18
4,Elon Musk can show us the way but it’s we who ...,"Brooklyn, NY",1595483162289016832,2022-11-23,18:23:23
...,...,...,...,...,...
2003,@muskQu0tes @elonmusk There you are. I sent yo...,"Chicago, IL",1596571988877021185,2022-11-26,18:29:59
2004,@StephenKing That's the way to make money in t...,"Wellington, FL",1596572018237140992,2022-11-26,18:30:06
2005,@ABC Knock knock 👊🏻\nCALIFORNIA LABOR DEPT\n\n...,"Canovanillas, Puerto Rico",1596572065372389377,2022-11-26,18:30:17
2006,@kylegriffin1 And gained integrity,"Smithtown, NY",1596572102022385664,2022-11-26,18:30:26


---
### Processing Text


#### Cleaning Text

In [6]:
%%time
df['Words'] = df['Text'].str.split()
df['Hashtags'] = None
for x in range(len(df)):
    # Handle '@' Mentions
    df['Words'][x] = [i for i in df['Words'][x] if not re.compile('@').match(i)]
    
    # Handle links
    df['Words'][x] = [i for i in df['Words'][x] if not re.compile('http').match(i)]
    
    # Hashtags
    df['Hashtags'][x] = [word for word in df['Words'][x] if '#' in word]
    df['Words'][x] = [word for word in df['Words'][x] if word not in df['Hashtags'][x]]
    
    # Handle Emojis and Punctuation
    words = []
    for word in df.iloc[x]['Words']:
        for c in word:
            if c in emoji.EMOJI_DATA or c in '''!()-[]{};:'"\,<>./?@#$%^&*_~''':
                word = word.replace(c, '')
        # Lowercase Words
        words.append(word.lower())
    df['Words'][x] = words
    
    # Remove Empty Values
    for word in df['Words'][x]:
        if word == '':
            df['Words'][x].remove(word)
            

CPU times: user 2 s, sys: 29.3 ms, total: 2.03 s
Wall time: 2.02 s


---
### NLTK Cleaning

In [7]:
import nltk
# WordNet data is what WordNetLemmatizer looks words up in.
nltk.download("wordnet")
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Stopword lists read by stopwords.words('english').
nltk.download("stopwords")
# Tokenizer models for word_tokenize / sent_tokenize (imported below).
nltk.download('punkt')
# Tagger model used by nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package wordnet to /home/nugentj3/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nugentj3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/nugentj3/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/nugentj3/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


---
#### Stopword Removal

In [8]:
# Stored as a set so membership checks are constant-time on average.
stop_words = set(stopwords.words('english'))

In [9]:
# Stopword Removal
# Rebuild the column in one assignment. Chained indexing (df['Words'][x] = ...)
# may write to a temporary copy and does not update df under Copy-on-Write.
df['Words'] = df['Words'].apply(
    lambda words: [word for word in words if word not in stop_words]
)

---
#### Stemming

In [10]:
# Build the stemmer once and reuse it; the original constructed a new
# SnowballStemmer for every single word.
stemmer = SnowballStemmer(language='english')

# Whole-column assignment avoids chained indexing (df['Stemmed'][x] = ...),
# which does not reliably write back to df.
df['Stemmed'] = df['Words'].apply(
    lambda words: [stemmer.stem(word) for word in words]
)

---
#### Lemmatization

In [11]:
# Build the lemmatizer once and reuse it; the original constructed a new
# WordNetLemmatizer for every single word.
lemmatizer = WordNetLemmatizer()

# Whole-column assignment avoids chained indexing (df['Lemmed'][x] = ...),
# which does not reliably write back to df.
df['Lemmed'] = df['Words'].apply(
    lambda words: [lemmatizer.lemmatize(word) for word in words]
)

---
#### Part of Speech Tagging

In [12]:
#POS Tagging
# Tag each tweet's lemmatized token list. Whole-column assignment avoids
# chained indexing (df['POS_Tags'][x] = ...), which does not reliably write
# back to df.
df['POS_Tags'] = df['Lemmed'].apply(lambda lemmas: nltk.pos_tag(lemmas))
df

Unnamed: 0,Text,Location,Tweet_id,Date,Time,Words,Hashtags,Stemmed,Lemmed,POS_Tags
0,.@EconUS/@YouGovAmerica:\n\nDo you approve or ...,"Bay Harbor Islands, FL",1595482958764425218,2022-11-23,18:22:34,"[econusyougovamerica, approve, disapprove, elo...",[],"[econusyougovamerica, approv, disapprov, elon,...","[econusyougovamerica, approve, disapprove, elo...","[(econusyougovamerica, RB), (approve, VB), (di..."
1,Twitter and Elon has the same vibes when Trump...,"New York, USA",1595483029048369152,2022-11-23,18:22:51,"[twitter, elon, vibes, trump, got, elected, la...",[],"[twitter, elon, vibe, trump, got, elect, laugh...","[twitter, elon, vibe, trump, got, elected, lau...","[(twitter, NN), (elon, NN), (vibe, NN), (trump..."
2,@RonFilipkowski Then why do I see him so frequ...,"West Virginia, USA",1595483099735154688,2022-11-23,18:23:08,"[see, frequently, twitter, stop, posting]",[],"[see, frequent, twitter, stop, post]","[see, frequently, twitter, stop, posting]","[(see, VB), (frequently, RB), (twitter, JJR), ..."
3,@timothy_wall @elonmusk This is hysterical. Th...,"Port Orange, FL",1595483141879353344,2022-11-23,18:23:18,"[hysterical, thanks, laugh, prep, thanksgiving...",[],"[hyster, thank, laugh, prep, thanksgiv, need]","[hysterical, thanks, laugh, prep, thanksgiving...","[(hysterical, JJ), (thanks, NNS), (laugh, IN),..."
4,Elon Musk can show us the way but it’s we who ...,"Brooklyn, NY",1595483162289016832,2022-11-23,18:23:23,"[elon, musk, show, us, way, it’s, hard, work, ...",[],"[elon, musk, show, us, way, it, hard, work, ma...","[elon, musk, show, u, way, it’s, hard, work, m...","[(elon, NN), (musk, NN), (show, NN), (u, JJ), ..."
...,...,...,...,...,...,...,...,...,...,...
2003,@muskQu0tes @elonmusk There you are. I sent yo...,"Chicago, IL",1596571988877021185,2022-11-26,18:29:59,"[sent, thoughts, cyborg, video, project, send,...",[],"[sent, thought, cyborg, video, project, send, ...","[sent, thought, cyborg, video, project, send, ...","[(sent, NN), (thought, VBD), (cyborg, NNS), (v..."
2004,@StephenKing That's the way to make money in t...,"Wellington, FL",1596572018237140992,2022-11-26,18:30:06,"[thats, way, make, money, vc, world, unfortuna...",[],"[that, way, make, money, vc, world, unfortun, ...","[thats, way, make, money, vc, world, unfortuna...","[(thats, NNS), (way, NN), (make, VBP), (money,..."
2005,@ABC Knock knock 👊🏻\nCALIFORNIA LABOR DEPT\n\n...,"Canovanillas, Puerto Rico",1596572065372389377,2022-11-26,18:30:17,"[knock, knock, california, labor, dept]",[],"[knock, knock, california, labor, dept]","[knock, knock, california, labor, dept]","[(knock, NN), (knock, NN), (california, NN), (..."
2006,@kylegriffin1 And gained integrity,"Smithtown, NY",1596572102022385664,2022-11-26,18:30:26,"[gained, integrity]",[],"[gain, integr]","[gained, integrity]","[(gained, VBN), (integrity, NN)]"


---
### Positive/Negative Word Counts

In [13]:
# Load the positive-word lexicon: one entry per line of the file.
pos_words = []
# `with` closes the file even if reading fails part-way through.
with open('positive-words.txt') as f:
    for line in f:
        entry = line.strip()
        # Skip blank lines, and lines starting with ';' (presumably comment
        # headers of the opinion-lexicon file format - verify against the file).
        if entry and not entry.startswith(';'):
            pos_words.append(entry)

len(pos_words)

2006

In [14]:
# Load the negative-word lexicon: one entry per line of the file.
neg_words = []
# `with` closes the file even if reading fails part-way through.
with open('negative-words.txt', encoding='ISO-8859-1') as f:
    for line in f:
        entry = line.strip()
        # Skip blank lines, and lines starting with ';' (presumably comment
        # headers of the opinion-lexicon file format - verify against the file).
        if entry and not entry.startswith(';'):
            neg_words.append(entry)

len(neg_words)

4783

In [15]:
# Set copies make each lookup constant-time on average; pos_words and
# neg_words themselves are left as lists.
pos_lookup = set(pos_words)
neg_lookup = set(neg_words)

# Whole-column assignments replace chained indexing (df['Count'][x] = ...)
# and produce integer columns instead of None-initialised object columns.
# Count is taken from 'Words'; the sentiment hits are taken from 'Lemmed'.
df['Count'] = df['Words'].apply(len)
df['Pos_count'] = df['Lemmed'].apply(
    lambda lemmas: sum(word in pos_lookup for word in lemmas)
)
df['Neg_count'] = df['Lemmed'].apply(
    lambda lemmas: sum(word in neg_lookup for word in lemmas)
)
    

In [16]:
# Remove rows with no words.
df_fix = df.copy()
for idx, row in df[df['Count'] == 0].iterrows():
    df_fix = df_fix.drop(idx)

In [17]:
# Share of each tweet's words that matched the positive / negative lexicon.
for ratio_col, hits_col in (('Positive_ratio', 'Pos_count'),
                            ('Negative_ratio', 'Neg_count')):
    df_fix[ratio_col] = df_fix[hits_col].div(df_fix['Count'])

In [18]:
# Show the filtered frame together with the two ratio columns.
df_fix

Unnamed: 0,Text,Location,Tweet_id,Date,Time,Words,Hashtags,Stemmed,Lemmed,POS_Tags,Count,Pos_count,Neg_count,Positive_ratio,Negative_ratio
0,.@EconUS/@YouGovAmerica:\n\nDo you approve or ...,"Bay Harbor Islands, FL",1595482958764425218,2022-11-23,18:22:34,"[econusyougovamerica, approve, disapprove, elo...",[],"[econusyougovamerica, approv, disapprov, elon,...","[econusyougovamerica, approve, disapprove, elo...","[(econusyougovamerica, RB), (approve, VB), (di...",37,2,1,0.054054,0.027027
1,Twitter and Elon has the same vibes when Trump...,"New York, USA",1595483029048369152,2022-11-23,18:22:51,"[twitter, elon, vibes, trump, got, elected, la...",[],"[twitter, elon, vibe, trump, got, elect, laugh...","[twitter, elon, vibe, trump, got, elected, lau...","[(twitter, NN), (elon, NN), (vibe, NN), (trump...",19,1,1,0.052632,0.052632
2,@RonFilipkowski Then why do I see him so frequ...,"West Virginia, USA",1595483099735154688,2022-11-23,18:23:08,"[see, frequently, twitter, stop, posting]",[],"[see, frequent, twitter, stop, post]","[see, frequently, twitter, stop, posting]","[(see, VB), (frequently, RB), (twitter, JJR), ...",5,0,0,0.0,0.0
3,@timothy_wall @elonmusk This is hysterical. Th...,"Port Orange, FL",1595483141879353344,2022-11-23,18:23:18,"[hysterical, thanks, laugh, prep, thanksgiving...",[],"[hyster, thank, laugh, prep, thanksgiv, need]","[hysterical, thanks, laugh, prep, thanksgiving...","[(hysterical, JJ), (thanks, NNS), (laugh, IN),...",6,0,1,0.0,0.166667
4,Elon Musk can show us the way but it’s we who ...,"Brooklyn, NY",1595483162289016832,2022-11-23,18:23:23,"[elon, musk, show, us, way, it’s, hard, work, ...",[],"[elon, musk, show, us, way, it, hard, work, ma...","[elon, musk, show, u, way, it’s, hard, work, m...","[(elon, NN), (musk, NN), (show, NN), (u, JJ), ...",10,1,1,0.1,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2003,@muskQu0tes @elonmusk There you are. I sent yo...,"Chicago, IL",1596571988877021185,2022-11-26,18:29:59,"[sent, thoughts, cyborg, video, project, send,...",[],"[sent, thought, cyborg, video, project, send, ...","[sent, thought, cyborg, video, project, send, ...","[(sent, NN), (thought, VBD), (cyborg, NNS), (v...",13,1,0,0.076923,0.0
2004,@StephenKing That's the way to make money in t...,"Wellington, FL",1596572018237140992,2022-11-26,18:30:06,"[thats, way, make, money, vc, world, unfortuna...",[],"[that, way, make, money, vc, world, unfortun, ...","[thats, way, make, money, vc, world, unfortuna...","[(thats, NNS), (way, NN), (make, VBP), (money,...",13,0,1,0.0,0.076923
2005,@ABC Knock knock 👊🏻\nCALIFORNIA LABOR DEPT\n\n...,"Canovanillas, Puerto Rico",1596572065372389377,2022-11-26,18:30:17,"[knock, knock, california, labor, dept]",[],"[knock, knock, california, labor, dept]","[knock, knock, california, labor, dept]","[(knock, NN), (knock, NN), (california, NN), (...",5,0,2,0.0,0.4
2006,@kylegriffin1 And gained integrity,"Smithtown, NY",1596572102022385664,2022-11-26,18:30:26,"[gained, integrity]",[],"[gain, integr]","[gained, integrity]","[(gained, VBN), (integrity, NN)]",2,1,0,0.5,0.0


---
#### Sort By Positive Ratio

In [19]:
# The 50 tweets with the highest positive ratio. Sorting descending directly
# keeps any NaN ratios at the end; reversing an ascending sort with [::-1]
# would move them to the top.
df_fix.sort_values('Positive_ratio', ascending=False).head(50)

Unnamed: 0,Text,Location,Tweet_id,Date,Time,Words,Hashtags,Stemmed,Lemmed,POS_Tags,Count,Pos_count,Neg_count,Positive_ratio,Negative_ratio
557,@bennyjohnson Outstanding!,"Greater Northdale, FL",1595835514615386112,2022-11-24,17:43:30,[outstanding],[],[outstand],[outstanding],"[(outstanding, JJ)]",1,1,0,1.0,0.0
1410,@Sidneerymes @washingtonpost #Neverdetrumpus! ...,"Greenville, SC",1596539090991689730,2022-11-26,16:19:16,[love],[#Neverdetrumpus!],[love],[love],"[(love, NN)]",1,1,0,1.0,0.0
1238,I love it #leftistlunacy #triggered https://t....,"Colorado Springs, CO",1596527230574002176,2022-11-26,15:32:08,[love],"[#leftistlunacy, #triggered]",[love],[love],"[(love, NN)]",1,1,0,1.0,0.0
242,@elonmusk Am I doing it right? https://t.co/zZ...,"Austin, TX",1595505528238931968,2022-11-23,19:52:15,[right],[],[right],[right],"[(right, NN)]",1,1,0,1.0,0.0
367,@Davidlaz Interesting! 🤔🙂,"Los Angeles, CA",1595817277047574529,2022-11-24,16:31:02,[interesting],[],[interest],[interesting],"[(interesting, VBG)]",1,1,0,1.0,0.0
1727,@onthebus17 @chrislhayes Thank you!,"Pennsylvania, USA",1596556403292114944,2022-11-26,17:28:03,[thank],[],[thank],[thank],"[(thank, NN)]",1,1,0,1.0,0.0
184,@realTuckFrumper Good,"South Lebanon, OH",1595500171353522176,2022-11-23,19:30:58,[good],[],[good],[good],"[(good, JJ)]",1,1,0,1.0,0.0
978,Great work. https://t.co/YVOJjBQ7Bv,"Pennsylvania, USA",1596171365853208577,2022-11-25,15:58:03,"[great, work]",[],"[great, work]","[great, work]","[(great, JJ), (work, NN)]",2,2,0,1.0,0.0
1092,@GarysBlues Good,"West Springfield, MA",1596181715642646530,2022-11-25,16:39:11,[good],[],[good],[good],"[(good, JJ)]",1,1,0,1.0,0.0
1045,Good 🧵 https://t.co/TMQAwgFxHu,"Arlington, VA",1596177460789276672,2022-11-25,16:22:16,[good],[],[good],[good],"[(good, JJ)]",1,1,0,1.0,0.0


---
#### One Word Responses

In [20]:
# Number of tweets that were reduced to exactly one word.
int((df_fix['Count'] == 1).sum())

140

In [30]:
# Tweets that still contain more than one word after cleaning.
has_multiple_words = df_fix['Count'] > 1
df_fix.loc[has_multiple_words]

Unnamed: 0,Text,Location,Tweet_id,Date,Time,Words,Hashtags,Stemmed,Lemmed,POS_Tags,Count,Pos_count,Neg_count,Positive_ratio,Negative_ratio
0,.@EconUS/@YouGovAmerica:\n\nDo you approve or ...,"Bay Harbor Islands, FL",1595482958764425218,2022-11-23,18:22:34,"[econusyougovamerica, approve, disapprove, elo...",[],"[econusyougovamerica, approv, disapprov, elon,...","[econusyougovamerica, approve, disapprove, elo...","[(econusyougovamerica, RB), (approve, VB), (di...",37,2,1,0.054054,0.027027
1,Twitter and Elon has the same vibes when Trump...,"New York, USA",1595483029048369152,2022-11-23,18:22:51,"[twitter, elon, vibes, trump, got, elected, la...",[],"[twitter, elon, vibe, trump, got, elect, laugh...","[twitter, elon, vibe, trump, got, elected, lau...","[(twitter, NN), (elon, NN), (vibe, NN), (trump...",19,1,1,0.052632,0.052632
2,@RonFilipkowski Then why do I see him so frequ...,"West Virginia, USA",1595483099735154688,2022-11-23,18:23:08,"[see, frequently, twitter, stop, posting]",[],"[see, frequent, twitter, stop, post]","[see, frequently, twitter, stop, posting]","[(see, VB), (frequently, RB), (twitter, JJR), ...",5,0,0,0.0,0.0
3,@timothy_wall @elonmusk This is hysterical. Th...,"Port Orange, FL",1595483141879353344,2022-11-23,18:23:18,"[hysterical, thanks, laugh, prep, thanksgiving...",[],"[hyster, thank, laugh, prep, thanksgiv, need]","[hysterical, thanks, laugh, prep, thanksgiving...","[(hysterical, JJ), (thanks, NNS), (laugh, IN),...",6,0,1,0.0,0.166667
4,Elon Musk can show us the way but it’s we who ...,"Brooklyn, NY",1595483162289016832,2022-11-23,18:23:23,"[elon, musk, show, us, way, it’s, hard, work, ...",[],"[elon, musk, show, us, way, it, hard, work, ma...","[elon, musk, show, u, way, it’s, hard, work, m...","[(elon, NN), (musk, NN), (show, NN), (u, JJ), ...",10,1,1,0.1,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2003,@muskQu0tes @elonmusk There you are. I sent yo...,"Chicago, IL",1596571988877021185,2022-11-26,18:29:59,"[sent, thoughts, cyborg, video, project, send,...",[],"[sent, thought, cyborg, video, project, send, ...","[sent, thought, cyborg, video, project, send, ...","[(sent, NN), (thought, VBD), (cyborg, NNS), (v...",13,1,0,0.076923,0.0
2004,@StephenKing That's the way to make money in t...,"Wellington, FL",1596572018237140992,2022-11-26,18:30:06,"[thats, way, make, money, vc, world, unfortuna...",[],"[that, way, make, money, vc, world, unfortun, ...","[thats, way, make, money, vc, world, unfortuna...","[(thats, NNS), (way, NN), (make, VBP), (money,...",13,0,1,0.0,0.076923
2005,@ABC Knock knock 👊🏻\nCALIFORNIA LABOR DEPT\n\n...,"Canovanillas, Puerto Rico",1596572065372389377,2022-11-26,18:30:17,"[knock, knock, california, labor, dept]",[],"[knock, knock, california, labor, dept]","[knock, knock, california, labor, dept]","[(knock, NN), (knock, NN), (california, NN), (...",5,0,2,0.0,0.4
2006,@kylegriffin1 And gained integrity,"Smithtown, NY",1596572102022385664,2022-11-26,18:30:26,"[gained, integrity]",[],"[gain, integr]","[gained, integrity]","[(gained, VBN), (integrity, NN)]",2,1,0,0.5,0.0


In [32]:
# The 50 multi-word tweets with the highest positive ratio. Sorting
# descending directly keeps any NaN ratios at the end; reversing an
# ascending sort with [::-1] would move them to the top.
(
    df_fix.loc[df_fix['Count'] > 1]
    .sort_values('Positive_ratio', ascending=False)
    .head(50)
)

Unnamed: 0,Text,Location,Tweet_id,Date,Time,Words,Hashtags,Stemmed,Lemmed,POS_Tags,Count,Pos_count,Neg_count,Positive_ratio,Negative_ratio
978,Great work. https://t.co/YVOJjBQ7Bv,"Pennsylvania, USA",1596171365853208577,2022-11-25,15:58:03,"[great, work]",[],"[great, work]","[great, work]","[(great, JJ), (work, NN)]",2,2,0,1.0,0.0
1764,Doing great @elonmusk keep up the stellar work...,"West Virginia, USA",1596558626185183232,2022-11-26,17:36:53,"[great, keep, stellar, work]",[],"[great, keep, stellar, work]","[great, keep, stellar, work]","[(great, JJ), (keep, VB), (stellar, JJ), (work...",4,3,0,0.75,0.0
1356,@MichaelMcCros16 Interesting- I wonder what Tr...,"Manhattan, NY",1596535643697733632,2022-11-26,16:05:34,"[interesting, wonder, trump, thinking]",[],"[interest, wonder, trump, think]","[interesting, wonder, trump, thinking]","[(interesting, VBG), (wonder, NN), (trump, NN)...",4,3,0,0.75,0.0
608,@LauraFlowD @WholeMarsBlog I’m ready for some ...,"Baton Rouge, LA",1595842277305368579,2022-11-24,18:10:22,"[i’m, ready, goodness]",[],"[i'm, readi, good]","[i’m, ready, goodness]","[(i’m, JJ), (ready, JJ), (goodness, NN)]",3,2,0,0.666667,0.0
462,@RBReich Like this during trump's inauguration...,"Gibsonton, FL",1595825124225290247,2022-11-24,17:02:13,"[like, trumps, inauguration]",[],"[like, trump, inaugur]","[like, trump, inauguration]","[(like, IN), (trump, NN), (inauguration, NN)]",3,2,0,0.666667,0.0
1809,@StephenKing Wow. Thank you sir. Civility! Lik...,"Connecticut, USA",1596561530128142337,2022-11-26,17:48:26,"[wow, thank, sir, civility, like, hot, cup, ch...",[],"[wow, thank, sir, civil, like, hot, cup, chocol]","[wow, thank, sir, civility, like, hot, cup, ch...","[(wow, NN), (thank, NN), (sir, JJ), (civility,...",8,5,0,0.625,0.0
253,@AdamParkhomenko Well this is a nice example o...,"Florida, USA",1595507064985554956,2022-11-23,19:58:22,"[well, nice, example, race, baiting, congratul...",[],"[well, nice, exampl, race, bait, congratul, am...","[well, nice, example, race, baiting, congratul...","[(well, RB), (nice, JJ), (example, NN), (race,...",10,6,0,0.6,0.0
1178,Well said. https://t.co/7UiZDNjF3E,"Kingston, NY",1596188206802030592,2022-11-25,17:04:58,"[well, said]",[],"[well, said]","[well, said]","[(well, RB), (said, VBD)]",2,1,0,0.5,0.0
1934,How sweet it's ya'll! https://t.co/JIeSXN26mR,"Fort Benning, GA",1596568943895912450,2022-11-26,18:17:53,"[sweet, yall]",[],"[sweet, yall]","[sweet, yall]","[(sweet, NN), (yall, NN)]",2,1,0,0.5,0.0
536,@Iuuk96 @elonmusk Well stated!,"Tennessee, USA",1595832971659034627,2022-11-24,17:33:24,"[well, stated]",[],"[well, state]","[well, stated]","[(well, RB), (stated, VBN)]",2,1,0,0.5,0.0
