---
## CSE 482 - Tweet Loading and Processing
### Jack Nugent
Load Tweets related to Elon Musk and prepare the text for sentiment analysis.

---

In [6]:
import json
import os
import re

import pandas as pd
import tweepy
from tweepy import API
from tweepy import OAuthHandler

In [2]:
#!pip install emoji
import emoji

---

In [2]:
# Twitter API credentials.
# They are read from environment variables so real secrets never have to be
# typed into (and saved with) the notebook. Export the variables below before
# starting the kernel; any variable that is not set falls back to None, which
# is the placeholder value this cell used before.
C_KEY = os.environ.get('TWITTER_CONSUMER_KEY')
C_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET')
A_TOKEN_KEY = os.environ.get('TWITTER_ACCESS_TOKEN')
A_TOKEN_SECRET = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')
BEARER = os.environ.get('TWITTER_BEARER_TOKEN')

In [4]:
# Download recent Tweets matching `query` and store them as JSON lines
# (one Tweet object per line) in output.json.
client = tweepy.Client(bearer_token=BEARER)
count = 0
geo_count = 0


# Replace with your own search query
query = '(musk OR elon) -is:retweet lang:en'

# Replace with number of Tweets to pull
NUM_TWEETS = 10000

tweet_stream = tweepy.Paginator(client.search_recent_tweets, query=query,
                                tweet_fields=['entities', 'created_at', 'geo'],
                                max_results=100).flatten(limit=NUM_TWEETS)

# The context manager closes (and therefore flushes) the file even if the API
# call fails part-way, so the loading cell never reads a half-written buffer.
with open('output.json', 'w') as outFile:
    for tweet in tweet_stream:
        d = tweet.data
        # Always store a 'geo' key (None when the Tweet is not geotagged) so
        # every record can be read with tweet['geo'] when the file is loaded.
        d['geo'] = tweet.geo
        if tweet.geo is not None:
            geo_count += 1
        outFile.write(json.dumps(d))
        outFile.write('\n')
        count += 1

print("Total Tweets: {}\nGeotagged Tweets: {}".format(count, geo_count))

Total Tweets: 10000
Geotagged Tweets: 130


---
### Loading Data

In [3]:
# Read output.json back in: one JSON document per line.
data_json = []
with open('output.json') as f:
    for line in f:
        try:
            data_json.append(json.loads(line))
        except json.JSONDecodeError:
            # Discard lines that cannot be parsed as JSON instead of aborting
            # the whole load. Only parse errors are ignored; anything else
            # (e.g. KeyboardInterrupt) still propagates.
            continue

In [4]:
# Flatten the raw Tweet dicts into one row per Tweet.
# Rows are collected in a plain list and the DataFrame is built once at the
# end; appending with df.loc[len(df)] re-allocates the frame on every row.
rows = []
for tweet in data_json:
    # created_at is split on the 'T' separator into a date part and a time part.
    date, time = tweet['created_at'].split('T')
    # Handles Retweets that somehow slip through filter
    if 'RT @' not in tweet['text']:
        # time[:-5] drops the last five characters of the time part
        # (presumably the '.000Z' fraction/zone suffix - confirm against the API).
        rows.append([tweet['text'], tweet['geo'], tweet['id'], date, time[:-5]])

df = pd.DataFrame(rows, columns=['Text', 'Location', 'Tweet_id', 'Date', 'Time'])

print("Number of Tweets:",len(df))
df

Number of Tweets: 9992


Unnamed: 0,Text,Location,Tweet_id,Date,Time
0,elon musk needs to buy reddit next fr https://...,,1594040420891529216,2022-11-19,18:50:26
1,@lp_edoardo @e_heiker @alexxubyte @elonmusk Sa...,,1594040420652253186,2022-11-19,18:50:26
2,@AminaJMohammed @UN COP27 G20 has made a dec...,,1594040420446642179,2022-11-19,18:50:26
3,@elonmusk Why the f*ck am I getting notificati...,,1594040419133931521,2022-11-19,18:50:26
4,@CATASTROPHYCLUB Thanksgiving floki (Tfloki) i...,,1594040418584625152,2022-11-19,18:50:26
...,...,...,...,...,...
9987,"Did it ever occur to everybody cheering the ""d...",,1594027959916322821,2022-11-19,18:00:55
9988,@elonmusk @thevivafrei @paraga @jack Right on ...,,1594027959085838336,2022-11-19,18:00:55
9989,@co_bernard @elon @SBF_FTX WallStreetBets Whal...,,1594027958636724224,2022-11-19,18:00:55
9990,@TCDNB @WatcherGuru we love elon &lt;3,,1594027957773029376,2022-11-19,18:00:55


---
### Processing Text

#### Clean Text

In [46]:
%%time
df['Words'] = df['Text'].str.split()
df['Hashtags'] = None
for x in range(len(df)):
    # Handle '@' Mentions
    df['Words'][x] = [i for i in df['Words'][x] if not re.compile('@').match(i)]
    
    # Handle links
    df['Words'][x] = [i for i in df['Words'][x] if not re.compile('http').match(i)]
    
    # Hashtags
    df['Hashtags'][x] = [word for word in df['Words'][x] if '#' in word]
    df['Words'][x] = [word for word in df['Words'][x] if word not in df['Hashtags'][x]]
    
    # Handle Emojis and Punctuation
    words = []
    for word in df.iloc[x]['Words']:
        for c in word:
            if c in emoji.EMOJI_DATA or c in '''!()-[]{};:'"\,<>./?@#$%^&*_~''':
                word = word.replace(c, '')
        # Lowercase Words
        words.append(word.lower())
    df['Words'][x] = words
    
    # Remove Empty Values
    for word in df['Words'][x]:
        if word == '':
            df['Words'][x].remove(word)
            

CPU times: user 10.4 s, sys: 34.3 ms, total: 10.4 s
Wall time: 10.4 s


In [47]:
df

Unnamed: 0,Text,Location,Tweet_id,Date,Time,Words,Hashtags
0,elon musk needs to buy reddit next fr https://...,,1594040420891529216,2022-11-19,18:50:26,"[elon, musk, needs, to, buy, reddit, next, fr]",[]
1,@lp_edoardo @e_heiker @alexxubyte @elonmusk Sa...,,1594040420652253186,2022-11-19,18:50:26,"[sadly, this, people, think, elon, is, gonna, ...",[]
2,@AminaJMohammed @UN COP27 G20 has made a dec...,,1594040420446642179,2022-11-19,18:50:26,"[cop27, g20, has, made, a, decision, americanc...",[]
3,@elonmusk Why the f*ck am I getting notificati...,,1594040419133931521,2022-11-19,18:50:26,"[why, the, fck, am, i, getting, notifications,...",[]
4,@CATASTROPHYCLUB Thanksgiving floki (Tfloki) i...,,1594040418584625152,2022-11-19,18:50:26,"[thanksgiving, floki, tfloki, is, a, token, cr...","[#Tfloki, #Thanksgivingfloki]"
...,...,...,...,...,...,...,...
9987,"Did it ever occur to everybody cheering the ""d...",,1594027959916322821,2022-11-19,18:00:55,"[did, it, ever, occur, to, everybody, cheering...",[]
9988,@elonmusk @thevivafrei @paraga @jack Right on ...,,1594027959085838336,2022-11-19,18:00:55,"[right, on, elon]",[]
9989,@co_bernard @elon @SBF_FTX WallStreetBets Whal...,,1594027958636724224,2022-11-19,18:00:55,"[wallstreetbets, whales, has, planned, a, real...",[#Binance.]
9990,@TCDNB @WatcherGuru we love elon &lt;3,,1594027957773029376,2022-11-19,18:00:55,"[we, love, elon, lt3]",[]


---
### NLTK Cleaning

In [62]:
# NLTK setup: imports grouped together, followed by the data downloads.
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the NLTK data packages, in the same order as before.
for package in ("wordnet", "stopwords", 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(package)

[nltk_data] Downloading package wordnet to /home/nugentj3/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nugentj3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/nugentj3/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/nugentj3/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


---
#### Stopword Removal

In [56]:
# English stopword list from NLTK, held in a set for fast membership checks.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [48]:
# Stopword Removal
# Rebuild the column in one assignment; df['Words'][x] = ... is chained
# assignment and is not guaranteed to write back into df.
df['Words'] = df['Words'].apply(
    lambda words: [word for word in words if word not in stop_words]
)
df

Unnamed: 0,Text,Location,Tweet_id,Date,Time,Words,Hashtags
0,elon musk needs to buy reddit next fr https://...,,1594040420891529216,2022-11-19,18:50:26,"[elon, musk, needs, buy, reddit, next, fr]",[]
1,@lp_edoardo @e_heiker @alexxubyte @elonmusk Sa...,,1594040420652253186,2022-11-19,18:50:26,"[sadly, people, think, elon, gonna, cal, ask, ...",[]
2,@AminaJMohammed @UN COP27 G20 has made a dec...,,1594040420446642179,2022-11-19,18:50:26,"[cop27, g20, made, decision, americancanadian,...",[]
3,@elonmusk Why the f*ck am I getting notificati...,,1594040419133931521,2022-11-19,18:50:26,"[fck, getting, notifications, musks, account, ...",[]
4,@CATASTROPHYCLUB Thanksgiving floki (Tfloki) i...,,1594040418584625152,2022-11-19,18:50:26,"[thanksgiving, floki, tfloki, token, created, ...","[#Tfloki, #Thanksgivingfloki]"
...,...,...,...,...,...,...,...
9987,"Did it ever occur to everybody cheering the ""d...",,1594027959916322821,2022-11-19,18:00:55,"[ever, occur, everybody, cheering, demise, twi...",[]
9988,@elonmusk @thevivafrei @paraga @jack Right on ...,,1594027959085838336,2022-11-19,18:00:55,"[right, elon]",[]
9989,@co_bernard @elon @SBF_FTX WallStreetBets Whal...,,1594027958636724224,2022-11-19,18:00:55,"[wallstreetbets, whales, planned, really, big,...",[#Binance.]
9990,@TCDNB @WatcherGuru we love elon &lt;3,,1594027957773029376,2022-11-19,18:00:55,"[love, elon, lt3]",[]


---
#### Stemming

In [51]:
# Stem every word of every Tweet.
# The stemmer is created once and reused rather than being constructed for
# each word, and the column is assigned in one step instead of through
# chained indexing (df['Stemmed'][x] = ...).
stemmer = SnowballStemmer(language='english')
df['Stemmed'] = df['Words'].apply(
    lambda words: [stemmer.stem(word) for word in words]
)
#df

Unnamed: 0,Text,Location,Tweet_id,Date,Time,Words,Hashtags,Stemmed
0,elon musk needs to buy reddit next fr https://...,,1594040420891529216,2022-11-19,18:50:26,"[elon, musk, needs, buy, reddit, next, fr]",[],"[elon, musk, need, buy, reddit, next, fr]"
1,@lp_edoardo @e_heiker @alexxubyte @elonmusk Sa...,,1594040420652253186,2022-11-19,18:50:26,"[sadly, people, think, elon, gonna, cal, ask, ...",[],"[sad, peopl, think, elon, gonna, cal, ask, hel..."
2,@AminaJMohammed @UN COP27 G20 has made a dec...,,1594040420446642179,2022-11-19,18:50:26,"[cop27, g20, made, decision, americancanadian,...",[],"[cop27, g20, made, decis, americancanadian, pi..."
3,@elonmusk Why the f*ck am I getting notificati...,,1594040419133931521,2022-11-19,18:50:26,"[fck, getting, notifications, musks, account, ...",[],"[fck, get, notif, musk, account, dont, even, f..."
4,@CATASTROPHYCLUB Thanksgiving floki (Tfloki) i...,,1594040418584625152,2022-11-19,18:50:26,"[thanksgiving, floki, tfloki, token, created, ...","[#Tfloki, #Thanksgivingfloki]","[thanksgiv, floki, tfloki, token, creat, bsc, ..."
...,...,...,...,...,...,...,...,...
9987,"Did it ever occur to everybody cheering the ""d...",,1594027959916322821,2022-11-19,18:00:55,"[ever, occur, everybody, cheering, demise, twi...",[],"[ever, occur, everybodi, cheer, demis, twitter..."
9988,@elonmusk @thevivafrei @paraga @jack Right on ...,,1594027959085838336,2022-11-19,18:00:55,"[right, elon]",[],"[right, elon]"
9989,@co_bernard @elon @SBF_FTX WallStreetBets Whal...,,1594027958636724224,2022-11-19,18:00:55,"[wallstreetbets, whales, planned, really, big,...",[#Binance.],"[wallstreetbet, whale, plan, realli, big, pump..."
9990,@TCDNB @WatcherGuru we love elon &lt;3,,1594027957773029376,2022-11-19,18:00:55,"[love, elon, lt3]",[],"[love, elon, lt3]"


---
#### Lemmatization

In [64]:
# Lemmatize every word of every Tweet.
# The lemmatizer is created once and reused rather than being constructed for
# each word, and the column is assigned in one step instead of through
# chained indexing (df['Lemmed'][x] = ...).
lemmatizer = WordNetLemmatizer()
df['Lemmed'] = df['Words'].apply(
    lambda words: [lemmatizer.lemmatize(word) for word in words]
)
df

Unnamed: 0,Text,Location,Tweet_id,Date,Time,Words,Hashtags,Stemmed,Lemmed
0,elon musk needs to buy reddit next fr https://...,,1594040420891529216,2022-11-19,18:50:26,"[elon, musk, needs, buy, reddit, next, fr]",[],"[elon, musk, need, buy, reddit, next, fr]","[elon, musk, need, buy, reddit, next, fr]"
1,@lp_edoardo @e_heiker @alexxubyte @elonmusk Sa...,,1594040420652253186,2022-11-19,18:50:26,"[sadly, people, think, elon, gonna, cal, ask, ...",[],"[sad, peopl, think, elon, gonna, cal, ask, hel...","[sadly, people, think, elon, gonna, cal, ask, ..."
2,@AminaJMohammed @UN COP27 G20 has made a dec...,,1594040420446642179,2022-11-19,18:50:26,"[cop27, g20, made, decision, americancanadian,...",[],"[cop27, g20, made, decis, americancanadian, pi...","[cop27, g20, made, decision, americancanadian,..."
3,@elonmusk Why the f*ck am I getting notificati...,,1594040419133931521,2022-11-19,18:50:26,"[fck, getting, notifications, musks, account, ...",[],"[fck, get, notif, musk, account, dont, even, f...","[fck, getting, notification, musk, account, do..."
4,@CATASTROPHYCLUB Thanksgiving floki (Tfloki) i...,,1594040418584625152,2022-11-19,18:50:26,"[thanksgiving, floki, tfloki, token, created, ...","[#Tfloki, #Thanksgivingfloki]","[thanksgiv, floki, tfloki, token, creat, bsc, ...","[thanksgiving, floki, tfloki, token, created, ..."
...,...,...,...,...,...,...,...,...,...
9987,"Did it ever occur to everybody cheering the ""d...",,1594027959916322821,2022-11-19,18:00:55,"[ever, occur, everybody, cheering, demise, twi...",[],"[ever, occur, everybodi, cheer, demis, twitter...","[ever, occur, everybody, cheering, demise, twi..."
9988,@elonmusk @thevivafrei @paraga @jack Right on ...,,1594027959085838336,2022-11-19,18:00:55,"[right, elon]",[],"[right, elon]","[right, elon]"
9989,@co_bernard @elon @SBF_FTX WallStreetBets Whal...,,1594027958636724224,2022-11-19,18:00:55,"[wallstreetbets, whales, planned, really, big,...",[#Binance.],"[wallstreetbet, whale, plan, realli, big, pump...","[wallstreetbets, whale, planned, really, big, ..."
9990,@TCDNB @WatcherGuru we love elon &lt;3,,1594027957773029376,2022-11-19,18:00:55,"[love, elon, lt3]",[],"[love, elon, lt3]","[love, elon, lt3]"


---
#### Part of Speech Tagging

In [66]:
#POS Tagging
# Tag the lemmatized words of all Tweets in one batch call. Each Tweet's word
# list is treated as one "sentence", so every row gets a list of (word, tag)
# pairs, as nltk.pos_tag would return for that row. The column is assigned in
# one step instead of through chained indexing (df['POS_Tags'][x] = ...).
tagged = nltk.pos_tag_sents(df['Lemmed'].tolist())
df['POS_Tags'] = pd.Series(tagged, index=df.index)
df

Unnamed: 0,Text,Location,Tweet_id,Date,Time,Words,Hashtags,Stemmed,Lemmed,POS_Tags
0,elon musk needs to buy reddit next fr https://...,,1594040420891529216,2022-11-19,18:50:26,"[elon, musk, needs, buy, reddit, next, fr]",[],"[elon, musk, need, buy, reddit, next, fr]","[elon, musk, need, buy, reddit, next, fr]","[(elon, NN), (musk, NN), (need, VBP), (buy, VB..."
1,@lp_edoardo @e_heiker @alexxubyte @elonmusk Sa...,,1594040420652253186,2022-11-19,18:50:26,"[sadly, people, think, elon, gonna, cal, ask, ...",[],"[sad, peopl, think, elon, gonna, cal, ask, hel...","[sadly, people, think, elon, gonna, cal, ask, ...","[(sadly, RB), (people, NNS), (think, VBP), (el..."
2,@AminaJMohammed @UN COP27 G20 has made a dec...,,1594040420446642179,2022-11-19,18:50:26,"[cop27, g20, made, decision, americancanadian,...",[],"[cop27, g20, made, decis, americancanadian, pi...","[cop27, g20, made, decision, americancanadian,...","[(cop27, NN), (g20, NN), (made, VBD), (decisio..."
3,@elonmusk Why the f*ck am I getting notificati...,,1594040419133931521,2022-11-19,18:50:26,"[fck, getting, notifications, musks, account, ...",[],"[fck, get, notif, musk, account, dont, even, f...","[fck, getting, notification, musk, account, do...","[(fck, NN), (getting, VBG), (notification, NN)..."
4,@CATASTROPHYCLUB Thanksgiving floki (Tfloki) i...,,1594040418584625152,2022-11-19,18:50:26,"[thanksgiving, floki, tfloki, token, created, ...","[#Tfloki, #Thanksgivingfloki]","[thanksgiv, floki, tfloki, token, creat, bsc, ...","[thanksgiving, floki, tfloki, token, created, ...","[(thanksgiving, VBG), (floki, JJ), (tfloki, NN..."
...,...,...,...,...,...,...,...,...,...,...
9987,"Did it ever occur to everybody cheering the ""d...",,1594027959916322821,2022-11-19,18:00:55,"[ever, occur, everybody, cheering, demise, twi...",[],"[ever, occur, everybodi, cheer, demis, twitter...","[ever, occur, everybody, cheering, demise, twi...","[(ever, RB), (occur, MD), (everybody, VB), (ch..."
9988,@elonmusk @thevivafrei @paraga @jack Right on ...,,1594027959085838336,2022-11-19,18:00:55,"[right, elon]",[],"[right, elon]","[right, elon]","[(right, RB), (elon, NN)]"
9989,@co_bernard @elon @SBF_FTX WallStreetBets Whal...,,1594027958636724224,2022-11-19,18:00:55,"[wallstreetbets, whales, planned, really, big,...",[#Binance.],"[wallstreetbet, whale, plan, realli, big, pump...","[wallstreetbets, whale, planned, really, big, ...","[(wallstreetbets, NNS), (whale, VBP), (planned..."
9990,@TCDNB @WatcherGuru we love elon &lt;3,,1594027957773029376,2022-11-19,18:00:55,"[love, elon, lt3]",[],"[love, elon, lt3]","[love, elon, lt3]","[(love, NN), (elon, NN), (lt3, NN)]"
