## Clean and Process data

In [140]:
from datetime import datetime as dt
from datetime import timedelta as td
import requests
import pandas as pd
import base64
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
from nltk import wordnet
import string

In [141]:
#Load the raw tweet pull from disk (the lineterminator='\n' option stays disabled)
raw_csv_path = '7DayTwitterPull.csv'
data_raw = pd.read_csv(raw_csv_path)

In [142]:
#View column quality: dtype and non-null count for every column of the raw pull
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11184 entries, 0 to 11183
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   created_at  11184 non-null  object
 1   id          11184 non-null  int64 
 2   lang        11184 non-null  object
 3   text        11184 non-null  object
dtypes: int64(1), object(3)
memory usage: 349.6+ KB
None


In [143]:
#Drop tweets whose text repeats an earlier row, keeping the first occurrence
data_raw = data_raw.drop_duplicates(subset='text', keep='first')
#Rename columns to friendlier labels
column_labels = {'created_at': 'Date', 'id': 'ID', 'text': 'Tweet'}
data = data_raw.rename(columns=column_labels)
#Keep an untouched copy of the text, since 'Tweet' is rewritten by the cleaning steps below
data['Original Tweet'] = data['Tweet']
#Preview dataset
data.head()

Unnamed: 0,Date,ID,lang,Tweet,Original Tweet
0,2021-01-03T19:20:59.000Z,1345812421676576768,en,@MKBHD Step 2: buy $ETH sub $1k\n\nStep 3: out...,@MKBHD Step 2: buy $ETH sub $1k\n\nStep 3: out...
1,2021-01-03T19:20:59.000Z,1345812421600940032,en,RT @Naturalmed777: I think $XRP $XLM are bigge...,RT @Naturalmed777: I think $XRP $XLM are bigge...
2,2021-01-03T19:20:59.000Z,1345812421584285698,en,RT @Sheresed69: New toy 😍😍 subscribe to my onl...,RT @Sheresed69: New toy 😍😍 subscribe to my onl...
3,2021-01-03T19:20:59.000Z,1345812420955144197,en,RT @TokenGoodGuy0: .\n#Digibyte $dgb likely to...,RT @TokenGoodGuy0: .\n#Digibyte $dgb likely to...
4,2021-01-03T19:20:59.000Z,1345812420783206401,en,RT @nixtetic: I use my comp. for remote and co...,RT @nixtetic: I use my comp. for remote and co...


In [145]:
#Remove punctuation
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

In [146]:
#Strip punctuation from each tweet
data['Tweet'] = data['Tweet'].apply(remove_punctuation)
data['Tweet'].head(20)

0     MKBHD Step 2 buy ETH sub 1k\n\nStep 3 outperfo...
1     RT Naturalmed777 I think XRP XLM are bigger th...
2     RT Sheresed69 New toy 😍😍 subscribe to my onlyf...
3     RT TokenGoodGuy0 \nDigibyte dgb likely to move...
4     RT nixtetic I use my comp for remote and commi...
5     RT valentinebxxx NEW YEAR SW PROMO THREAD🎉\n\n...
6     RT designernayima2 😍Amazing Company logo for y...
10    RT ReySantoscrypto Link to use dapp browser on...
11    Ajaz86 PeterMcCormack Exactly I don’t want bit...
12    amitkejriwal01 cryptokanoon Thats the point yo...
13    RT MKBHD Step 1 Resist the urge to buy Bitcoin...
14    Suhail Wrong eth has unlimited supply cant hol...
15    My family’s Christmas was somewhat ruined Does...
16    RT NFL 84 yards to the house\n\nIsaiah McKenzi...
17    RT DrAmyKellam toadmeister I checked your link...
18    Wanna know how I got to top 16 on onlyf4ns in ...
20    Our latest work on Lymphadenopathy in Fungatin...
21    RT Mareq16 New Parler from Dr Stella Emman

In [147]:
#Tokenizer built on the pattern \w+ : each run of word characters becomes one token,
#so any other character acts as a separator (e.g. "don’t" is split into "don" and "t")
tokenizer = RegexpTokenizer(r'\w+')

In [148]:
#Lowercase each tweet, then split it into a list of word tokens
lowercased_tweets = data['Tweet'].apply(str.lower)
data['Tweet'] = lowercased_tweets.apply(tokenizer.tokenize)
data['Tweet'].head(20)

0     [mkbhd, step, 2, buy, eth, sub, 1k, step, 3, o...
1     [rt, naturalmed777, i, think, xrp, xlm, are, b...
2     [rt, sheresed69, new, toy, subscribe, to, my, ...
3     [rt, tokengoodguy0, digibyte, dgb, likely, to,...
4     [rt, nixtetic, i, use, my, comp, for, remote, ...
5     [rt, valentinebxxx, new, year, sw, promo, thre...
6     [rt, designernayima2, amazing, company, logo, ...
10    [rt, reysantoscrypto, link, to, use, dapp, bro...
11    [ajaz86, petermccormack, exactly, i, don, t, w...
12    [amitkejriwal01, cryptokanoon, thats, the, poi...
13    [rt, mkbhd, step, 1, resist, the, urge, to, bu...
14    [suhail, wrong, eth, has, unlimited, supply, c...
15    [my, family, s, christmas, was, somewhat, ruin...
16    [rt, nfl, 84, yards, to, the, house, isaiah, m...
17    [rt, dramykellam, toadmeister, i, checked, you...
18    [wanna, know, how, i, got, to, top, 16, on, on...
20    [our, latest, work, on, lymphadenopathy, in, f...
21    [rt, mareq16, new, parler, from, dr, stell

In [66]:
#Remove stop words such as "i", "me", "my", "you" since they have low predictive power
#(stop-word removal is best avoided on very small samples)
_STOPWORD_CACHE = {}

def remove_stopwords(text, stop_words=None):
    """Return the tokens of `text` that are not stop words, in their original order.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single tweet.
    stop_words : iterable of str, optional
        Words to drop. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    list of str
        The tokens of `text` that are not in the stop-word collection.

    Notes
    -----
    The NLTK list is read once and cached as a frozenset. The previous version
    called `stopwords.words('english')` for every single token, which re-read
    the corpus and did a linear list scan each time. Tokens must be hashable
    (strings are) because membership is now checked against a set.
    """
    if stop_words is None:
        # Load the NLTK list only on first use, then reuse it for every later call.
        if 'english' not in _STOPWORD_CACHE:
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    elif not isinstance(stop_words, (set, frozenset)):
        # Normalise caller-supplied lists/tuples so the lookup below is O(1).
        stop_words = frozenset(stop_words)
    return [w for w in text if w not in stop_words]

In [149]:
#Drop stop words from every tokenized tweet
data['Tweet'] = data['Tweet'].apply(remove_stopwords)
data['Tweet'].head(5)

0    [mkbhd, step, 2, buy, eth, sub, 1k, step, 3, o...
1    [rt, naturalmed777, think, xrp, xlm, bigger, p...
2    [rt, sheresed69, new, toy, subscribe, onlyfans...
3    [rt, tokengoodguy0, digibyte, dgb, likely, mov...
4    [rt, nixtetic, use, comp, remote, commission, ...
Name: Tweet, dtype: object

##### *Stemming & Lemmatizing* - both "stem" a word down to its base
* Stemming - Faster, truncates the word and is less accurate
* Lemmatizing - Slower, finds the original word instead of truncating - more accurate

In [150]:
#Lemmatizer instance used by word_lemmatizer
lemmatizer = WordNetLemmatizer()
#Stemmer instance used by word_stemmer (both helpers " ".join the processed tokens back into one string)
stemmer = PorterStemmer()

def word_lemmatizer(text):
    """Lemmatize every token in `text` and return them joined by single spaces."""
    lemmas = map(lemmatizer.lemmatize, text)
    return " ".join(lemmas)
def word_stemmer(text):
    """Stem every token in `text` and return them joined by single spaces."""
    stems = map(stemmer.stem, text)
    return " ".join(stems)

In [55]:
#Using word_stemmer (disabled alternative to the lemmatizer; column is 'Tweet' after the rename above)
#data['Tweet'] = data['Tweet'].apply(lambda x: word_stemmer(x))
#data['Tweet'].head(5)

In [151]:
#Using word_lemmatizer: lemmatize each token list and join it back into a single string
data['Tweet'] = data['Tweet'].apply(word_lemmatizer)
data['Tweet'].head(5)

0    mkbhd step 2 buy eth sub 1k step 3 outperform ...
1    rt naturalmed777 think xrp xlm bigger people r...
2    rt sheresed69 new toy subscribe onlyfans see s...
3    rt tokengoodguy0 digibyte dgb likely move fast...
4    rt nixtetic use comp remote commission work ha...
Name: Tweet, dtype: object

In [152]:
#Save cleaned dataframe to csv for analysis (the row index is not written)
clean_csv_path = 'Dec28toJan03_Clean.csv'
data.to_csv(clean_csv_path, index=False)