# Tweet Cleaning

In [2]:
import pandas as pd
import re

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
# Load the labelled tweets. The path is relative, so it resolves against the
# directory the notebook kernel is running in.
df = pd.read_csv('../data/labeled_tweets.csv')
# Preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index written out by an earlier to_csv call -- confirm whether it should be
# dropped (or read with index_col=0) before it propagates to the cleaned file.
df.head()

Unnamed: 0,id,tweet,sentiment,reply_count
0,1223752255846912000,The Fox Corporation (The owners of Fox News) a...,-0.52,150
1,1223738389003952128,"Folks, you hear about this cornovirus deal in ...",-0.2,45
2,1223748267609030659,The news is finally out !!!! I will be on @lov...,0.5,6
3,1223739174160928773,Good news! The person under investigation for ...,-0.25,12
4,1223737953291128837,"Two avid golfers promised that, whoever died f...",0.166667,36


In [4]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched and
# every cleaning step below can be compared against the original text.
df_clean = df.copy(deep=True)
df_clean.head(n=5)

Unnamed: 0,id,tweet,sentiment,reply_count
0,1223752255846912000,The Fox Corporation (The owners of Fox News) a...,-0.52,150
1,1223738389003952128,"Folks, you hear about this cornovirus deal in ...",-0.2,45
2,1223748267609030659,The news is finally out !!!! I will be on @lov...,0.5,6
3,1223739174160928773,Good news! The person under investigation for ...,-0.25,12
4,1223737953291128837,"Two avid golfers promised that, whoever died f...",0.166667,36


In [5]:
# Normalise each tweet in a single pass:
#   1. collapse every run of whitespace (spaces, tabs, newlines) into one space
#   2. lower-case the result
# The order matches the two-step version: whitespace first, then case.
df_clean['tweet'] = df_clean['tweet'].apply(
    lambda text: re.sub(r'\s+', ' ', text).lower()
)

In [6]:
# Split every tweet into a list of tokens with NLTK's TweetTokenizer.
# After this cell df_clean['tweet'] holds lists of strings, not strings.
tw = TweetTokenizer()
df_clean['tweet'] = df_clean['tweet'].apply(tw.tokenize)

In [7]:
# Fetch the NLTK resources that the following cells rely on:
#   stopwords -> used by stopwords.words('english') in the stopword-removal cell
#   wordnet   -> used by WordNetLemmatizer in the lemmatization cell
#   omw-1.4   -> presumably required alongside wordnet by recent NLTK
#                releases -- TODO confirm against the installed NLTK version
# The last call is the cell's final expression, so its return value is what the
# notebook displays.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ishanshah/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ishanshah/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ishanshah/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [8]:
# Remove stopwords
# The membership test below runs once per token of every tweet. Holding the
# stopwords in a set makes each lookup constant-time instead of a linear scan
# over the whole list; the filtered result is identical.
# Tweets were lower-cased earlier, which matches the lower-case stopword list.
stop_words = set(stopwords.words('english'))
df_clean['tweet'] = df_clean['tweet'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [9]:
# Reduce every token to its WordNet lemma.
# lemmatize() is called without a part-of-speech argument, so NLTK's default
# part of speech is applied to every token.
lemmatizer = WordNetLemmatizer()
df_clean['tweet'] = df_clean['tweet'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

In [10]:
# Keep only tokens made up entirely of alphabetic characters.
# str.isalpha() is False for any token containing a digit, punctuation or a
# symbol, so this drops not just bare punctuation/numbers but also tokens that
# mix letters with other characters (anything with '@', '#', '-', "'", ...).
df_clean['tweet'] = df_clean['tweet'].apply(
    lambda tokens: list(filter(str.isalpha, tokens))
)

In [11]:
# Collapse each token list back into one space-separated string.
# A tweet whose tokens were all filtered out becomes the empty string.
df_clean['tweet'] = df_clean['tweet'].apply(' '.join)

In [12]:
# Preview the cleaned tweets to sanity-check the result of the steps above.
df_clean.head()

Unnamed: 0,id,tweet,sentiment,reply_count
0,1223752255846912000,fox corporation owner fox news trying bully ro...,-0.52,150
1,1223738389003952128,folk hear cornovirus deal news heck doctor rec...,-0.2,45
2,1223748267609030659,news finally excited journey potential finding...,0.5,6
3,1223739174160928773,good news person investigation novel coronavir...,-0.25,12
4,1223737953291128837,two avid golfer promised whoever died first wo...,0.166667,36


In [13]:
# Persist the cleaned frame. index=False keeps the DataFrame index out of the
# file, so no extra index column is added on top of the existing columns.
# NOTE(review): any tweet that was reduced to an empty string is written as an
# empty field, which pandas.read_csv loads back as NaN by default -- downstream
# readers may need keep_default_na=False or a fillna('').
df_clean.to_csv('../data/cleaned_tweets.csv', index=False)