# Tweet Cleaning

In [87]:
import pandas as pd
import re

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [88]:
# Load the formatted tweets. No index column is specified, so the file's
# unnamed first column comes through as a regular 'Unnamed: 0' column
# (presumably an index saved by an earlier step - confirm before dropping it).
df = pd.read_csv(filepath_or_buffer='../data/tweet_format.csv')
df.head(n=5)

Unnamed: 0,tweet,response_sentiment
0,How much wood could a woodchuck chuck if a woo...,0.76
1,death is all around us,-0.9
2,things are stuff,0.0
3,I tweeted thisss while I .. was drunk,0.1
4,I jump he jumped we are jumping,0.5


In [89]:
# Work on an independent copy so the cleaning steps below never modify `df`.
df_clean = df.copy(deep=True)
df_clean.head(n=5)

Unnamed: 0,tweet,response_sentiment
0,How much wood could a woodchuck chuck if a woo...,0.76
1,death is all around us,-0.9
2,things are stuff,0.0
3,I tweeted thisss while I .. was drunk,0.1
4,I jump he jumped we are jumping,0.5


In [90]:
# Step 1: squeeze every run of whitespace (spaces, tabs, newlines) in each
# tweet down to a single space.
# NOTE: re.sub expects a string, so a missing (NaN) tweet raises here.
single_spaced = df_clean['tweet'].apply(lambda text: re.sub(r'\s+', ' ', text))
df_clean['tweet'] = single_spaced

# Step 2: lower-case each tweet so later token comparisons ignore case.
df_clean['tweet'] = df_clean['tweet'].apply(lambda text: text.lower())

In [91]:
# Split each tweet string into a list of tokens with NLTK's TweetTokenizer
# (default settings). After this cell the 'tweet' column holds lists, not strings.
tw = TweetTokenizer()
df_clean['tweet'] = df_clean['tweet'].apply(tw.tokenize)

In [92]:
# Fetch the NLTK data packages used further down: 'stopwords' backs the
# stop-word list and 'wordnet' backs the WordNetLemmatizer.
# 'omw-1.4' is presumably required by the wordnet corpus reader - confirm
# against the installed NLTK version.
NLTK_RESOURCES = ('stopwords', 'wordnet', 'omw-1.4')
download_results = [nltk.download(resource) for resource in NLTK_RESOURCES]

# Display the status of the final download, as the cell did before.
download_results[-1]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jimothygreene/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/jimothygreene/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/jimothygreene/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [93]:
# Remove stopwords.
# `stop_words` stays the plain list returned by NLTK so any other code that
# uses it keeps working. The per-token membership test runs against a
# frozenset instead: `word not in <list>` scans the whole list for every
# token, while a set lookup is a constant-time hash probe. Results are identical.
# Matching is exact (case-sensitive); tweets were lower-cased in an earlier cell.
stop_words = stopwords.words('english')
stop_word_lookup = frozenset(stop_words)
df_clean['tweet'] = df_clean['tweet'].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_lookup]
)

In [94]:
# Replace every token with its WordNet lemma, keeping token order.
# NOTE(review): lemmatize() is called without a part-of-speech argument, so
# NLTK treats each token as a noun. Inflected verbs such as "jumped" and
# "jumping" are left unchanged and "us" becomes "u" (both visible in the
# preview below). Supplying POS tags would be needed to fix this.
lemmatizer = WordNetLemmatizer()
df_clean['tweet'] = df_clean['tweet'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

In [95]:
# Keep only tokens made up entirely of alphabetic characters.
# This drops pure punctuation such as "..", but also any token that contains
# even one non-letter character (digits, '#', '@', apostrophes, ...).
df_clean['tweet'] = df_clean['tweet'].apply(
    lambda tokens: [token for token in tokens if token.isalpha()]
)

In [96]:
# Preview the first rows to check the result of the cleaning steps.
df_clean.head(n=5)

Unnamed: 0,tweet,response_sentiment
0,"[much, wood, could, woodchuck, chuck, woodchuc...",0.76
1,"[death, around, u]",-0.9
2,"[thing, stuff]",0.0
3,"[tweeted, thisss, drunk]",0.1
4,"[jump, jumped, jumping]",0.5


In [97]:
# Write the cleaned frame next to the input file, without the DataFrame index.
# The 'Unnamed: 0' column is a regular column here, so it is still written.
# NOTE(review): the 'tweet' column holds Python lists; CSV stores them as
# their string form, so readers will likely need to parse them back - confirm
# with whatever consumes this file.
df_clean.to_csv(path_or_buf='../data/tweet_clean.csv', index=False)