# Data Preprocessing #

### Loading Data and Libraries ###

In [1]:
import pandas as pd 
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize

In [2]:
# read csv
tweets_df = pd.read_csv('all_tweets.csv')

  tweets_df = pd.read_csv('all_tweets.csv')


In [3]:
# drop the rows having NaN values
tweets_df = tweets_df.dropna()
 
# reset the indices
tweets_df = tweets_df.reset_index(drop = True)

### Cleaning Text ###

In [4]:
# remove mentions, hashtags, RT, and urls
def clean_text (text):
    text = re.sub(r'@[A-Za-z0-9_]+','', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'RT[\s]+', '', text)
    text = re.sub(r'https?:\/\/\S+', '', text)
    text = re.sub(r'\d+', '',text)
    return text

In [5]:
# make new column for cleaned tweets
tweets_df['cleaned_text'] = tweets_df['Text'].apply(clean_text)
tweets_df.head()
    
    
# lowercase
tweets_df['cleaned_text'] = tweets_df['cleaned_text'].apply(lambda x: ' ' .join(x.lower() for x in x.split()))

tweets_df.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,cleaned_text
0,0,2015-12-30 19:16:32+00:00,6.822791e+17,@Daily_Star they better keep a close eye on th...,carisbrook5,"they better keep a close eye on this kid, so t..."
1,1,2015-12-30 17:11:12+00:00,6.822476e+17,Calling customer service lines gives me #diarr...,ktandthekittens,calling customer service lines gives me diarrh...
2,2,2015-12-30 16:13:04+00:00,6.822329e+17,5 Ways to Relieve Stress in 2016 https://t.co/...,ThriveStreams,ways to relieve stress in via our blog mentalh...
3,3,2015-12-30 15:25:33+00:00,6.82221e+17,Sweet story. #anxiety #depression #coping #men...,ynniv129,sweet story. anxiety depression coping mentalh...
4,4,2015-12-30 04:59:08+00:00,6.820633e+17,5 Different Types of Motivation https://t.co/x...,workttech,different types of motivation motivation menta...


In [6]:
# remove punctuation
tweets_df['cleaned_text'] = tweets_df['cleaned_text'].str.replace(r'[^\w\s]+', '')

tweets_df.head()

  tweets_df['cleaned_text'] = tweets_df['cleaned_text'].str.replace(r'[^\w\s]+', '')


Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,cleaned_text
0,0,2015-12-30 19:16:32+00:00,6.822791e+17,@Daily_Star they better keep a close eye on th...,carisbrook5,they better keep a close eye on this kid so th...
1,1,2015-12-30 17:11:12+00:00,6.822476e+17,Calling customer service lines gives me #diarr...,ktandthekittens,calling customer service lines gives me diarrh...
2,2,2015-12-30 16:13:04+00:00,6.822329e+17,5 Ways to Relieve Stress in 2016 https://t.co/...,ThriveStreams,ways to relieve stress in via our blog mentalh...
3,3,2015-12-30 15:25:33+00:00,6.82221e+17,Sweet story. #anxiety #depression #coping #men...,ynniv129,sweet story anxiety depression coping mentalhe...
4,4,2015-12-30 04:59:08+00:00,6.820633e+17,5 Different Types of Motivation https://t.co/x...,workttech,different types of motivation motivation menta...


In [8]:
# extend stop list
stp_wrds = ['youd','lyn','ive','id','shed','hed','itd','hows','amp','ra','thats','can','could',\
            'should','would','rt','cant','dont','shouldve','im','u','b','doesnt','havent',\
            'hadnt','isnt','shouldnt','wasnt','werent','wont','wouldnt','youll','youre','ur','ull']
stop_words = stopwords.words('english')
stop_words.extend(stp_wrds)

tweets_df['cleaned_text'] = tweets_df.apply(lambda row: nltk.word_tokenize(row['cleaned_text']), axis=1)

# remove stop words
tweets_df['cleaned_text'] = tweets_df['cleaned_text'].apply(lambda words: [word for word in words if word not in stop_words])
    
tweets_df['cleaned_text'] = tweets_df['cleaned_text'].str.join(" ")

tweets_df.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,cleaned_text
0,0,2015-12-30 19:16:32+00:00,6.822791e+17,@Daily_Star they better keep a close eye on th...,carisbrook5,better keep close eye kid commit suicide depre...
1,1,2015-12-30 17:11:12+00:00,6.822476e+17,Calling customer service lines gives me #diarr...,ktandthekittens,calling customer service lines gives diarrhea ...
2,2,2015-12-30 16:13:04+00:00,6.822329e+17,5 Ways to Relieve Stress in 2016 https://t.co/...,ThriveStreams,ways relieve stress via blog mentalhealth newy...
3,3,2015-12-30 15:25:33+00:00,6.82221e+17,Sweet story. #anxiety #depression #coping #men...,ynniv129,sweet story anxiety depression coping mentalhe...
4,4,2015-12-30 04:59:08+00:00,6.820633e+17,5 Different Types of Motivation https://t.co/x...,workttech,different types motivation motivation mentalhe...


In [9]:
# lemmatize
tokenizer = nltk.WhitespaceTokenizer()
lemmatizer = nltk.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in tokenizer.tokenize(text)]

tweets_df['cleaned_text'] = tweets_df['cleaned_text'].apply(lemmatize_text)

In [10]:
# rejoin tokens
tweets_df['cleaned_text'] = tweets_df['cleaned_text'].str.join(" ")

tweets_df.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,cleaned_text
0,0,2015-12-30 19:16:32+00:00,6.822791e+17,@Daily_Star they better keep a close eye on th...,carisbrook5,better keep close eye kid commit suicide depre...
1,1,2015-12-30 17:11:12+00:00,6.822476e+17,Calling customer service lines gives me #diarr...,ktandthekittens,calling customer service line give diarrhea tm...
2,2,2015-12-30 16:13:04+00:00,6.822329e+17,5 Ways to Relieve Stress in 2016 https://t.co/...,ThriveStreams,way relieve stress via blog mentalhealth newye...
3,3,2015-12-30 15:25:33+00:00,6.82221e+17,Sweet story. #anxiety #depression #coping #men...,ynniv129,sweet story anxiety depression coping mentalhe...
4,4,2015-12-30 04:59:08+00:00,6.820633e+17,5 Different Types of Motivation https://t.co/x...,workttech,different type motivation motivation mentalhealth


In [11]:
# remove words with less than 2 characters
tweets_df['cleaned_text']= tweets_df['cleaned_text'].apply(lambda x: [word for word in x.split() if len(word) > 2])
tweets_df['cleaned_text'] = tweets_df['cleaned_text'].str.join(" ")
tweets_df.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,cleaned_text
0,0,2015-12-30 19:16:32+00:00,6.822791e+17,@Daily_Star they better keep a close eye on th...,carisbrook5,better keep close eye kid commit suicide depre...
1,1,2015-12-30 17:11:12+00:00,6.822476e+17,Calling customer service lines gives me #diarr...,ktandthekittens,calling customer service line give diarrhea tm...
2,2,2015-12-30 16:13:04+00:00,6.822329e+17,5 Ways to Relieve Stress in 2016 https://t.co/...,ThriveStreams,way relieve stress via blog mentalhealth newye...
3,3,2015-12-30 15:25:33+00:00,6.82221e+17,Sweet story. #anxiety #depression #coping #men...,ynniv129,sweet story anxiety depression coping mentalhe...
4,4,2015-12-30 04:59:08+00:00,6.820633e+17,5 Different Types of Motivation https://t.co/x...,workttech,different type motivation motivation mentalhealth


In [12]:
# remove leftover issues
tweets_df['cleaned_text'] = tweets_df['cleaned_text'].str.replace(r'_', '')

tweets_df.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,cleaned_text
0,0,2015-12-30 19:16:32+00:00,6.822791e+17,@Daily_Star they better keep a close eye on th...,carisbrook5,better keep close eye kid commit suicide depre...
1,1,2015-12-30 17:11:12+00:00,6.822476e+17,Calling customer service lines gives me #diarr...,ktandthekittens,calling customer service line give diarrhea tm...
2,2,2015-12-30 16:13:04+00:00,6.822329e+17,5 Ways to Relieve Stress in 2016 https://t.co/...,ThriveStreams,way relieve stress via blog mentalhealth newye...
3,3,2015-12-30 15:25:33+00:00,6.82221e+17,Sweet story. #anxiety #depression #coping #men...,ynniv129,sweet story anxiety depression coping mentalhe...
4,4,2015-12-30 04:59:08+00:00,6.820633e+17,5 Different Types of Motivation https://t.co/x...,workttech,different type motivation motivation mentalhealth


### Adjusting Datetime Column ###

In [13]:
# convert to datetime
tweets_df['Date'] = pd.to_datetime(tweets_df['Datetime'])
tweets_df.info()

# drop old date column
tweets_df = tweets_df.drop('Datetime', axis = 1)

tweets_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386930 entries, 0 to 386929
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype              
---  ------        --------------   -----              
 0   Unnamed: 0    386930 non-null  object             
 1   Datetime      386930 non-null  object             
 2   Tweet Id      386930 non-null  float64            
 3   Text          386930 non-null  object             
 4   Username      386930 non-null  object             
 5   cleaned_text  386930 non-null  object             
 6   Date          386930 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), float64(1), object(5)
memory usage: 20.7+ MB


Unnamed: 0.1,Unnamed: 0,Tweet Id,Text,Username,cleaned_text,Date
0,0,6.822791e+17,@Daily_Star they better keep a close eye on th...,carisbrook5,better keep close eye kid commit suicide depre...,2015-12-30 19:16:32+00:00
1,1,6.822476e+17,Calling customer service lines gives me #diarr...,ktandthekittens,calling customer service line give diarrhea tm...,2015-12-30 17:11:12+00:00
2,2,6.822329e+17,5 Ways to Relieve Stress in 2016 https://t.co/...,ThriveStreams,way relieve stress via blog mentalhealth newye...,2015-12-30 16:13:04+00:00
3,3,6.82221e+17,Sweet story. #anxiety #depression #coping #men...,ynniv129,sweet story anxiety depression coping mentalhe...,2015-12-30 15:25:33+00:00
4,4,6.820633e+17,5 Different Types of Motivation https://t.co/x...,workttech,different type motivation motivation mentalhealth,2015-12-30 04:59:08+00:00


In [14]:
tweets_df['year'] = pd.DatetimeIndex(tweets_df['Date']).year
tweets_df['month'] = pd.DatetimeIndex(tweets_df['Date']).month
tweets_df.head()

Unnamed: 0.1,Unnamed: 0,Tweet Id,Text,Username,cleaned_text,Date,year,month
0,0,6.822791e+17,@Daily_Star they better keep a close eye on th...,carisbrook5,better keep close eye kid commit suicide depre...,2015-12-30 19:16:32+00:00,2015,12
1,1,6.822476e+17,Calling customer service lines gives me #diarr...,ktandthekittens,calling customer service line give diarrhea tm...,2015-12-30 17:11:12+00:00,2015,12
2,2,6.822329e+17,5 Ways to Relieve Stress in 2016 https://t.co/...,ThriveStreams,way relieve stress via blog mentalhealth newye...,2015-12-30 16:13:04+00:00,2015,12
3,3,6.82221e+17,Sweet story. #anxiety #depression #coping #men...,ynniv129,sweet story anxiety depression coping mentalhe...,2015-12-30 15:25:33+00:00,2015,12
4,4,6.820633e+17,5 Different Types of Motivation https://t.co/x...,workttech,different type motivation motivation mentalhealth,2015-12-30 04:59:08+00:00,2015,12


In [15]:
# save to csv
tweets_df.to_csv('clean_tweets.csv')