In [12]:
# Numpy
import numpy as np

# Pandas
import pandas as pd

# Regular expression
import re

# Stemmers
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer

# Lemmatizers
from nltk.stem import WordNetLemmatizer

In [2]:
df = pd.read_csv('../data/processed/train_set.csv')

In [3]:
df.head()

Unnamed: 0,id,label,tweet_normalized
0,1,0,USER_MENTION father dysfunctional selfish drag...
1,2,0,USER_MENTION USER_MENTION thanks lyft credit c...
2,3,0,bihday majesty
3,4,0,model love u take u time ur
4,5,0,factsguide society motivation


# Stemming

To understand stemming, you need to gain some perspective on what word stems represent. Word stems are also known as the base form of a word, and we can create new words by attaching affixes to them in a process known as inflection. Consider the word JUMP. You can add affixes to it and form new words like JUMPS, JUMPED, and JUMPING. In this case, the base word JUMP is the word stem.

The reverse process of obtaining the base form of a word from its inflected form is known as stemming. Stemming helps us in standardizing words to their base or root stem, irrespective of their inflections, which helps many applications like classifying or clustering text, and even in information retrieval. 

In [4]:
def ps_stemmer(tweet):
    ps = PorterStemmer()
    tweet = ' '.join([ps.stem(word) for word in tweet.split()])
    return tweet

ps_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'My system keep crash hi crash yesterday, our crash daili'

In [5]:
def ls_stemmer(tweet):
    ls = LancasterStemmer()
    tweet = ' '.join([ls.stem(word) for word in tweet.split()])
    return tweet

ls_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'my system keep crash his crash yesterday, our crash dai'

In [10]:
def ss_stemmer(tweet):
    ss = SnowballStemmer('english')
    tweet = ' '.join([ss.stem(word) for word in tweet.split()])
    return tweet

ss_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'my system keep crash his crash yesterday, our crash daili'

In [11]:
import time
start_time = time.time()
for i in range(len(df)): 
    df.at[i, 'tweet_stemmed_porter_stemmer'] = ps_stemmer(str(df['tweet_normalized'][i]))
    df.at[i, 'tweet_stemmed_lancaster_stemmer'] = ls_stemmer(str(df['tweet_normalized'][i]))
    df.at[i, 'tweet_stemmed_snowball_stemmer'] = ss_stemmer(str(df['tweet_normalized'][i]))
print("--- %s seconds ---" % (time.time() - start_time))

--- 31.627742290496826 seconds ---


# Lemmatization

Lemmatization is very similar to stemming, where we remove word affixes to get to the base form of a word. However, the base form in this case is known as the root word, but not the root stem. The difference being that the root word is always a lexicographically correct word (present in the dictionary), but the root stem may not be so. Thus, root word, also known as the lemma, will always be present in the dictionary. 

In [7]:
df.head()

Unnamed: 0,id,label,tweet_normalized,tweet_stemmed_porter_stemmer,tweet_stemmed_lancaster_stemmer
0,1,0,USER_MENTION father dysfunctional selfish drag...,user_ment father dysfunct selfish drag kid dys...,user_mention fath dysfunct self drag kid dysfu...
1,2,0,USER_MENTION USER_MENTION thanks lyft credit c...,user_ment user_ment thank lyft credit cant use...,user_mention user_mention thank lyft credit ca...
2,3,0,bihday majesty,bihday majesti,bihday majesty
3,4,0,model love u take u time ur,model love u take u time ur,model lov u tak u tim ur
4,5,0,factsguide society motivation,factsguid societi motiv,factsguid socy mot


In [17]:
def wn_lemmatizer(tweet):
    wn = WordNetLemmatizer()
    tweet = ' '.join([wn.lemmatize(word) for word in tweet.split()])
    return tweet

wn_lemmatizer("My system keeps crashing his crashed yesterday, ours crashes daily")

'My system keep crashing his crashed yesterday, ours crash daily'

In [18]:
import time
start_time = time.time()
for i in range(len(df)): 
    df.at[i, 'tweet_lemmatized_wordnet'] = wn_lemmatizer(str(df['tweet_normalized'][i]))
print("--- %s seconds ---" % (time.time() - start_time))

--- 2.821909189224243 seconds ---


In [19]:
df.head()

Unnamed: 0,id,label,tweet_normalized,tweet_stemmed_porter_stemmer,tweet_stemmed_lancaster_stemmer,tweet_stemmed_snowball_stemmer,tweet_lemmatized_wordnet
0,1,0,USER_MENTION father dysfunctional selfish drag...,user_ment father dysfunct selfish drag kid dys...,user_mention fath dysfunct self drag kid dysfu...,user_ment father dysfunct selfish drag kid dys...,USER_MENTION father dysfunctional selfish drag...
1,2,0,USER_MENTION USER_MENTION thanks lyft credit c...,user_ment user_ment thank lyft credit cant use...,user_mention user_mention thank lyft credit ca...,user_ment user_ment thank lyft credit cant use...,USER_MENTION USER_MENTION thanks lyft credit c...
2,3,0,bihday majesty,bihday majesti,bihday majesty,bihday majesti,bihday majesty
3,4,0,model love u take u time ur,model love u take u time ur,model lov u tak u tim ur,model love u take u time ur,model love u take u time ur
4,5,0,factsguide society motivation,factsguid societi motiv,factsguid socy mot,factsguid societi motiv,factsguide society motivation


In [20]:
df.to_csv('../data/processed/train_stem_lem.csv', encoding='utf-8', index=False)