#### Tokenization:

In [4]:
import nltk
import re
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, TweetTokenizer, TreebankWordTokenizer, wordpunct_tokenize
import pandas as pd
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\joelj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joelj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### sent_tokenize & word_tokenize:

In [5]:
data = pd.read_csv(filepath_or_buffer="../data/train.csv",delimiter=",", usecols=['text','sentiment'])
print("Shape:",data.shape)
print(data.head())
print(data.dtypes)

Shape: (27481, 2)
                                                text sentiment
0                I`d have responded. if I were going   neutral
1      Sooo SAD I will miss you here in San Diego!!!  negative
2                          my boss is bullying me...  negative
3                     what interview! leave me alone  negative
4   Sons of ****, why couldn`t they put them on t...  negative
text         object
sentiment    object
dtype: object


In [6]:
print(data.isnull().sum())
data.isnull()

text         1
sentiment    0
dtype: int64


Unnamed: 0,text,sentiment
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
27476,False,False
27477,False,False
27478,False,False
27479,False,False


In [7]:
val = data.isnull()
index = [i for i in range(len(val)) if val['text'][i] == True]
print(index)

[314]


In [8]:
print(data.iloc[index,:])

    text sentiment
314  NaN   neutral


In [9]:
data.drop(axis=0,index=index,inplace=True)
print(data.isnull().sum())
data.reset_index(drop=True, inplace=True)
data.shape

text         0
sentiment    0
dtype: int64


(27480, 2)

In [10]:
for index in tqdm(range(len(data['text']))): 
    token_sentences = sent_tokenize(data['text'][index], language='english')
    text_data = ""
    for sentence in token_sentences:
        words = word_tokenize(sentence)
        words = [word.lower() for word in words]
        text_data += " ".join(word for word in words if word not in stopwords.words('english'))
    text_data = re.sub('[^a-zA-Z0-9]+'," ",text_data) 
    text_data = re.sub('([\s]+){2,}', "", text_data)
    data.loc[index,'text'] = text_data
print(data.head())

100%|██████████| 27480/27480 [02:34<00:00, 177.73it/s]

                               text sentiment
0                   responded going   neutral
1          sooo sad miss san diego   negative
2                    boss bullying   negative
3             interview leave alone  negative
4  sons put releases already bought  negative





#### Tweet Tokenizer:

In [17]:
data2 = pd.read_csv(filepath_or_buffer="../data/train.csv",delimiter=",", usecols=['text','sentiment'])

In [18]:
data2.isnull().sum()

text         1
sentiment    0
dtype: int64

In [19]:
data2.dropna(axis=0, inplace=True)

In [20]:
data2.isnull().sum()
data2.shape

(27480, 2)

In [21]:
data2.reset_index(drop=True, inplace=True)

In [22]:
tweet_tokenizer = TweetTokenizer()
tweet_token = ""
for index in tqdm(range(len(data2['text']))):
    tokenized = tweet_tokenizer.tokenize(data2['text'][index])
    tokenized = [token.lower() for token in tokenized]
    tweet_token += " ".join(token for token in tokenized if token not in stopwords.words('english'))
    tweet_token = re.sub("[^a-zA-Z0-9]+", " ", tweet_token)
    tweet_token = re.sub("([\s]+){2,}","",tweet_token)
    data2.loc[index,'text'] = tweet_token
print(data2.head())

  0%|          | 0/27480 [00:00<?, ?it/s]

100%|██████████| 27480/27480 [28:24<00:00, 16.12it/s]


                                                text sentiment
0                                    responded going   neutral
1            responded goingsooo sad miss san diego   negative
2   responded goingsooo sad miss san diego boss b...  negative
3   responded goingsooo sad miss san diego boss b...  negative
4   responded goingsooo sad miss san diego boss b...  negative


#### TreebankWordTokenizer:

In [33]:
data3 = pd.read_csv("../data/train.csv", delimiter=",", usecols=['text','sentiment'])
data3.head(5)

Unnamed: 0,text,sentiment
0,I`d have responded. if I were going,neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [34]:
data3.dropna(axis=0,inplace=True)
data3.iloc[312:316]

Unnamed: 0,text,sentiment
312,DUSTBIN BABY ON AT 11.30 Cannot wait x,positive
313,"Not going to dwell on it. It happened, it`s p...",negative
315,It looks like the office TV DOES get MLB Netwo...,neutral
316,Home empty handed. No comics found today. I ...,neutral


In [35]:
data3.reset_index(drop=True,inplace=True)
data3.iloc[312:316]

Unnamed: 0,text,sentiment
312,DUSTBIN BABY ON AT 11.30 Cannot wait x,positive
313,"Not going to dwell on it. It happened, it`s p...",negative
314,It looks like the office TV DOES get MLB Netwo...,neutral
315,Home empty handed. No comics found today. I ...,neutral


#### wordpunct_tokenize:

In [None]:
for index in tqdm(range(len(data['text']))):
    text_data = wordpunct_tokenize(data3['text'][index])
    words = [word.lower() for word in text_data]
    text_data = " ".join(word for word in words if word not in stopwords.words('english'))
    text_data = re.sub("([^A-Za-z0-9]+)", " ", text_data)
    text_data = re.sub("r'[\s]+{2,}'","", text_data)
    data3.loc[index,'text'] = text_data

print(data3.head())

100%|██████████| 27480/27480 [02:31<00:00, 181.50it/s]


                               text sentiment
0                   responded going   neutral
1          sooo sad miss san diego   negative
2                    boss bullying   negative
3             interview leave alone  negative
4  sons put releases already bought  negative
