In [40]:
# Data handling and plotting libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk # Natural Language Toolkit (NLTK)
# Download the 'stopwords' and 'punkt' NLTK packages; these calls run every
# time the cell is executed and report when a package is already up to date.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK helpers used by the text-cleaning functions in this notebook
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andyv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andyv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
# Read the tab-separated training file into a DataFrame.
initialFrame = pd.read_csv('train.tsv', sep='\t')

In [29]:
# Show the raw frame; as the last expression of the cell it is rendered by the notebook.
initialFrame

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [30]:
# Cleans the initial frame
def cleanInitialFrame(df):
    """Reduce the frame to one row per sentence.

    For every distinct value of 'SentenceId' only the first row in which it
    occurs is kept; all later rows carrying the same ID are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns 'PhraseId', 'SentenceId',
        'Phrase' and 'Sentiment'.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly those four columns, the kept rows in their
        original order, and a fresh 0..n-1 index. The input is not modified.
    """
    columns = ['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']

    # De-duplicate on the actual SentenceId values rather than on a running
    # counter, so gaps or non-consecutive IDs cannot let extra rows through.
    first_rows = df.drop_duplicates(subset='SentenceId', keep='first')

    # Return a clean frame
    return first_rows[columns].reset_index(drop=True)

In [31]:
def lowerAllPhrases(df):
    """Lower-case every entry of the 'Phrase' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a text column named 'Phrase'.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in; its 'Phrase' column is
        overwritten with the lower-cased text. Missing values stay missing.
    """
    # One vectorised pass over the column instead of a full-column
    # replace() for every single row.
    df['Phrase'] = df['Phrase'].str.lower()
    return df

In [32]:
# Build the working frame: reduce the raw data with cleanInitialFrame,
# lower-case its phrases, then print the result.
df = lowerAllPhrases(cleanInitialFrame(initialFrame))
print(df)

      PhraseId  SentenceId                                             Phrase  \
0            1           1  a series of escapades demonstrating the adage ...   
1           64           2  this quiet , introspective and entertaining in...   
2           82           3  even fans of ismail merchant 's work , i suspe...   
3          117           4  a positively thrilling combination of ethnogra...   
4          157           5  aggressive self-glorification and a manipulati...   
...        ...         ...                                                ...   
8539    155985        8540  ... either you 're willing to go with this cla...   
8540    155998        8541  despite these annoyances , the capable claybur...   
8541    156022        8542  -lrb- tries -rrb- to parody a genre that 's al...   
8542    156032        8543  the movie 's downfall is to substitute plot fo...   
8543    156040        8544  the film is darkly atmospheric , with herrmann...   

      Sentiment  
0        

In [46]:
def removeStopWords(df):
    """Strip English stop words from every entry of the 'Phrase' column.

    Each phrase is split with ``word_tokenize``, tokens found in NLTK's
    English stop-word list are dropped (compared case-insensitively), and the
    remaining tokens are joined back into a string with
    ``TreebankWordDetokenizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a text column named 'Phrase'.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in; its 'Phrase' column is
        overwritten with the filtered text. Missing values stay missing.
    """
    stop_words = set(stopwords.words('english'))
    detokenizer = TreebankWordDetokenizer()  # one instance serves every phrase

    def strip_stop_words(phrase):
        # Compare on the lower-cased token so capitalised stop words are
        # removed as well.
        kept = [w for w in word_tokenize(phrase) if w.lower() not in stop_words]
        return detokenizer.detokenize(kept)

    # Transform each row from its own text only; a value-based replace() would
    # also rewrite any other row whose text happens to match.
    df['Phrase'] = df['Phrase'].map(strip_stop_words, na_action='ignore')
    return df

In [47]:
# Strip stop words from the phrases and show the resulting frame.
# NOTE(review): df is rebound to its own transform, so re-running this cell
# applies removeStopWords to already-filtered text.
df = removeStopWords(df)
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,series escapades demonstrating adage good goos...,1
1,64,2,"quiet, introspective entertaining independent ...",4
2,82,3,"even fans ismail merchant's work, suspect, wou...",1
3,117,4,positively thrilling combination ethnography i...,3
4,157,5,aggressive self-glorification manipulative whi...,1
...,...,...,...,...
8539,155985,8540,... either're willing go claustrophobic concep...,2
8540,155998,8541,"despite annoyances, capable clayburgh tambor r...",2
8541,156022,8542,-lrb- tries -rrb- parody genre's already joke ...,1
8542,156032,8543,movie's downfall substitute plot personality.,1
