In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk # natural language tool kit

In [4]:
# Load the tab-separated training data into a DataFrame
initialFrame = pd.read_csv('train.tsv', sep='\t')

In [5]:
# Show the raw frame exactly as it was loaded from train.tsv
initialFrame

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [6]:
# Cleans the initial frame
def cleanInitialFrame(df):
    """Keep only the first row of each run of consecutive equal SentenceIds.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'PhraseId', 'SentenceId', 'Phrase' and
        'Sentiment'. Rows are processed in their existing order.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly those four columns and a fresh 0..n-1
        index, holding the first row of every run of identical, adjacent
        SentenceId values. The input frame is not modified.
    """
    columns = ['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']
    sentence_ids = df['SentenceId']

    # A row opens a new sentence when its id differs from the id of the row
    # directly above it. The very first row is compared against a missing
    # value and is therefore always kept, so a SentenceId of 0 is not
    # confused with a "nothing seen yet" sentinel.
    starts_sentence = sentence_ids.ne(sentence_ids.shift())

    # Use a positional (NumPy) mask so the selection does not depend on the
    # index labels of the incoming frame.
    return df.loc[starts_sentence.to_numpy(), columns].reset_index(drop=True)

In [7]:
# Reduce the raw frame with cleanInitialFrame, then preview the first rows
df = cleanInitialFrame(initialFrame)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,64,2,"This quiet , introspective and entertaining in...",4
2,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1
3,117,4,A positively thrilling combination of ethnogra...,3
4,157,5,Aggressive self-glorification and a manipulati...,1


In [8]:
def lowerAllPhrases(df):
    """Lowercase every entry of the 'Phrase' column in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'Phrase'. The column is
        overwritten with its lowercased version.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, for convenience.

    Notes
    -----
    The conversion is done on the whole column at once, so the result does
    not depend on the index labels being unique. Missing values are passed
    through unchanged instead of raising.
    """
    df['Phrase'] = df['Phrase'].str.lower()
    return df

In [9]:
# lowerAllPhrases modifies df in place; its return value is not needed here
lowerAllPhrases(df)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,64,2,"this quiet , introspective and entertaining in...",4
2,82,3,"even fans of ismail merchant 's work , i suspe...",1
3,117,4,a positively thrilling combination of ethnogra...,3
4,157,5,aggressive self-glorification and a manipulati...,1


In [10]:
# Remove non-ASCII characters from every phrase
def asciiClean(df):
    """Strip all non-ASCII characters from the 'Phrase' column in place.

    Each phrase is encoded to ASCII with undecodable characters ignored
    (i.e. dropped) and then decoded back to text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'Phrase'. The column is
        overwritten with the cleaned text.

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    The whole column is processed at once, so the result does not depend on
    the index labels being unique.
    """
    ascii_bytes = df['Phrase'].str.encode('ascii', errors='ignore')
    df['Phrase'] = ascii_bytes.str.decode('ascii')

In [11]:
# asciiClean rewrites df['Phrase'] in place and returns nothing
asciiClean(df)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,64,2,"this quiet , introspective and entertaining in...",4
2,82,3,"even fans of ismail merchant 's work , i suspe...",1
3,117,4,a positively thrilling combination of ethnogra...,3
4,157,5,aggressive self-glorification and a manipulati...,1


In [12]:
# Look at the first ten rows of the frame
df[0:10]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,64,2,"this quiet , introspective and entertaining in...",4
2,82,3,"even fans of ismail merchant 's work , i suspe...",1
3,117,4,a positively thrilling combination of ethnogra...,3
4,157,5,aggressive self-glorification and a manipulati...,1
5,167,6,a comedy-drama of nearly epic proportions root...,4
6,199,7,"narratively , trouble every day is a plodding ...",1
7,214,8,"the importance of being earnest , so thick wit...",3
8,248,9,but it does n't leave you with much .,1
9,260,10,you could hate it for the same reason .,1


In [13]:
import re
def removeSpaces(df):
    """Remove whitespace that directly precedes an apostrophe in 'Phrase'.

    For example "merchant 's" becomes "merchant's". Whitespace that is not
    immediately followed by an apostrophe is left alone, so "does n't" is
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'Phrase'. The column is
        overwritten with the cleaned text.

    Returns
    -------
    None
        The frame is modified in place.
    """
    # One vectorised regex pass over the column instead of a full-column
    # replace for every row.
    df['Phrase'] = df['Phrase'].str.replace(r"\s+'", "'", regex=True)

In [14]:
# Drop whitespace that sits directly before an apostrophe (df is modified in place)
removeSpaces(df)

In [15]:
# Re-inspect the first ten rows after removeSpaces
df[0:10]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,64,2,"this quiet , introspective and entertaining in...",4
2,82,3,"even fans of ismail merchant's work , i suspec...",1
3,117,4,a positively thrilling combination of ethnogra...,3
4,157,5,aggressive self-glorification and a manipulati...,1
5,167,6,a comedy-drama of nearly epic proportions root...,4
6,199,7,"narratively , trouble every day is a plodding ...",1
7,214,8,"the importance of being earnest , so thick wit...",3
8,248,9,but it does n't leave you with much .,1
9,260,10,you could hate it for the same reason .,1


In [16]:
# Quick check of the third-party `contractions` package: a word that
# contains no contraction is expected to be printed unchanged
import contractions
print(contractions.fix("quiet"))

quiet


In [20]:
# Inspect the dtype of each column in df
df.dtypes

PhraseId       int64
SentenceId     int64
Phrase        object
Sentiment      int64
dtype: object

In [22]:
# Print every whitespace-separated token of every phrase, one per line
for phrase in df['Phrase']:
    for token in phrase.split():
        print(token)

a
series
of
escapades
demonstrating
the
adage
that
what
is
good
for
the
goose
is
also
good
for
the
gander
,
some
of
which
occasionally
amuses
but
none
of
which
amounts
to
much
of
a
story
.
this
quiet
,
introspective
and
entertaining
independent
is
worth
seeking
.
even
fans
of
ismail
merchant's
work
,
i
suspect
,
would
have
a
hard
time
sitting
through
this
one
.
a
positively
thrilling
combination
of
ethnography
and
all
the
intrigue
,
betrayal
,
deceit
and
murder
of
a
shakespearean
tragedy
or
a
juicy
soap
opera
.
aggressive
self-glorification
and
a
manipulative
whitewash
.
a
comedy-drama
of
nearly
epic
proportions
rooted
in
a
sincere
performance
by
the
title
character
undergoing
midlife
crisis
.
narratively
,
trouble
every
day
is
a
plodding
mess
.
the
importance
of
being
earnest
,
so
thick
with
wit
it
plays
like
a
reading
from
bartlett's
familiar
quotations
but
it
does
n't
leave
you
with
much
.
you
could
hate
it
for
the
same
reason
.
there's
little
to
recommend
snow
dogs
,
unless
one
con