In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk # natural language tool kit

In [2]:
initialFrame = pd.read_csv('train.tsv', delimiter = '\t');

In [3]:
initialFrame

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [4]:
# Cleans the initial frame
def cleanInitialFrame(df):
    """Keep only the first row of every run of consecutive equal ``SentenceId`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``PhraseId``, ``SentenceId``,
        ``Phrase`` and ``Sentiment``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly those four columns, one row per run of
        identical ``SentenceId`` values, re-indexed from 0.
    """
    keptColumns = ['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']
    sentenceIds = df['SentenceId']

    # A row opens a new run when its id differs from the id of the row directly
    # above it. The first row is compared against the NaN produced by shift(),
    # so it is always kept -- there is no sentinel id (previously a hard-coded
    # 0) that could swallow a frame whose first SentenceId happens to be 0.
    startsRun = sentenceIds.ne(sentenceIds.shift()).to_numpy()

    return df.loc[startsRun, keptColumns].reset_index(drop=True)

In [5]:
df = cleanInitialFrame(initialFrame)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,64,2,"This quiet , introspective and entertaining in...",4
2,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1
3,117,4,A positively thrilling combination of ethnogra...,3
4,157,5,Aggressive self-glorification and a manipulati...,1


In [6]:
def lowerAllPhrases(df):
    """Lower-case every entry of the ``Phrase`` column, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``Phrase``. The column is replaced
        on this very object.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # A single whole-column assignment is positional. The earlier per-row
    # ``df.at[label, 'Phrase'] = ...`` writes addressed rows by index label,
    # so every row sharing a label was overwritten with the same text.
    df['Phrase'] = df['Phrase'].str.lower()
    return df

In [7]:
lowerAllPhrases(df)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,64,2,"this quiet , introspective and entertaining in...",4
2,82,3,"even fans of ismail merchant 's work , i suspe...",1
3,117,4,a positively thrilling combination of ethnogra...,3
4,157,5,aggressive self-glorification and a manipulati...,1


In [8]:
# Remove non-ASCII characters from every phrase (ASCII encode with errors ignored, then decode)
def asciiClean(df):
    """Strip every non-ASCII character from the ``Phrase`` column, in place.

    Each phrase is encoded to ASCII with un-encodable characters dropped and
    then decoded back to text, so characters outside ASCII simply disappear.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``Phrase``; the column is replaced
        on this object. Nothing is returned.
    """
    # Whole-column, positional assignment instead of per-row ``df.at`` writes,
    # which address rows by label and misbehave on duplicate index labels.
    asciiBytes = df['Phrase'].str.encode('ascii', errors='ignore')
    df['Phrase'] = asciiBytes.str.decode('ascii')

In [9]:
asciiClean(df)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,64,2,"this quiet , introspective and entertaining in...",4
2,82,3,"even fans of ismail merchant 's work , i suspe...",1
3,117,4,a positively thrilling combination of ethnogra...,3
4,157,5,aggressive self-glorification and a manipulati...,1


In [10]:
df[0:10]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,64,2,"this quiet , introspective and entertaining in...",4
2,82,3,"even fans of ismail merchant 's work , i suspe...",1
3,117,4,a positively thrilling combination of ethnogra...,3
4,157,5,aggressive self-glorification and a manipulati...,1
5,167,6,a comedy-drama of nearly epic proportions root...,4
6,199,7,"narratively , trouble every day is a plodding ...",1
7,214,8,"the importance of being earnest , so thick wit...",3
8,248,9,but it does n't leave you with much .,1
9,260,10,you could hate it for the same reason .,1


In [11]:
import re
def removeSpaces(df):
    """Delete the whitespace that directly precedes an apostrophe, in place.

    For example ``"merchant 's work"`` becomes ``"merchant's work"``. Text such
    as ``"does n't"`` is left alone because the whitespace there is not
    immediately followed by the apostrophe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``Phrase``; the column is replaced
        on this object. Nothing is returned.
    """
    # Compiled up front so the substitution is done with Python's ``re``
    # semantics for ``\s``, exactly like the former per-row ``re.sub`` call.
    gapBeforeApostrophe = re.compile(r"\s+'")

    # One positional whole-column assignment replaces the per-row ``df.at``
    # writes, which address rows by label and break on duplicate index labels.
    df['Phrase'] = df['Phrase'].str.replace(gapBeforeApostrophe, "'", regex=True)

In [12]:
removeSpaces(df)

In [13]:
df[0:10]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,64,2,"this quiet , introspective and entertaining in...",4
2,82,3,"even fans of ismail merchant's work , i suspec...",1
3,117,4,a positively thrilling combination of ethnogra...,3
4,157,5,aggressive self-glorification and a manipulati...,1
5,167,6,a comedy-drama of nearly epic proportions root...,4
6,199,7,"narratively , trouble every day is a plodding ...",1
7,214,8,"the importance of being earnest , so thick wit...",3
8,248,9,but it does n't leave you with much .,1
9,260,10,you could hate it for the same reason .,1


In [14]:
import contractions
contractions.add('n\'t', 'not')
def expandContractions(df):
    """Expand contractions in every phrase of the ``Phrase`` column, in place.

    Each phrase is split on whitespace, every token is passed through
    ``contractions.fix`` on its own, and the results are re-joined with single
    spaces (so runs of whitespace collapse to one space).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``Phrase``; the column is replaced
        on this object. Missing phrases are left as they are. Nothing is
        returned.
    """
    def _expandPhrase(phrase):
        return ' '.join(contractions.fix(token) for token in phrase.split())

    # Positional whole-column assignment; the former per-row ``df.at`` writes
    # address rows by label and overwrite every row sharing a duplicate label.
    df['Phrase'] = df['Phrase'].map(_expandPhrase, na_action='ignore')

In [15]:
expandContractions(df)

In [None]:
import string
def removePunctuation(df):
    """Remove all ``string.punctuation`` characters from every phrase, in place.

    Punctuation is deleted (not replaced by a space), so ``"self-made"``
    becomes ``"selfmade"``. The remaining words are re-joined with single
    spaces; tokens that consisted only of punctuation vanish entirely instead
    of leaving doubled or trailing spaces behind.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``Phrase``; the column is replaced
        on this object. Missing phrases are left as they are. Nothing is
        returned.
    """
    regex = re.compile('[%s]' % re.escape(string.punctuation))

    def _stripPhrase(phrase):
        # Deleting punctuation never adds or removes whitespace, so splitting
        # afterwards yields the cleaned words and silently drops the tokens
        # that were punctuation only.
        return ' '.join(regex.sub('', phrase).split())

    # Positional whole-column assignment; the former per-row ``df.at`` writes
    # address rows by label and overwrite every row sharing a duplicate label.
    df['Phrase'] = df['Phrase'].map(_stripPhrase, na_action='ignore')

In [None]:
removePunctuation(df)

In [16]:
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/muchlogic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/muchlogic/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
def removeStopWords(df):
    """Drop English stop words from every phrase of the ``Phrase`` column, in place.

    Each phrase is tokenized with ``word_tokenize``, tokens whose lower-cased
    form is in NLTK's English stop-word list are discarded, and the remaining
    tokens are glued back together with ``TreebankWordDetokenizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``Phrase``; the column is replaced
        on this object. Missing phrases are left as they are.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    stop_words = set(stopwords.words('english'))
    # Built once; it was previously re-created for every single phrase.
    detokenizer = TreebankWordDetokenizer()

    def _dropStopWords(phrase):
        # Compare case-insensitively: the stop list is lower-case, so a
        # capitalised "The" must be dropped as well. (The old code computed
        # this filter and then immediately overwrote it with a case-sensitive
        # one.)
        kept = [w for w in word_tokenize(phrase) if w.lower() not in stop_words]
        return detokenizer.detokenize(kept)

    # Positional whole-column assignment; the former per-row ``df.at`` writes
    # address rows by label and overwrite every row sharing a duplicate label.
    df['Phrase'] = df['Phrase'].map(_dropStopWords, na_action='ignore')
    return df

In [18]:
df = removeStopWords(df)
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,series escapades demonstrating adage good goos...,1
1,64,2,"quiet, introspective entertaining independent ...",4
2,82,3,"even fans ismail merchant's work, suspect, wou...",1
3,117,4,positively thrilling combination ethnography i...,3
4,157,5,aggressive self-glorification manipulative whi...,1
...,...,...,...,...
8524,155985,8540,... either willing go claustrophobic concept.,2
8525,155998,8541,"despite annoyances, capable clayburgh tambor r...",2
8526,156022,8542,-lrb- tries -rrb- parody genre already joke un...,1
8527,156032,8543,movie's downfall substitute plot personality.,1


In [19]:
# TO DO:
# * preprocessing
#     - replace all grammar with spaces
#     - lemmatize words
# * reduce to multi-dimensional vector
#     - We have options including: Bag of words
#     - BERT
#     - TF-IDF
# * classification
#     - try a bunch of classifiers
# * graphing
#     - graph the outputs of our classifiers
# * presentation
#     - create a presentation

In [20]:
class Selene(object):
    """Per-word frequency and sentiment statistics for a frame of labelled phrases.

    Attributes
    ----------
    df : pandas.DataFrame
        The frame given to the constructor; needs the columns ``Phrase``
        (text) and ``Sentiment`` (integer label 0..4).
    wordsDict : dict
        Maps every purely alphabetic token to a six-slot list. Slot 0 is the
        total number of occurrences; slots 1..5 count the occurrences inside
        phrases labelled with sentiment 0..4 respectively.
    wordSentOcc : dict
        Kept for backward compatibility. It is the very same object as
        ``wordsDict`` (the per-sentiment counts live in slots 1..5).
    wordSentVal : dict
        Filled by ``findWordSentVal``: word -> dict with the average sentiment
        and the probability of each sentiment label.
    """

    # Sentiment labels are the integers 0 .. SENTIMENT_CLASSES - 1.
    SENTIMENT_CLASSES = 5

    def __init__(self, dataFrame, verbose = True):
        """Build the vocabulary and all counts from ``dataFrame``.

        Parameters
        ----------
        dataFrame : pandas.DataFrame
            Frame with ``Phrase`` and ``Sentiment`` columns.
        verbose : bool, optional
            When True (the default, matching the historic behaviour) the
            count tables are printed, one line per word. Pass False to keep
            the output quiet on large vocabularies.
        """
        self.df = dataFrame
        self.verbose = verbose
        self.wordsDict = {}
        self.wordSentOcc = self.wordsDict
        self.wordSentVal = {}

        # Tokenizes phrases and finds frequency & sentiment
        self.setUniqueWords()
        self.setFrequencies()
        self.findSentFrequency()

    @staticmethod
    def _alphaTokens(phrase):
        """Return the tokens of ``phrase`` that consist of letters only."""
        return [w for w in word_tokenize(phrase) if w.isalpha()]

    def _printTable(self, header, table):
        """Print ``header`` and one ``key :  value`` line per entry, if verbose."""
        # getattr keeps this usable on instances created without __init__.
        if getattr(self, 'verbose', True):
            print(header)
            for key, value in table.items():
                print(key, ": ", value)

    # Finds unique words in the list of phrases and registers them in wordsDict
    def setUniqueWords(self):
        """Register every alphabetic token of ``self.df`` with zeroed counters."""
        for phrase in self.df['Phrase']:
            for w in self._alphaTokens(phrase):
                self.wordsDict[w] = [0] * (self.SENTIMENT_CLASSES + 1)

    def setFrequencies(self):
        """Add each word's number of occurrences in ``self.df`` to slot 0."""
        for phrase in self.df['Phrase']:
            # Same tokenizer as setUniqueWords: a plain str.split() would see
            # "good," rather than "good" and miss words next to punctuation.
            for word in self._alphaTokens(phrase):
                if word in self.wordsDict:
                    self.wordsDict[word][0] += 1

    # Finds frequencies of the sentiment per word
    def findSentFrequency(self):
        """Add, per word, the occurrences under each sentiment label to slots 1..5.

        Rows whose label is outside 0..4 are reported and skipped; indexing
        with them would either fail or (for negative labels) wrap around and
        silently increment the wrong slot.
        """
        for phrase, label in zip(self.df['Phrase'], self.df['Sentiment']):
            sentiment = int(label)
            if not 0 <= sentiment < self.SENTIMENT_CLASSES:
                print("Skipping phrase with out-of-range sentiment:", sentiment)
                continue
            for word in self._alphaTokens(phrase):
                if word in self.wordsDict:
                    self.wordsDict[word][sentiment + 1] += 1

        self._printTable("Words = [Frequency, 0 Freq, 1 Freq, 2 Freq, 3 Freq, 4 Freq]",
                         self.wordsDict)

    # Finds probabilities for a word's sentiment
    def findProbabilities(self, key):
        """Return the share of ``key``'s occurrences under each sentiment label.

        Parameters
        ----------
        key : str
            A word present in ``wordsDict`` (``KeyError`` otherwise).

        Returns
        -------
        tuple of float
            Five values, for sentiment 0..4. All zeros when the word has no
            recorded occurrences.
        """
        counts = self.wordsDict[key]
        total = counts[0]
        if total == 0:
            return (0.0,) * self.SENTIMENT_CLASSES
        return tuple(counts[sentiment + 1] / total
                     for sentiment in range(self.SENTIMENT_CLASSES))

    # Finds the average sentiment of a word
    def findWordSentVal(self):
        """Fill ``wordSentVal`` with each word's average sentiment and probabilities.

        Words without any recorded occurrence are skipped, since no average
        can be computed for them.
        """
        for key, counts in self.wordsDict.items():
            total = counts[0]
            if total == 0:
                continue
            weighted = sum(sentiment * counts[sentiment + 1]
                           for sentiment in range(self.SENTIMENT_CLASSES))
            sentVal = weighted / total
            prob0, prob1, prob2, prob3, prob4 = self.findProbabilities(key)
            self.wordSentVal[key] = {'Avg Value': sentVal,'Probability of 0': prob0,
                                     'Probability of 1': prob1, 'Probability of 2': prob2,
                                     'Probability of 3': prob3, 'Probability of 4': prob4}

        self._printTable("Here is the average value of the sentiment and probabilities of each word in the training data:",
                         self.wordSentVal)

    def getSentiment(self, testDataFrame, weightPercent = .1, minimumOccurances = 3):
        """Not implemented yet; currently does nothing and returns None."""
        pass
                
                
        

In [21]:
test = Selene(df)
# test.findWordSentVal()

There are 15925 unique words in this data.
The word with the most apperances is fil with 1499 apperances.
Here is all the unique words and their number of occurances:
series :  50
escapades :  1
demonstrating :  3
adage :  3
good :  315
goose :  5
also :  115
gander :  1
occasionally :  27
amuses :  1
none :  39
amounts :  8
much :  264
story :  462
quiet :  49
introspective :  3
entertaining :  97
independent :  5
worth :  120
seeking :  8
even :  398
fans :  54
ismail :  1
merchant :  6
work :  323
suspect :  13
would :  237
hard :  153
time :  494
sitting :  17
one :  1131
positively :  5
thrilling :  5
combination :  12
ethnography :  1
intrigue :  13
betrayal :  4
deceit :  1
murder :  28
shakespearean :  3
tragedy :  27
juicy :  4
soap :  31
opera :  48
aggressive :  7
self-glorification :  2
manipulative :  18
whitewash :  1
comedy-drama :  4
nearly :  46
epic :  40
proportions :  5
rooted :  2
sincere :  19
performance :  228
title :  61
character :  374
undergoing :  1
midlife

KeyboardInterrupt: 