In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk # natural language tool kit

In [112]:
# Load the tab-separated training data into a DataFrame.
initialFrame = pd.read_csv('train.tsv', sep='\t')

In [113]:
initialFrame

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [114]:
# Cleans the initial frame
def cleanInitialFrame(df):
    """Keep only the first row of every run of consecutive equal SentenceIds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'PhraseId', 'SentenceId', 'Phrase'
        and 'Sentiment'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly those four columns and a fresh 0..n-1 index,
        holding the first row of each run of identical SentenceId values.
        A SentenceId that re-appears later, after a different id, starts a new
        run and is kept again.
    """
    columns = ['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']

    # A row opens a new run when its id differs from the id on the previous
    # row. shift() puts NaN in front of the first row, so the first row always
    # counts as a new run -- whatever its id is (including 0).
    sentence_ids = df['SentenceId']
    starts_new_run = sentence_ids.ne(sentence_ids.shift()).to_numpy()

    # Positional boolean mask, so a non-unique index on df cannot confuse it.
    return df.loc[starts_new_run, columns].reset_index(drop=True)

In [115]:
# Reduce the raw frame to one row per sentence: cleanInitialFrame keeps the
# first row of each SentenceId run (in the preview above that first row is the
# full sentence and the following rows are its sub-phrases).
df = cleanInitialFrame(initialFrame)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,64,2,"This quiet , introspective and entertaining in...",4
2,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1
3,117,4,A positively thrilling combination of ethnogra...,3
4,157,5,Aggressive self-glorification and a manipulati...,1


In [104]:
def lowerAllPhrases(df):
    """Lower-case every entry of the 'Phrase' column, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'Phrase'. The column is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, to allow either
        ``lowerAllPhrases(df)`` or ``df = lowerAllPhrases(df)``.
    """
    # Vectorised and purely positional: unlike a counter paired with
    # label-based df.at[...] writes, this stays correct when the index
    # contains duplicate labels.
    df['Phrase'] = df['Phrase'].str.lower()
    return df

In [105]:
# lowerAllPhrases rewrites df['Phrase'] on df itself (it also returns df),
# so the return value can be ignored here.
lowerAllPhrases(df)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,64,2,"this quiet , introspective and entertaining in...",4
2,82,3,"even fans of ismail merchant 's work , i suspe...",1
3,117,4,a positively thrilling combination of ethnogra...,3
4,157,5,aggressive self-glorification and a manipulati...,1


In [106]:
# Remove non-ASCII characters by encoding to ASCII and dropping what cannot be encoded
def asciiClean(df):
    """Strip every non-ASCII character from the 'Phrase' column, in place.

    Each phrase is encoded to ASCII with ``errors='ignore'`` (characters that
    have no ASCII encoding are dropped, not replaced) and decoded back to str.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'Phrase'. The column is overwritten.

    Returns
    -------
    None
    """
    # Vectorised and positional, so duplicate index labels are handled
    # correctly (row-wise df.at[...] writes are label-based).
    ascii_bytes = df['Phrase'].str.encode('ascii', errors='ignore')
    df['Phrase'] = ascii_bytes.str.decode('ascii')

In [107]:
# asciiClean modifies df['Phrase'] in place and returns nothing.
asciiClean(df)
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,64,2,"this quiet , introspective and entertaining in...",4
2,82,3,"even fans of ismail merchant 's work , i suspe...",1
3,117,4,a positively thrilling combination of ethnogra...,3
4,157,5,aggressive self-glorification and a manipulati...,1


In [108]:
df[0:10]

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,64,2,"this quiet , introspective and entertaining in...",4
2,82,3,"even fans of ismail merchant 's work , i suspe...",1
3,117,4,a positively thrilling combination of ethnogra...,3
4,157,5,aggressive self-glorification and a manipulati...,1
5,167,6,a comedy-drama of nearly epic proportions root...,4
6,199,7,"narratively , trouble every day is a plodding ...",1
7,214,8,"the importance of being earnest , so thick wit...",3
8,248,9,but it does n't leave you with much .,1
9,260,10,you could hate it for the same reason .,1


In [109]:
import re
def removeSpaces(df):
    """Remove the whitespace that precedes an apostrophe in each phrase, in place.

    This re-attaches detached clitics, e.g. "merchant 's" -> "merchant's" and
    "you 're" -> "you're". Whitespace elsewhere is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'Phrase'. The column is overwritten.

    Returns
    -------
    None
    """
    # Same pattern as before (one or more whitespace characters followed by a
    # single quote), applied to the whole column at once. The assignment is
    # positional, so duplicate index labels cannot cross-contaminate rows.
    df['Phrase'] = df['Phrase'].str.replace(r'\s+\'', "'", regex=True)

In [110]:
# In-place step: removes the whitespace in front of apostrophes in df['Phrase'].
removeSpaces(df)

In [111]:
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,64,2,"this quiet , introspective and entertaining in...",4
2,82,3,"even fans of ismail merchant's work , i suspec...",1
3,117,4,a positively thrilling combination of ethnogra...,3
4,157,5,aggressive self-glorification and a manipulati...,1
...,...,...,...,...
8524,155985,8540,... either you're willing to go with this clau...,2
8525,155998,8541,"despite these annoyances , the capable claybur...",2
8526,156022,8542,-lrb- tries -rrb- to parody a genre that's alr...,1
8527,156032,8543,the movie's downfall is to substitute plot for...,1


In [87]:
import contractions
# Add a custom mapping to the `contractions` library so that the stand-alone
# token "n't" is rewritten to "not" (the phrases contain split forms such as
# "does n't", and expandContractions fixes one whitespace-separated token at a time).
contractions.add('n\'t', 'not')
def expandContractions(df):
    """Expand contractions in every phrase of the 'Phrase' column, in place.

    Each phrase is split on whitespace, every token is passed through
    ``contractions.fix`` on its own, and the results are re-joined with single
    spaces (so runs of whitespace collapse to one space).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'Phrase'. The column is overwritten.

    Returns
    -------
    None
    """
    def _expand(phrase):
        # Token-by-token so that custom single-token entries are matched.
        return ' '.join(contractions.fix(token) for token in phrase.split())

    # Positional column assignment: correct even when the index has duplicate
    # labels, which label-based df.at[...] writes are not.
    df['Phrase'] = df['Phrase'].map(_expand)

In [88]:
# In-place step: expands contractions token by token in df['Phrase'].
expandContractions(df)

In [89]:
# Fetch the NLTK data packages needed below: the stop-word lists
# (stopwords.words) and the 'punkt' models that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Allen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Allen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [90]:
def removeStopWords(df):
    """Drop English stop words from every phrase of the 'Phrase' column, in place.

    Each phrase is tokenised with ``word_tokenize``, tokens found in NLTK's
    English stop-word list are removed, and the remaining tokens are glued
    back together with ``TreebankWordDetokenizer``.

    The comparison is case-sensitive (a token is dropped only if it appears in
    the stop-word list exactly as written), so phrases should be lower-cased
    first, as lowerAllPhrases does.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'Phrase'. The column is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    stop_words = set(stopwords.words('english'))
    # One detokenizer for the whole column instead of one per phrase.
    detokenizer = TreebankWordDetokenizer()

    def _strip_stop_words(phrase):
        kept = [w for w in word_tokenize(phrase) if w not in stop_words]
        return detokenizer.detokenize(kept)

    # Positional column assignment; no counter/iterrows bookkeeping needed and
    # duplicate index labels are handled correctly.
    df['Phrase'] = df['Phrase'].map(_strip_stop_words)
    return df

In [91]:
# removeStopWords edits df['Phrase'] in place and returns the same frame,
# so this rebinding leaves df pointing at the object it already named.
df = removeStopWords(df)
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,series escapades demonstrating adage good goos...,1
1,64,2,"quiet, introspective entertaining independent ...",4
2,82,3,"even fans ismail merchant's work, suspect, wou...",1
3,117,4,positively thrilling combination ethnography i...,3
4,157,5,aggressive self-glorification manipulative whi...,1
...,...,...,...,...
8524,155985,8540,... either willing go claustrophobic concept.,2
8525,155998,8541,"despite annoyances, capable clayburgh tambor r...",2
8526,156022,8542,-lrb- tries -rrb- parody genre already joke un...,1
8527,156032,8543,movie's downfall substitute plot personality.,1


In [61]:
# TO DO:
# * preprocessing
#     - replace all punctuation with spaces
#     - lemmatize words
# * reduce to multi-dimensional vector
#     - We have options including: Bag of words
#     - BERT
#     - TF-IDF
# * classification
#     - try a bunch of classifiers
# * graphing
#     - graph the outputs of our classifiers
# * presentation
#     - create a presentation

In [62]:
# Lemmatize words and remove punctuation

In [92]:
import string

In [95]:
from nltk.stem import WordNetLemmatizer
def removePunctuation(df):
    """Delete all ASCII punctuation from every phrase of 'Phrase', in place.

    Each phrase is split on whitespace and every character listed in
    ``string.punctuation`` is deleted from each token (so "self-glorification"
    becomes "selfglorification"). Tokens that end up empty -- i.e. tokens that
    consisted only of punctuation such as "," or "..." -- are dropped, and the
    survivors are joined with single spaces, so no double, leading or trailing
    spaces are left behind.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'Phrase'. The column is overwritten.

    Returns
    -------
    None
    """
    # Translation table that maps every punctuation character to "deleted".
    drop_punctuation = str.maketrans('', '', string.punctuation)

    def _strip(phrase):
        stripped = (token.translate(drop_punctuation) for token in phrase.split())
        return ' '.join(token for token in stripped if token)

    # Positional column assignment; safe with duplicate index labels.
    df['Phrase'] = df['Phrase'].map(_strip)
        
def lemmatizPhrases(df):
    """Unfinished stub: intended to lemmatize the phrases in df.

    TODO: the body only constructs a WordNetLemmatizer and never applies it,
    so calling this leaves df unchanged and returns None.
    """
    lemmatizer = WordNetLemmatizer()
    

In [96]:
# In-place step: strips punctuation characters from df['Phrase'].
removePunctuation(df)

In [97]:
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,series escapades demonstrating adage good goos...,1
1,64,2,quiet introspective entertaining independent w...,4
2,82,3,even fans ismail merchants work suspect would ...,1
3,117,4,positively thrilling combination ethnography i...,3
4,157,5,aggressive selfglorification manipulative whit...,1
...,...,...,...,...
8524,155985,8540,either willing go claustrophobic concept,2
8525,155998,8541,despite annoyances capable clayburgh tambor re...,2
8526,156022,8542,lrb tries rrb parody genre already joke united...,1
8527,156032,8543,movies downfall substitute plot personality,1


In [20]:
class Selene(object):
    """Per-word sentiment statistics over a frame of labelled phrases.

    The frame must have a string column 'Phrase' and a column 'Sentiment'
    whose values are the labels 0-4. Phrases are tokenised with
    ``word_tokenize`` and all counts are counts of whole tokens (a word is not
    counted when it merely occurs inside a longer word).

    Typical call order::

        s = Selene(df)
        s.extractUniqueWords()       # fills unique, apperances, wordsDict
        s.findWordSentOccurances()   # fills wordSentOcc (needs unique)
        s.findWordSentVal()          # fills wordSentVal (needs the two above)
    """

    # Sentiment labels that are tallied, mapped to the string keys used in the
    # per-word dictionaries. Rows with any other label are ignored.
    _SENTIMENT_KEYS = {0: '0', 1: '1', 2: '2', 3: '3', 4: '4'}

    def __init__(self, dataFrame):
        """Store the frame and create the (initially empty) result tables."""
        self.df = dataFrame
        self.unique = []       # vocabulary, in order of first appearance
        self.apperances = []   # token count of each word, parallel to self.unique
        self.wordsDict = {}    # word -> total token count
        self.wordSentOcc = {}  # word -> {'0'..'4': token count under that label}
        self.wordSentVal = {}  # word -> average label and per-label probabilities

    @staticmethod
    def _isVocabularyWord(w):
        """True if token w is longer than 2 characters, does not start with
        '-', and contains neither a digit nor a '.'."""
        if len(w) <= 2 or w[0] == '-':
            return False
        return not any(ch.isdigit() or ch == '.' for ch in w)

    def setUniqueWords(self):
        """Collect the vocabulary of self.df['Phrase'] into self.unique.

        Words are appended in order of first appearance; words already in
        self.unique are not added again. Prints the vocabulary size.
        """
        # Set for O(1) membership tests; the list keeps first-seen order.
        seen = set(self.unique)
        for phrase in self.df['Phrase']:
            for w in word_tokenize(phrase):
                if w not in seen and self._isVocabularyWord(w):
                    seen.add(w)
                    self.unique.append(w)

        print("There are {} unique words in this data.".format(len(self.unique)))

    def setApperances(self):
        """Count how many tokens of each vocabulary word occur in the phrases.

        Rebuilds self.apperances (parallel to self.unique) from scratch, so
        calling it repeatedly is safe, and prints the most frequent word.
        """
        # Single pass over the corpus instead of one scan per vocabulary word.
        counts = dict.fromkeys(self.unique, 0)
        for phrase in self.df['Phrase']:
            for token in word_tokenize(phrase):
                if token in counts:
                    counts[token] += 1
        self.apperances = [counts[word] for word in self.unique]

        if not self.apperances:
            # max() of an empty list would raise; nothing to report.
            print("There are no unique words to count.")
            return

        max_app = max(self.apperances)
        max_ind = self.apperances.index(max_app)

        print("The word with the most apperances is {} with {} "
              "apperances.".format(self.unique[max_ind], max_app))

    def setWordsDict(self):
        """Fill self.wordsDict (word -> count) from unique/apperances and print it.

        Raises IndexError if self.apperances is shorter than self.unique,
        i.e. if setApperances has not been run for the current vocabulary.
        """
        for index, word in enumerate(self.unique):
            self.wordsDict[word] = self.apperances[index]

        print("Here is all the unique words and their number of occurances:")

        for key, value in self.wordsDict.items():
            print(key, ": ", value)

    def extractUniqueWords(self):
        """Run setUniqueWords, setApperances and setWordsDict, in that order."""
        self.setUniqueWords()
        self.setApperances()
        self.setWordsDict()

    def findWordSentOccurances(self):
        """Count, per vocabulary word, its tokens under each sentiment label.

        Sets self.wordSentOcc[word] to a dict with keys '0'..'4' for every
        word in self.unique, then prints the whole table. Rows whose
        'Sentiment' is not one of 0-4 contribute nothing.
        """
        occurrences = {
            word: {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0}
            for word in self.unique
        }

        # One pass over the rows; each phrase is tokenised once and each token
        # is credited to the bucket of the row's sentiment label.
        for phrase, senti in zip(self.df['Phrase'], self.df['Sentiment']):
            label = self._SENTIMENT_KEYS.get(senti)
            if label is None:
                continue
            for token in word_tokenize(phrase):
                bucket = occurrences.get(token)
                if bucket is not None:
                    bucket[label] += 1

        self.wordSentOcc.update(occurrences)

        print("All the times each word has appeared with a certain sentiment has been set."
              " The results are the following:")
        for key, value in self.wordSentOcc.items():
            print(key, ": ", value)

    def findProbabilities(self, key):
        """Return the share of word `key`'s occurrences under each label.

        Returns a 5-tuple (prob0, prob1, prob2, prob3, prob4), each being
        wordSentOcc[key][label] / wordsDict[key].

        Raises KeyError if `key` is missing from either table and
        ZeroDivisionError if its total count is 0.
        """
        total = self.wordsDict[key]
        counts = self.wordSentOcc[key]
        return tuple(counts[label] / total for label in ('0', '1', '2', '3', '4'))

    def findWordSentVal(self):
        """Compute the average sentiment and label probabilities of every word.

        For each word in self.wordSentOcc stores in self.wordSentVal a dict
        with the keys 'Avg Value' (occurrence-weighted mean label) and
        'Probability of 0' .. 'Probability of 4', then prints the table.
        """
        for key, value in self.wordSentOcc.items():
            weighted_sum = sum(int(label) * value[label]
                               for label in ('0', '1', '2', '3', '4'))
            sentVal = weighted_sum / self.wordsDict[key]
            prob0, prob1, prob2, prob3, prob4 = self.findProbabilities(key)
            # All five probability keys share one spelling so that lookups by
            # 'Probability of <label>' work for every label.
            self.wordSentVal[key] = {'Avg Value': sentVal, 'Probability of 0': prob0,
                                     'Probability of 1': prob1, 'Probability of 2': prob2,
                                     'Probability of 3': prob3, 'Probability of 4': prob4}

        print("Here is the average value of the sentiment and probabilities of each word in the training data:")
        for key, value in self.wordSentVal.items():
            print(key, ": ", value)

    def getSentiment(self, testDataFrame, weightPercent = .1, minimumOccurances = 3):
        """Not implemented yet: currently does nothing and returns None."""
        pass
                
                
        

In [None]:
# Build the word statistics. Order matters: extractUniqueWords fills the
# vocabulary and word counts that findWordSentOccurances and findWordSentVal
# read, and findWordSentVal also reads the table findWordSentOccurances fills.
test = Selene(df)
test.extractUniqueWords()
test.findWordSentOccurances()
test.findWordSentVal()

There are 15925 unique words in this data.
