In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk # Natural Language Toolkit

In [None]:
# Load the tab-separated training data into a DataFrame.
initialFrame = pd.read_csv('train.tsv', sep='\t')

In [None]:
initialFrame

In [None]:
# Cleans the initial frame: keeps one row per sentence
def cleanInitialFrame(df):
    """Return a new frame holding the first row of every run of equal SentenceId.

    A row is kept when its ``SentenceId`` differs from the ``SentenceId`` of
    the row directly above it; the very first row is always kept. (In the
    training file this is presumably the full-sentence phrase -- confirm
    against the data.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'PhraseId', 'SentenceId', 'Phrase' and
        'Sentiment'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The selected rows, restricted to those four columns, with a fresh
        0..n-1 index.
    """
    columns = ['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']
    sentence_ids = df['SentenceId']

    # shift() moves every id down one row, so this compares each row with its
    # predecessor. The first row is compared with NaN and is therefore always
    # kept -- no sentinel id is needed, so a sentence with id 0 is not lost.
    starts_sentence = (sentence_ids != sentence_ids.shift()).to_numpy()

    # The positional (NumPy) mask avoids label alignment problems when the
    # input index is not unique.
    return df.loc[starts_sentence, columns].reset_index(drop=True)

In [None]:
df = cleanInitialFrame(initialFrame)
df.head()

In [None]:
def lowerAllPhrases(df):
    """Lower-case every string in the 'Phrase' column, in place.

    The whole column is replaced in one vectorized assignment, so each row
    receives its own lower-cased value even when the index contains repeated
    labels (row-by-row label writes would overwrite every row sharing a
    label). Missing values are left missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'Phrase' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, to allow ``df = lowerAllPhrases(df)``.
    """
    df['Phrase'] = df['Phrase'].str.lower()
    return df

In [None]:
lowerAllPhrases(df)
df.head()

In [None]:
# Remove non-ASCII characters by encoding to ASCII (dropping what does not fit) and decoding back
def asciiClean(df):
    """Strip every non-ASCII character from the 'Phrase' column, in place.

    Each phrase is encoded to ASCII with ``errors='ignore'`` (characters that
    cannot be encoded are dropped, not replaced) and decoded back to text.
    The column is replaced in one vectorized assignment, so the result does
    not depend on the index labels being unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'Phrase' column. It is modified in place.

    Returns
    -------
    None
    """
    df['Phrase'] = df['Phrase'].str.encode('ascii', 'ignore').str.decode('ascii')

In [None]:
asciiClean(df)
df.head()

In [None]:
df[0:10]

In [None]:
import re
def removeSpaces(df):
    """Delete the whitespace that directly precedes an apostrophe, in place.

    Every run of whitespace followed by ``'`` is replaced by the apostrophe
    alone, e.g. ``"it 's"`` becomes ``"it's"``. The 'Phrase' column is
    replaced in one vectorized assignment, so the result does not depend on
    the index labels being unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'Phrase' column. It is modified in place.

    Returns
    -------
    None
    """
    # regex=True is passed explicitly: the default of Series.str.replace
    # differs between pandas versions.
    df['Phrase'] = df['Phrase'].str.replace(r"\s+'", "'", regex=True)

In [None]:
removeSpaces(df)

In [None]:
df[0:10]

In [None]:
import contractions
contractions.add('n\'t', 'not')
def expandContractions(df):
    """Expand contractions in the 'Phrase' column, in place.

    Each phrase is split on whitespace, every token is passed through
    ``contractions.fix`` and the results are re-joined with single spaces
    (so runs of whitespace collapse to one space). The column is replaced in
    one assignment instead of row-by-row label writes, so the result does not
    depend on the index labels being unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'Phrase' column. It is modified in place.

    Returns
    -------
    None
    """
    def _expand(phrase):
        # Token-by-token expansion, matching how the phrases were tokenized.
        return ' '.join(contractions.fix(token) for token in phrase.split())

    df['Phrase'] = df['Phrase'].map(_expand)

In [None]:
expandContractions(df)

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [None]:
def removeStopWords(df):
    """Drop English stop words from every phrase in the 'Phrase' column, in place.

    Each phrase is tokenized with ``word_tokenize``; tokens whose lower-cased
    form is in NLTK's English stop-word list are removed; the remaining
    tokens are joined back into a string with ``TreebankWordDetokenizer``.

    The stop-word comparison is case-insensitive, so the function gives the
    same answer whether or not the phrases were lower-cased beforehand.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'Phrase' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    stop_words = set(stopwords.words('english'))
    # One detokenizer is enough; it is reused for every phrase.
    detokenizer = TreebankWordDetokenizer()

    def _strip(phrase):
        kept = [w for w in word_tokenize(phrase) if w.lower() not in stop_words]
        return detokenizer.detokenize(kept)

    # A single column assignment keeps rows and results aligned by position.
    df['Phrase'] = df['Phrase'].map(_strip)
    return df

In [None]:
df = removeStopWords(df)
df

In [None]:
# TO DO:
# * preprocessing
#     - replace all grammar with spaces
#     - lemmatize words
# * reduce to multi-dimensional vector
#     - We have options including: Bag of words
#     - BERT
#     - TF-IDF
# * classification
#     - try a bunch of classifiers
# * graphing
#     - graph the outputs of our classifiers
# * presentation
#     - create a presentation

In [None]:
df.dtypes

In [None]:
class Selene:
    """Per-word sentiment statistics over a frame of labelled phrases.

    The frame must provide a 'Phrase' column (text) and a 'Sentiment' column
    (integer label 0..4). Only purely alphabetic tokens are counted.

    Attributes
    ----------
    df : the frame passed to the constructor.
    tokenizer : callable turning one phrase into a list of tokens.
    wordsDict : dict mapping word -> total number of occurrences.
    wordSentOcc : dict mapping word -> list of six counters laid out as
        [total, sentiment 0, sentiment 1, sentiment 2, sentiment 3, sentiment 4],
        i.e. the counter for sentiment ``s`` lives at index ``s + 1``.
    wordSentVal : dict mapping word -> dict with the average sentiment and
        the probability of each sentiment.
    unique, apperances : unused lists, kept so existing attribute access
        keeps working.
    """

    def __init__(self, dataFrame, tokenizer=None):
        """Store the frame and create the empty statistics containers.

        Parameters
        ----------
        dataFrame : pandas.DataFrame
            Frame with 'Phrase' and 'Sentiment' columns.
        tokenizer : callable, optional
            Function mapping a phrase to a list of tokens. Defaults to the
            ``word_tokenize`` imported by the notebook.
        """
        self.df = dataFrame
        # word_tokenize is only looked up when no tokenizer is supplied.
        self.tokenizer = word_tokenize if tokenizer is None else tokenizer
        self.unique = []
        self.apperances = []
        self.wordsDict = {}
        self.wordSentOcc = {}
        self.wordSentVal = {}

    def _alphaTokens(self, phrase):
        """Return the purely alphabetic tokens of one phrase."""
        return [w for w in self.tokenizer(phrase) if w.isalpha()]

    # Finds unique words in the list of phrases and registers them
    def setUniqueWords(self):
        """Register every alphabetic token in ``wordsDict`` with a count of 0."""
        for phrase in self.df['Phrase']:
            for word in self._alphaTokens(phrase):
                self.wordsDict[word] = 0

    # Finds frequency of a word in the bag of words
    def setApperances(self):
        """Add the number of occurrences of every alphabetic token to ``wordsDict``.

        Counts are added to whatever is already stored, so call
        ``setUniqueWords`` first (or use ``extractUniqueWords``) to start
        from zero.
        """
        # Iterate the instance's own frame and its tokens -- not a global
        # frame, and not the individual characters of the phrase.
        for phrase in self.df['Phrase']:
            for word in self._alphaTokens(phrase):
                self.wordsDict[word] = self.wordsDict.get(word, 0) + 1

    # Runs things
    def extractUniqueWords(self):
        """Rebuild ``wordsDict``: reset every word to 0, then count occurrences."""
        self.setUniqueWords()
        self.setApperances()

    # Finds frequencies of the sentiment per word
    def findWordSentOccurances(self):
        """Count, for every word, how often it occurs under each sentiment.

        Fills ``wordSentOcc`` from scratch (so re-running does not double the
        counts) and prints the result.

        Raises
        ------
        ValueError
            If a 'Sentiment' value is outside 0..4.
        """
        self.wordSentOcc.clear()
        for phrase, label in zip(self.df['Phrase'], self.df['Sentiment']):
            sentiment = int(label)
            if not 0 <= sentiment <= 4:
                # A negative label would otherwise silently hit the wrong slot.
                raise ValueError("Sentiment must be between 0 and 4, got {}".format(sentiment))
            for word in self._alphaTokens(phrase):
                counts = self.wordSentOcc.setdefault(word, [0, 0, 0, 0, 0, 0])
                counts[0] += 1              # total occurrences
                counts[sentiment + 1] += 1  # occurrences under this sentiment

        print("All the times each word has appeared with a certain sentiment has been set."
              " The results are the following:")
        for key, value in self.wordSentOcc.items():
            print(key, ": ", value)

    # Finds probabilities for a word's sentiment
    def findProbabilities(self, key):
        """Return the share of ``key``'s occurrences under each sentiment.

        Parameters
        ----------
        key : str
            A word present in both ``wordsDict`` and ``wordSentOcc``.

        Returns
        -------
        tuple of five floats
            (prob0, prob1, prob2, prob3, prob4): the per-sentiment count
            divided by the word's total count in ``wordsDict``.
        """
        total = self.wordsDict[key]
        counts = self.wordSentOcc[key]
        return tuple(counts[sentiment + 1] / total for sentiment in range(5))

    # Finds the average sentiment of a word
    def findWordSentVal(self):
        """Compute average sentiment and sentiment probabilities for every word.

        For each word in ``wordSentOcc`` the average is the count-weighted
        mean of the sentiment labels, divided by the word's total count in
        ``wordsDict``. Results are stored in ``wordSentVal`` and printed.
        """
        for key, value in self.wordSentOcc.items():
            weighted = sum(sentiment * value[sentiment + 1] for sentiment in range(5))
            sentVal = weighted / self.wordsDict[key]
            prob0, prob1, prob2, prob3, prob4 = self.findProbabilities(key)
            # All five probability keys share the same spelling so they can
            # be looked up uniformly.
            self.wordSentVal[key] = {'Avg Value': sentVal, 'Probability of 0': prob0,
                                     'Probability of 1': prob1, 'Probability of 2': prob2,
                                     'Probability of 3': prob3, 'Probability of 4': prob4}

        print("Here is the average value of the sentiment and probabilities of each word in the training data:")
        for key, value in self.wordSentVal.items():
            print(key, ": ", value)

    def getSentiment(self, testDataFrame, weightPercent = .1, minimumOccurances = 3):
        """Placeholder: not implemented yet; does nothing and returns None."""
        pass
                
                
        

In [None]:
test = Selene(df)
test.extractUniqueWords()
test.findWordSentOccurances()
test.findWordSentVal()