In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk # natural language tool kit
import contractions
import string
import re

In [2]:
initialFrame = pd.read_csv('train.tsv', delimiter = '\t')

In [3]:
# Cleans the initial frame
def cleanInitialFrame(df):
    cleanList = [] # list to grow
    currentSentence = 0 # tracks current sentence
    sentenceIDs = {0} 
    # Iterate row by row
    for index, row in df.iterrows():
        # If it's the first element, add to list
        if (row['SentenceId'] == currentSentence):
            continue
        else:
            cleanList.append([row['PhraseId'], row['SentenceId'], row['Phrase'], row['Sentiment']]);
            currentSentence = row['SentenceId']
    
    # Return a clean frame
    return pd.DataFrame(cleanList, columns = ['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'])

def lowerAllPhrases(df):
    phrases_list = list(df['Phrase'])

    for i in range(len(phrases_list)):
        phrases_list[i] = phrases_list[i].lower()
    count = 0;
    for index, row in df.iterrows():
        df.at[index,'Phrase'] = phrases_list[count]
        count += 1
    return df

# Remove non-ascii characters using str.replace()
def asciiClean(df):
    # iterate row by row
    for index, row in df.iterrows():
        old_str = row['Phrase']
        new_str = (old_str.encode('ascii','ignore')).decode()
        df.at[index, 'Phrase'] = new_str

def removeSpaces(df):
    for index, row in df.iterrows():
        df.at[index,'Phrase'] = re.sub(r'\s+\'', "'", row['Phrase'])
        
contractions.add('n\'t', 'not')
def expandContractions(df):
    for index, row in df.iterrows():
        phrase = []
        for i in row['Phrase'].split():
            phrase.append(contractions.fix(i))
        string_version = ' '.join(phrase)
        df.at[index, 'Phrase'] = string_version
        
def removePunctuation(df):
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    for index,row in df.iterrows():
        phrase = []
        for i in row['Phrase'].split():
            phrase.append(regex.sub('', i))
        string_version = ' '.join(phrase)
        df.at[index, 'Phrase'] = string_version

In [4]:
initialFrame = cleanInitialFrame(initialFrame)
initialFrame = lowerAllPhrases(initialFrame)
asciiClean(initialFrame)
removeSpaces(initialFrame)
expandContractions(initialFrame)
removePunctuation(initialFrame)

In [5]:
initialFrame

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,a series of escapades demonstrating the adage ...,1
1,64,2,this quiet introspective and entertaining ind...,4
2,82,3,even fans of ismail merchants work i suspect ...,1
3,117,4,a positively thrilling combination of ethnogra...,3
4,157,5,aggressive selfglorification and a manipulativ...,1
...,...,...,...,...
8524,155985,8540,either you are willing to go with this claust...,2
8525,155998,8541,despite these annoyances the capable clayburg...,2
8526,156022,8542,lrb tries rrb to parody a genre that is alread...,1
8527,156032,8543,the movies downfall is to substitute plot for ...,1


In [6]:
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Allen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Allen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
def removeStopWords(df):
    
    phrases_list = list(df['Phrase'])
    stop_words = set(stopwords.words('english'))

    for i in range(len(phrases_list)):
        word_tokens = word_tokenize(phrases_list[i])
        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
        filtered_sentence = []
        for w in word_tokens:
            if w not in stop_words:
                filtered_sentence.append(w)
        phrases_list[i] = filtered_sentence
    
    for i in range(len(phrases_list)):
        phrases_list[i] = TreebankWordDetokenizer().detokenize(phrases_list[i])
    
    count = 0
    for index, row in df.iterrows():
        # df['Phrase'] = df['Phrase'].replace([row['Phrase']], phrases_list[count])
        df.at[index, 'Phrase'] = phrases_list[count]
        count += 1
    return df      

In [8]:
initialFrame = removeStopWords(initialFrame)

In [9]:
def makeCorpus(df):
    corpus = []
    for index, row in df.iterrows():
        phrase = row.Phrase
        corpus.append(phrase)
    return corpus
        
            

In [10]:
corpus = makeCorpus(initialFrame)

In [11]:
corpus

['series escapades demonstrating adage good goose also good gander occasionally amuses none amounts much story',
 'quiet introspective entertaining independent worth seeking',
 'even fans ismail merchants work suspect would hard time sitting one',
 'positively thrilling combination ethnography intrigue betrayal deceit murder shakespearean tragedy juicy soap opera',
 'aggressive selfglorification manipulative whitewash',
 'comedydrama nearly epic proportions rooted sincere performance title character undergoing midlife crisis',
 'narratively trouble every day plodding mess',
 'importance earnest thick wit plays like reading bartletts familiar quotations',
 'leave much',
 'could hate reason',
 'little recommend snow dogs unless one considers cliched dialogue perverse escapism source high hilarity',
 'kung pow oedekerks realization childhood dream martialarts flick proves sometimes dreams youth remain',
 'performances absolute joy',
 'fresnadillo something serious say ways extravagant cha

In [12]:
def wordVector(df):
    wordVec = []
    for index, row in df.iterrows():
        for i in row['Phrase'].split():
            wordVec.append(i)
    wordVec = set(wordVec)
    return wordVec

In [13]:
wordVec = wordVector(initialFrame)
wordVec

{'basically',
 'decades',
 'two',
 'befuddling',
 'welcome',
 'love',
 'agenda',
 'knowledge',
 'soiree',
 'slivers',
 '1986',
 'corbett',
 'make',
 'scalds',
 'bros',
 'attuned',
 'zen',
 'creatively',
 'utter',
 'lad',
 'bloodletting',
 'cleanflicks',
 'lan',
 'largeframe',
 'career',
 'beresford',
 'girlfriends',
 'practices',
 'villeneuve',
 'dilemma',
 'melodramatic',
 'bartletts',
 'crippled',
 'composed',
 'rowlings',
 'brought',
 'villainous',
 'nm',
 'mcgrath',
 'daughters',
 'purposes',
 'fiennes',
 'anybody',
 'conquers',
 'overburdened',
 'misleading',
 'juliette',
 'provokes',
 'horses',
 'epps',
 'works',
 'fusion',
 'onedimensional',
 'solidity',
 'insinuation',
 'deadly',
 'recovered',
 'heaven',
 'poof',
 'stance',
 'romeo',
 'activism',
 'devotion',
 'vernes',
 'rip',
 'hopelessly',
 'gradually',
 'overcomingobstacles',
 'comic',
 'engaged',
 'implies',
 'straightshooting',
 'swept',
 'extremities',
 'nary',
 'streetwise',
 'early',
 'cattaneo',
 'salute',
 'trick',
 

In [14]:
intValue = {}

for i,word in enumerate(wordVec):
    intValue[word] = i

phrases = []
for phrase in corpus:
    phrases.append(phrase.split())
    
WINDOW_SIZE = 2

data = []
for sentence in phrases:
    for idx, word in enumerate(sentence):
        for neighbor in sentence[max(idx - WINDOW_SIZE, 0) : min(idx + WINDOW_SIZE, len(sentence)) + 1] : 
            if neighbor != word:
                data.append([word, neighbor])

In [15]:
intValue

{'basically': 0,
 'decades': 1,
 'two': 2,
 'befuddling': 3,
 'welcome': 4,
 'love': 5,
 'agenda': 6,
 'knowledge': 7,
 'soiree': 8,
 'slivers': 9,
 '1986': 10,
 'corbett': 11,
 'make': 12,
 'scalds': 13,
 'bros': 14,
 'attuned': 15,
 'zen': 16,
 'creatively': 17,
 'utter': 18,
 'lad': 19,
 'bloodletting': 20,
 'cleanflicks': 21,
 'lan': 22,
 'largeframe': 23,
 'career': 24,
 'beresford': 25,
 'girlfriends': 26,
 'practices': 27,
 'villeneuve': 28,
 'dilemma': 29,
 'melodramatic': 30,
 'bartletts': 31,
 'crippled': 32,
 'composed': 33,
 'rowlings': 34,
 'brought': 35,
 'villainous': 36,
 'nm': 37,
 'mcgrath': 38,
 'daughters': 39,
 'purposes': 40,
 'fiennes': 41,
 'anybody': 42,
 'conquers': 43,
 'overburdened': 44,
 'misleading': 45,
 'juliette': 46,
 'provokes': 47,
 'horses': 48,
 'epps': 49,
 'works': 50,
 'fusion': 51,
 'onedimensional': 52,
 'solidity': 53,
 'insinuation': 54,
 'deadly': 55,
 'recovered': 56,
 'heaven': 57,
 'poof': 58,
 'stance': 59,
 'romeo': 60,
 'activism': 6

In [16]:
df = pd.DataFrame(data, columns = ['input', 'label'])

In [17]:
df

Unnamed: 0,input,label
0,series,escapades
1,series,demonstrating
2,escapades,series
3,escapades,demonstrating
4,escapades,adage
...,...,...
267267,avuncular,hearsts
267268,avuncular,forced
267269,avuncular,chortles
267270,chortles,forced


In [18]:
from tqdm import tqdm

In [19]:
import tensorflow as tf
one_hot_encoding = len(wordVec)
def encoder(index):
    hot_encoding = np.zeros(one_hot_encoding)
    hot_encoding[index] = 1
    return hot_encoding
input_word = []
target_word = []

for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    input_word.append(encoder(intValue[row['input']]))
    


100%|██████████| 267272/267272 [00:46<00:00, 5714.79it/s]


In [20]:
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    target_word.append(encoder(intValue[row['label']]))
    


 31%|███       | 81892/267272 [00:24<00:55, 3332.73it/s]


MemoryError: Unable to allocate 130. KiB for an array with shape (16576,) and data type float64

In [None]:
input_word_train = np.asarray(input_word)
target_word_train = np.asarray(target_word)



In [None]:
inp_pl = tf.placeholder(tf.float32, shape=(None, one_hot_encoding))
y_label = tf.placeholder(tf.float32, shape=(None, one_hot_encoding))



In [None]:
weight_layer = tf.Variable(tf.raandom_normal([one_hot_encoding, 2]))
bias = tf.Variable(tf.random_normal[1])


In [None]:
pred = tf.nn.softmax(tf.add( tf.matmul(hidden_layer, W2), b2))

In [None]:
loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(pred), axis=[1]))
gradientDescent = tf.train.GradientDescentOptimizer(0.05).minimize(loss)