In [4]:
import numpy as np
import pandas as pd
import torch
import sys
import string
from collections import Counter

In [5]:
# Module-level vocabulary store; the functions below fill and read it in place
# (token -> index under 't_2_i', index -> token under 'i_2_t').
vocab = {}

In [6]:
def initializeVocabulary():
    """Reset the global ``vocab`` so it contains only the unknown-word token.

    Both lookup tables are replaced with empty dicts, ``'<UNK>'`` is registered
    through ``addToken`` (so it receives index 0), and the bookkeeping entries
    ``addUnk``, ``unkToken`` and ``unkTokenIdx`` are written.
    """
    unknown = '<UNK>'
    # Fresh tables: token -> index, then index -> token.
    vocab['t_2_i'], vocab['i_2_t'] = {}, {}
    unknownIdx = addToken(unknown)
    vocab.update(addUnk=True, unkToken=unknown, unkTokenIdx=unknownIdx)

In [8]:
def addToken(token):
    """Return the index of ``token`` in the global ``vocab``, adding it if new.

    A token that is not yet present is given the next free index (the current
    size of the token -> index table) and recorded in both lookup tables.
    """
    tokenToIdx = vocab['t_2_i']
    if token not in tokenToIdx:
        newIdx = len(tokenToIdx)
        tokenToIdx[token] = newIdx
        vocab['i_2_t'][newIdx] = token
    return tokenToIdx[token]

In [9]:
def addManyTokens(tokens):
    """Pass every item of ``tokens`` to ``addToken``; return the indices, in order, as a list."""
    return list(map(addToken, tokens))

In [11]:
def lookUpToken(token):
    """Return the vocabulary index of ``token``.

    When the stored unknown-token index is non-negative, a token missing from
    the vocabulary maps to that index. Otherwise a missing token raises
    ``KeyError``.
    """
    unkIdx = vocab['unkTokenIdx']
    tokenToIdx = vocab['t_2_i']
    if unkIdx >= 0:
        # Unknown-token fallback is enabled.
        return tokenToIdx.get(token, unkIdx)
    return tokenToIdx[token]

In [12]:
def lookUpIndex(idx):
    """Return the token stored under ``idx``; raise ``KeyError`` when the index is unknown."""
    idxToToken = vocab['i_2_t']
    if idx in idxToToken:
        return idxToToken[idx]
    raise KeyError("the index (%d) is not there" % idx)

In [14]:
def vocabularyFromDataFrame(df, cutoff = 25):
    """Rebuild the global vocabulary from the ``review`` column of ``df``.

    The vocabulary is first reset with ``initializeVocabulary``. Each review is
    split on single spaces; tokens for which ``token in string.punctuation``
    holds are skipped (a substring test, so the empty string is skipped too)
    and the rest are counted. Every token seen strictly more than ``cutoff``
    times is then added, in order of first appearance.
    """
    initializeVocabulary()
    frequencies = Counter(
        token
        for review in df.review
        for token in review.split(" ")
        if token not in string.punctuation
    )
    for token, seen in frequencies.items():
        if seen > cutoff:
            addToken(token)

In [15]:
# Load the reviews dataset; later cells read its `review` and `rating` columns.
# NOTE(review): absolute, machine-specific path — this only resolves on a machine
# that has this exact directory; consider a configurable/relative data path.
df = pd.read_csv(r'/Users/ceylinekinci/NN-Basics/data/reviews.csv')

In [16]:
# Build the global vocabulary from every review, keeping tokens counted more than 25 times.
vocabularyFromDataFrame(df, cutoff = 25)

In [17]:
# Index assigned to 'this'; a token missing from the vocabulary would map to the <UNK> index.
lookUpToken('this')

128

In [19]:
# Reverse lookup: the token stored at index 128.
lookUpIndex(128)

'this'

In [21]:
# Vocabulary size, counting the <UNK> entry.
len(vocab['t_2_i'])

8945

In [25]:
#vectorizer:
def vectorize(review):
    """One-hot encode a review against the global ``vocab``.

    The review is split on single spaces and tokens for which
    ``token in string.punctuation`` holds are dropped. Each remaining token is
    mapped to a row index with ``lookUpToken``.

    Parameters
    ----------
    review : str
        Space-separated review text.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(vocab['t_2_i']), T)``, where ``T`` is the
        number of kept tokens. Column ``t`` holds a single 1 in the row of the
        t-th kept token. A review with no kept tokens yields a ``(V, 0)`` array
        instead of failing on an unassigned result.
    """
    # Explicit integer dtype so an empty token list is still a valid index array.
    tokenIdxs = np.asarray(
        [
            lookUpToken(token)
            for token in review.split(" ")
            if token not in string.punctuation
        ],
        dtype=np.intp,
    )
    # Allocate the whole matrix once instead of growing it with hstack per token.
    xF = np.zeros((len(vocab['t_2_i']), tokenIdxs.size))
    xF[tokenIdxs, np.arange(tokenIdxs.size)] = 1
    return xF

In [26]:
# One-hot encode the review stored under index label 1.
xF = vectorize(df['review'][1])

In [27]:
# (vocabulary size, number of kept tokens)
xF.shape

(8945, 17)

In [28]:
# Display the loaded DataFrame (columns: rating, review).
df

Unnamed: 0,rating,review
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...
...,...,...
55995,positive,"great food . wonderful , friendly service . i ..."
55996,positive,charlotte should be the new standard for moder...
55997,positive,get the encore sandwich ! ! make sure to get i...
55998,positive,i m a pretty big ice cream gelato fan . pretty...


In [29]:
# Tiny balanced sample: the first five positive reviews followed by the first
# five negative ones, keeping their original row labels.
smallDF_pos = df.loc[df['rating'].eq('positive')].head(5)
smallDF_neg = df.loc[df['rating'].eq('negative')].head(5)
df_small = pd.concat([smallDF_pos, smallDF_neg], axis = 0)

In [30]:
# Show the ten-row sample (positives first, then negatives).
df_small

Unnamed: 0,rating,review
28000,positive,my experience was by far the most pleasant i h...
28001,positive,i have been to this place a couple of times on...
28002,positive,very popular sushi bar in the heart of old tow...
28003,positive,the staff is nice . it s pretty clean . they u...
28004,positive,my co worker picked up lunch for us from this ...
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...


In [32]:
# Rebuild the global vocabulary from the small sample only; with cutoff = 0 every
# counted token is kept. This resets and replaces the vocabulary built from the
# full dataset, so indices obtained earlier no longer apply.
vocabularyFromDataFrame(df_small, cutoff = 0)

In [33]:
# Size of the small-sample vocabulary, counting the <UNK> entry.
len(vocab['t_2_i'])

491

In [35]:
# Dimensions and parameters of the RNN classifier.
# A fixed seed makes the initial weights (and therefore the training run)
# reproducible after Restart & Run All.
SEED = 0
rng = np.random.default_rng(SEED)

numFeatures = len(vocab['t_2_i'])  # one input feature per vocabulary entry
hiddenUnits = 10
# Initial hidden state: a (hiddenUnits, 1) column of float64 zeros, not trained.
h0 = torch.tensor(np.zeros((hiddenUnits,1)))
# Trainable float64 weights drawn uniformly from [0, 1).
Wx = torch.tensor(rng.uniform(0,1,(hiddenUnits, numFeatures)),requires_grad =True)  # input -> hidden
Wh = torch.tensor(rng.uniform(0,1,(hiddenUnits, hiddenUnits)),requires_grad =True)  # hidden -> hidden
Wy = torch.tensor(rng.uniform(0,1,(1, hiddenUnits)),requires_grad =True)            # hidden -> output


In [66]:
def stepForward(xt, Wx, Wh, Wy, prevMemory):
    """Advance the RNN by one time step.

    Computes ``ht = tanh(Wx @ x + Wh @ prevMemory)`` and
    ``yt_hat = sigmoid(Wy @ ht)``, where ``x`` is ``xt`` with a trailing axis
    appended and wrapped as a torch tensor.

    Parameters
    ----------
    xt : numpy.ndarray
        Input for this step; a new trailing axis is added before the product.
    Wx, Wh, Wy : torch.Tensor
        Input-to-hidden, hidden-to-hidden and hidden-to-output weights.
    prevMemory : torch.Tensor
        Hidden state from the previous step.

    Returns
    -------
    tuple
        ``(ht, yt_hat)``: the new hidden state and this step's output.
    """
    inputColumn = torch.from_numpy(xt[:, None])
    preActivation = Wx @ inputColumn + Wh @ prevMemory
    ht = preActivation.tanh()
    yt_hat = (Wy @ ht).sigmoid()
    return ht, yt_hat

In [67]:
def fullForwardRNN(X, Wx, Wh, Wy, prevMemory):
    """Run the RNN over every column of ``X`` and return the last step's output.

    Columns are fed to ``stepForward`` from left to right; each step receives
    the hidden state produced by the previous one, starting from
    ``prevMemory``. Returns the ``yt_hat`` of the final step, or the integer
    ``0`` when ``X`` has no columns.
    """
    memory, prediction = prevMemory, 0
    for column in range(X.shape[1]):
        memory, prediction = stepForward(X[:, column], Wx, Wh, Wy, memory)
    return prediction

In [68]:
def computeLoss(y,y_hat):
    """Mean binary cross-entropy using base-2 logarithms.

    For each ``(yi, yi_hat)`` pair, a label equal to 1 contributes
    ``-log2(yi_hat)`` and any other label contributes ``-log2(1 - yi_hat)``.
    The contributions are summed and divided by ``len(y)``.
    """
    perExample = [
        -torch.log2(yi_hat) if yi == 1 else -torch.log2(1 - yi_hat)
        for yi, yi_hat in zip(y, y_hat)
    ]
    return sum(perExample) / len(y)

In [69]:
def updateParams(Wx,Wh,Wy,dWx, dWh,dWy, lr):
    """Take one gradient-descent step on the three weights.

    Each weight is decremented by ``lr`` times its gradient inside a
    ``torch.no_grad()`` block, so the update is not recorded by autograd.
    Returns the updated ``(Wx, Wh, Wy)`` as a tuple.
    """
    stepped = []
    with torch.no_grad():
        for weight, grad in zip((Wx, Wh, Wy), (dWx, dWh, dWy)):
            weight -= lr * grad
            stepped.append(weight)
    return tuple(stepped)

In [70]:
def trainRnn(train_df, Wx, Wh, Wy, prevMemory, lr, nepoch):
    """Train the RNN with full-batch gradient descent.

    Every epoch runs each review through ``vectorize`` and ``fullForwardRNN``,
    averages the per-review loss with ``computeLoss``, back-propagates once and
    applies a single ``updateParams`` step.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must provide a ``review`` column and a ``rating`` column. A rating of
        ``'positive'`` is encoded as label 1, anything else as 0.
    Wx, Wh, Wy : torch.Tensor
        Weights, updated in place. Their ``.grad`` is read after
        ``loss.backward()``, so they are expected to be leaf tensors created
        with ``requires_grad=True``.
    prevMemory : torch.Tensor
        Initial hidden state, reused at the start of every review.
    lr : float
        Learning rate passed to ``updateParams``.
    nepoch : int
        Number of passes over ``train_df``.

    Returns
    -------
    tuple
        ``(Wx, Wh, Wy, losses)`` where ``losses`` holds one loss tensor per
        epoch, detached from the autograd graph.
    """
    losses = []
    for epoch in range(nepoch):
        y, y_hat = [], []
        for rv, rt in zip(train_df['review'], train_df['rating']):
            y.append(1 if rt == 'positive' else 0)
            y_hat.append(fullForwardRNN(vectorize(rv), Wx, Wh, Wy, prevMemory))

        loss = computeLoss(y, y_hat)
        loss.backward()
        # Store only the value: keeping the attached tensor would hold a
        # reference to this epoch's autograd graph for as long as `losses` lives.
        losses.append(loss.detach())
        print("Loss after epoch = %d: %f" %(epoch,loss.item()))
        sys.stdout.flush()
        Wx, Wh, Wy = updateParams(Wx, Wh, Wy, Wx.grad, Wh.grad, Wy.grad, lr)
        # backward() accumulates into .grad, so clear it before the next epoch.
        for weight in (Wx, Wh, Wy):
            weight.grad.zero_()
    return Wx, Wh, Wy, losses

In [73]:
# Train on the ten-review sample: 50 epochs, learning rate 0.01, initial hidden state h0.
# NOTE: the weights are updated in place and rebound to the same names, so
# re-running this cell continues training from the current weights.
Wx, Wh, Wy, losses = trainRnn(df_small, Wx, Wh, Wy, h0, 0.01, 50)

Loss after epoch = 0: 2.699967
Loss after epoch = 1: 2.652953
Loss after epoch = 2: 2.606275
Loss after epoch = 3: 2.559954
Loss after epoch = 4: 2.514011
Loss after epoch = 5: 2.468466
Loss after epoch = 6: 2.423344
Loss after epoch = 7: 2.378665
Loss after epoch = 8: 2.334455
Loss after epoch = 9: 2.290737
Loss after epoch = 10: 2.247537
Loss after epoch = 11: 2.204879
Loss after epoch = 12: 2.162790
Loss after epoch = 13: 2.121297
Loss after epoch = 14: 2.080424
Loss after epoch = 15: 2.040198
Loss after epoch = 16: 2.000647
Loss after epoch = 17: 1.961795
Loss after epoch = 18: 1.923669
Loss after epoch = 19: 1.886293
Loss after epoch = 20: 1.849692
Loss after epoch = 21: 1.813889
Loss after epoch = 22: 1.778907
Loss after epoch = 23: 1.744767
Loss after epoch = 24: 1.711487
Loss after epoch = 25: 1.679087
Loss after epoch = 26: 1.647582
Loss after epoch = 27: 1.616986
Loss after epoch = 28: 1.587311
Loss after epoch = 29: 1.558567
Loss after epoch = 30: 1.530762
Loss after epoch =

In [93]:
# Take the row at position 6 of the sample (positions 5-9 hold the negative
# reviews, since df_small lists the five positives first) and its label.
r = df_small['review'].iloc[6]
y = df_small['rating'].iloc[6]

In [94]:
# One-hot encode the chosen review with the current global vocabulary.
X= vectorize(r)

In [95]:
# Forward pass with the trained weights; the result is the sigmoid output of the
# last time step (training encoded 'positive' as 1 and everything else as 0).
y_hat = fullForwardRNN(X,Wx,Wh,Wy,h0)

In [96]:
# Model output for the chosen review.
y_hat

tensor([[0.7245]], dtype=torch.float64, grad_fn=<SigmoidBackward0>)

In [97]:
# Recorded rating of the chosen review, for comparison with the model output.
y

'negative'