In [3]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd

In [4]:
df_size = 100000
ds = tfds.load('sentiment140', split='train', shuffle_files=True)
df = tfds.as_dataframe(ds.take(df_size))
df.sample(2)

Unnamed: 0,date,polarity,query,text,user
52332,b'Sat Jun 06 14:47:40 PDT 2009',4,b'NO_QUERY',b'@nextepisodenet KB: Now thanks to you I have...,b'marco1475'
52979,b'Thu Jun 18 12:44:23 PDT 2009',0,b'NO_QUERY',b'the merkat makes me all dizzy. ',b'gameOVERdose'


In [5]:
len(df)

100000

In [6]:
df['polarity'].value_counts()

4    50021
0    49979
Name: polarity, dtype: int64

---
##### Note: Binary classification can be used. 



In [7]:
df['bin_polarity'] = df['polarity'].apply(lambda x: 1 if x == 4 else 0)

The structure of the RNN is that we take just the output of the final word (unless we're doing a bidirectional format, or a concatenated format where we concatenate the output of all the words)

---

#### Creating X values

The text is in byte form, so we need to convert it to string form, and then use the split functionality to convert to a list. We ignore the first two characters which are a side effect of the conversion, and the last element in the string which is the same

In [8]:
print(f'columns names: {list(df.columns)}')

columns names: ['date', 'polarity', 'query', 'text', 'user', 'bin_polarity']


In [9]:
## separating the words into a list of words
df['split_words'] = df['text'].apply(lambda x: str(x)[2:].split()[:-1])

In [10]:
## Get the length of words
df['txt_length'] = df['split_words'].apply(lambda x: len(x))

In [11]:
df['txt_length'].value_counts(bins=10)

(7.8, 11.7]     21460
(3.9, 7.8]      20917
(11.7, 15.6]    17393
(15.6, 19.5]    13669
(19.5, 23.4]    12096
(23.4, 27.3]     7499
(-0.04, 3.9]     5696
(27.3, 31.2]     1233
(31.2, 35.1]       36
(35.1, 39.0]        1
Name: txt_length, dtype: int64

----
Very few tweets are over 30 words, so we will create an RNN based on 30 words

We now need to create a words dictionary

In [12]:
Tx = 30 #this is the value we use to specify how many words to consider in the tweet (first e.g. 30)

In [13]:
no_words = 2000
top_2k_words = list(df['split_words'].explode().value_counts()[:no_words].index)

word_dict_2k = {}
for idx, word in enumerate(top_2k_words):
    word_dict_2k[idx+1] = word

In [14]:
#create additional values for 'word is over' and 'unknown'
word_dict_2k[no_words+1] = 'word_over'
word_dict_2k[0] = 'UNKNOWN-WORD'

In [15]:
word_dict_reversed = {}
for key, value in word_dict_2k.items():
    word_dict_reversed[value] = key

---
##### Our X values need to be 2000 x 30 x m, one hot encoded

In [16]:
#working with a smaller set for now

reduced_df = df.iloc[:df_size]

In [17]:
#getting the words in numeric index format
x = reduced_df['split_words'].apply(lambda x: np.array([word_dict_reversed[word] if word in word_dict_reversed.keys() else 0 for word in x]))

In [18]:
'''we're going to make all the x values the same length'''
array_x = np.zeros((df_size,Tx)) #blank array to put them in
for idx,arr in enumerate(x):
    leng = len(arr) 
    #get length, and fork based on current size
    if leng > Tx:
        array_x[idx] = arr[:Tx] #just take first 30 values
    elif leng < Tx:
        array_x[idx] = np.append(arr,np.zeros(30-leng)+no_words+1) #append 30 minus the current length -1s
#put examples on the columns
array_x = array_x.T

In [19]:
'''Now one hot encode them'''
one_hot_x_40k = np.zeros((no_words+2,Tx,df_size))
for row_idx, row in enumerate(array_x):

    for exam_idx, word_val in enumerate(row):

        one_hot_x_40k[int(word_val),row_idx,exam_idx] = 1

In [20]:
array_x[12,32:50]

array([ 127., 2001.,    3., 1211.,    0., 2001., 1319.,    0., 2001.,
         71., 2001., 2001., 2001.,  109., 2001.,   16., 2001., 2001.])

---
We've used 2002 in order to incorporatate the 2001th word (end of word) and unknown index. We can need to create a mask which tells the machine to skip if the mask is positive

In [21]:
mask_x  = np.where(array_x==2001,1,0)

----
OK so we have our x inputs. Mask x, shaped 30 x 40000, and the actual onehotencoded x values, of shape 2001, 30, 40000

In the basic RNN, ignoring the last stage for a second, each cell has three weights matrixes.

### Forward Prop

WAa - the weights applied to the previous cells outputs

WAx - the weights applied to the X values

WaB - a bias term. 

We have to make a choice of how large each cell is, and then initialise the weights. We'll use a Xavier initialization for now

In [22]:
#try a cell size of 50
cell_size = 50
x_size = no_words+no_words #Xavier_initialization of weights.
WAa = np.random.uniform(-(6)/(np.sqrt(cell_size*2)),(6)/((cell_size*2)),[cell_size,cell_size]) 
WAb = np.random.uniform(-(6)/(np.sqrt(cell_size+1)),(6)/((cell_size+1)),[cell_size,1]) 
WAx = np.random.uniform(-(6)/(np.sqrt(cell_size+x_size)),(6)/((cell_size+x_size)),[cell_size,x_size]) 

#We need a weights matrix too for later
WYa = np.random.uniform(-(6)/((cell_size+1)),(6)/((cell_size+1)),[1,cell_size])
WYb = np.random.uniform(-(6)/((1+1)),(6)/((1+1)),[1,1])

weights_dict = {}
weights_dict['WAa'] = WAa
weights_dict['WAb'] = WAb
weights_dict['WAx'] = WAx
weights_dict['WYa'] = WYa
weights_dict['WYb'] = WYb


---
Now let's implement one run of forward prop in order to replicate it going forward

In [27]:
#these allow us to set some default values
a0 = np.zeros((cell_size,df_size))
z_dict, a_dict = {}, {}
a_dict[0] = a0

In [71]:
'''See notes from above - this is just implemented as a function'''
def forward_prop(a_prev,  WAa, WAx, WAb, x, mask,time_period=0):
    
    # #z1 - the pre-tanh values of a0, bias and x1 times weights
    z1 = WAa @ a_prev + WAx @ x[:,time_period,:] + WAb
    
    # #a1 either equals a1 if mask ==1 or equal tanh z1 if mask == - 
    a1 = mask[time_period].reshape(1,-1) * a_prev + (1-mask[time_period]) * np.tanh(z1)
    
    return a1, z1

---
The idea here is to loop over the time periods and run forward prop each time. We will need to re-use a1 and z1 when we come back and do the back prop, so we will store them in a dictionary

In [None]:
def cells_fw_prop(Tx = Tx, z_dict, a_dict, a0,WAa, WAx, WAb, x, mask):
    a_prev = a0 #to have an initial value to call the forward_prop function
    for i in range(Tx):
        a_, z_ = forward_prop(a_prev=a_prev, time_period = i,x=x,mask = mask,WAa=WAa, WAx=WAx, WAb=WAb)
        z_dict[i+1], a_dict[i+1] = z_, a_
        a_prev = a_
    #we end by returning the dictionary for z and a values throughout the time period
    return z_dict, a_dict 

---
##### Lets put it all together in a forward prop function

In [73]:
def sigmoid(Yz):
        return (1/(1+np.exp(-Yz)))
    
def full_fw_prop(weights_dict, z_dict, a_dict, x, mask, Tx = Tx):
    
    # initialize a0 and dict values
    a0 = np.zeros((cell_size, x.shape[2]))
    a_dict[0] = a0
    WAa, WAb, WAx, WYa, WYb = weights_dict.values()

    #do the cell forward prop    
    z_dict, a_dict = cells_fw_prop(Tx = Tx, z_dict=z_dict, a_dict=a_dict, a0=a0,WAa=WAa, WAx=WAx, WAb=WAb, x=x,mask=mask)

    #extract last value and prediction
    final_a = a_dict[Tx]
    Yz = WYa @ final_a + WYb
   
    Ÿ = sigmoid(Yz)

    return z_dict, a_dict, Yz, Ÿ

----
### Back Propogation

We can conveniently use the differentiated sigmoid function, which gives

DL/DZ = Ÿ - Y

From there we can initially back calculate the values of the final portion, giving values for DL/DA, DL/DWYa, DL/DWYb

In [36]:
#Turn Y into an array
Y = np.array(reduced_df['bin_polarity']).reshape(1,-1)

In [31]:
#let's start by initializing the backprop dicts
dA_dict, dZ_dict = {}, {}

---
OK let's create the relevant functions

In [None]:
def back_prop_one_layer(prev_Loss, t, weights_dict,  a_dict,z_dict,  X, mask_x,learning_rate = 0.01,batch_size=400):
    
    WAa, WAb, WAx, WYa, WYb = weights_dict.values()
    a_ = a_dict[t]
    z_ = z_dict[t]
    a__ = a_dict[t-1]
    x_ = X[:,t-1,:]

    #get dA and dZ as a base for the layer
    dL_dA = prev_Loss #40000,50
    dA_dZ = ((1-mask_x[t-1]) * (1-(np.tanh(z_)**2))).T #40000,50
    
    #get a previous
    dZ_dAp = WAa #50x50
    dA_dAp = mask_x[t-1].reshape(-1,1) + np.zeros_like((dL_dA)) + (dA_dZ @ dZ_dAp) #we create a mask of 1s everywhere here.
    dL_dAp = dL_dA * dA_dAp

    #differentiate with respect to weights
    dZ_dWAa = a__ #50 x m
    dL_dWAa = ((dL_dA * dA_dZ).T @ dZ_dWAa.T)/batch_size #weights dont matter if we are having a 0 on the mask

    dZ_dWAb = np.zeros((batch_size,1))+1
    dL_dWAb = ((dL_dA * dA_dZ).T @ dZ_dWAb)/batch_size

    dZ_dWAx = x_
    dL_dWAx = ((dL_dA * dA_dZ).T @ dZ_dWAx.T)/batch_size #weights dont matter if we are having a 0 on the mask
    
    #now update the weights based on the findings here.
    return dL_dWAa, dL_dWAx, dL_dWAb, dL_dAp

In [None]:
def back_prop_full_swing(Ÿ, Y, Tx=Tx, weights_dict,  a_dict,z_dict,  X, mask_x,learning_rate = 0.01,batch_size = 100):
    
    '''initial loss function w.r.t Z'''
    WAa, WAb, WAx, WYa, WYb = weights_dict.values()
    dL_dZ = (Ÿ - Y).T #40000 x 1


    # first order diffs
    dZ_dA = WYa # 1 x 50
    dZ_dWYa = a_dict[Tx] # 50 x 40000
    dZ_dB = np.zeros_like(dL_dZ) + 1

    #chain ruled diffs
    dL_dA = dL_dZ @ dZ_dA
    dL_dWYa = (dZ_dWYa @ dL_dZ).T / batch_size
    dL_dWYb = (dZ_dB.T @ dL_dZ)/ batch_size
    
    weights_dict['WYa'] -= learning_rate * dL_dWYa
    weights_dict['WYb'] -= learning_rate * dL_dWYb
    
    prev_loss = dL_dA
    '''now go through the other functions'''
    for t in reversed(range(1,Tx)):
        #extract relevant differentials for updating backprop and also carrying on the backprop through the layers
        dL_dWAa, dL_dWAx, dL_dWAb, dL_dAp = back_prop_one_layer(prev_Loss=prev_loss, t=t, weights_dict = weights_dict,X=x,  a_dict=a_dict,z_dict=z_dict,mask_x=mask_x,batch_size=batch_size)
        weights_dict['WAa'] -= learning_rate*dL_dWAa
        weights_dict['WAx'] -= learning_rate*dL_dWAx
        weights_dict['WAb'] -= learning_rate*dL_dWAb
        prev_loss = dL_dAp
        return dL_dZ, dZ_dA, dZ_dB, dL_dA, dL_dWYa, dL_dWYb, dL_dWAa, dL_dWAx, dL_dWAb, dL_dAp

In [None]:
# back_prop_full_swing(Ÿ=Ÿ, Y=Y, Tx=Tx, weights_dict=weights_dict,  a_dict=a_dict,z_dict=z_dict,  X=one_hot_x_40k, mask_x=mask_x,learning_rate = 0.01)

---

#### Putting it all together

Let's now create a function which runs forward prop and backward prop in batches, and calculates the loss at each time

In [142]:
'''set some initial parameters'''
epochs = 1000
cell_size = 100
x_size = no_words+2

''' initialize weights'''

WAx = np.random.uniform(-1,1,[cell_size,x_size])
WAb = np.random.uniform(-1,1,[cell_size,1])
WAa = np.random.uniform(-0.1,0.1,[cell_size,cell_size])
WYa = np.random.uniform(-0.25,0.25,[1,cell_size])
WYb = np.random.uniform(-1,1,[1,1])

weights_dict = {}
weights_dict['WAa'] = WAa
weights_dict['WAb'] = WAb
weights_dict['WAx'] = WAx
weights_dict['WYa'] = WYa
weights_dict['WYb'] = WYb

'''define additional dictionaries and characteristics'''
z_dict, a_dict = {}, {}
losses = []
accs = []
batches=500
batch_size = int(df_size//batches) #so 400

for epoch in range(epochs):
    #run fw prop
    '''create batching'''
    

    ''' This sections deals with batching'''
    
    round_ = epoch % batches #deals with if it's the first of a new set of batches
    n, k = batch_size*round_, batch_size*(1 + round_)
    x = one_hot_x_40k[:,:,n:k]
    mask_x_ = mask_x[:,n:k]
    y = Y[0,n:k]
    
    '''Forward prop'''
    z_dict, a_dict, Yz, Ÿ = full_fw_prop(Tx = Tx, z_dict=z_dict, a_dict=a_dict, weights_dict = weights_dict, x=x,mask=mask_x_)
    '''create loss metrics for reporting'''
    loss = -np.sum(y*np.log(Ÿ) + (1-y)*np.log(1-Ÿ))/batch_size
    accuracy = np.sum(y==np.where(Ÿ>0.5,1,0))/batch_size
    losses.append(loss)
    accs.append(accuracy)
    if epoch % 200 == 0:
        print(f'Accuracy score: {np.mean(accs[(epoch-batch_size):])}')    
        print(f'Epoch: {epoch}, loss: {loss}')
    
    #run backprop
    dL_dZ, dZ_dA, dZ_dB, dL_dA, dL_dWYa, dL_dWYb, dL_dWAa, dL_dWAx, dL_dWAb, dL_dAp = back_prop_full_swing(Ÿ=Ÿ, Y=y, Tx=Tx, weights_dict=weights_dict,  a_dict=a_dict,z_dict=z_dict,  X=x, mask_x=mask_x_,learning_rate = 0.1,batch_size = batch_size)

Accuracy score: 0.49
Epoch: 0, loss: 0.7850219847952613
Accuracy score: 0.5309452736318407
Epoch: 200, loss: 0.689136217178594
Accuracy score: 0.5364676616915423
Epoch: 400, loss: 0.6982539029965139
Accuracy score: 0.542960199004975
Epoch: 600, loss: 0.6808523880735468
Accuracy score: 0.544228855721393
Epoch: 800, loss: 0.6758142577257107


In [68]:
Ÿ

array([[0.75961822, 0.66332816, 0.42293606, 0.74628054, 0.66337667,
        0.51844784, 0.39549641, 0.37381742, 0.48381265, 0.41081774,
        0.48419246, 0.47888069, 0.76984334, 0.70411132, 0.63604063,
        0.49869362, 0.53156112, 0.54493951, 0.50739358, 0.53566634]])

---
We see a small amount of ability from the model to predict sentiment.

#### GRU

We'll use the same format, albeit with different functions to build a GRU

In [111]:
'''Weights initialisation'''
def weights_initialisation(batch_size,cell_size=50,x_size=2002):
    #initialise the weights for the main 'estimator' node
    weights={}
    weights['gWCx'] = np.random.uniform(-1,1,[cell_size, x_size])
    weights['gWCc'] = np.random.uniform(-1,1,[cell_size, cell_size])
    weights['gWCb'] = np.random.uniform(-1,1,[cell_size,1])

    #initialize weights for the 'update' node
    weights['gWUx'] = np.random.uniform(-1,1,[cell_size, x_size])
    weights['gWUc'] = np.random.uniform(-1,1,[cell_size, cell_size])
    weights['gWUb'] = np.random.uniform(-1,1,[cell_size,1])

    #initalise weights for output layer
    weights['gWYa'] = np.random.uniform(-0.25,0.25,[1,cell_size])
    weights['gWYb'] = np.random.uniform(-1,1,[1,1])
    
    a0 = np.zeros((cell_size, batch_size)) #create blank a0
    return weights, a0

def dict_initialisation_gru():
    zc_dict, zu_dict, č_dict, cupd_dict, c_dict = {}, {}, {}, {}, {}
    return zc_dict, zu_dict, č_dict, cupd_dict, c_dict

In [123]:
def gforward_prop_1(c_prev, weights, x, mask, t=0):
    gWCx, gWCc, gWCb, gWUx, gWUc, gWUb, _, _ = weights.values()
    
    #create value for č by creating a z value to be tanhd + b
    zc = gWCx @ x[:,t,:] + gWCc @ c_prev + gWCb
    č = np.tanh(zc)
    
    #now create a value for c_update, and use mask to make this 0 if the mask is on.
    zu = gWUx @ x[:,t,:] + gWUc @ c_prev + gWUb 
    Cupd = (1-mask[t]) * sigmoid(zu) #this will be zero if mask is 1
    
    #now update c
    c = Cupd * č + (1-Cupd) * c_prev
    
    return zc, č, zu, Cupd, c

In [125]:
def gcells_fw_prop(zc_dict, zu_dict, č_dict, cupd_dict, c_dict, a0, weights, x, mask,Tx):
    
    c_prev = a0 #to have an initial value to call the forward_prop function
    for t in range(Tx):
        zc, č, zu, Cupd, c = gforward_prop_1(c_prev=c_prev, t = t,x=x,mask = mask, weights=weights)
        zc_dict[t+1], č_dict[t+1], zu_dict[t+1], cupd_dict[t+1], c_dict[t+1] = zc, č, zu, Cupd, c
        c_prev = c
    #we end by returning the dictionary for z and a values throughout the time period
    return zc_dict, zu_dict, č_dict, cupd_dict, c_dict

In [139]:
def gfull_fw_prop(a0, weights, zc_dict, zu_dict, č_dict, cupd_dict, c_dict, x, mask, Tx = Tx):
    
    #do the cell forward prop    
    zc_dict, zu_dict, č_dict, cupd_dict, c_dict = gcells_fw_prop(zc_dict, zu_dict, č_dict, cupd_dict, c_dict, a0, weights, x, mask,Tx)

    #extract last value and prediction
    final_c = c_dict[Tx]
    Yz = weights['gWYa'] @ final_c + weights['gWYb']
    Ÿ = sigmoid(Yz)

    return zc_dict, zu_dict, č_dict, cupd_dict, c_dict, Yz, Ÿ

---

#### unit tests

In [141]:
#when i initialise the weights, with a different x size, gWCx gives that value. same with cell size
x_size_ut = 2002
c_size_ut = 50
batch_size= 1000
x_ut, mask_x_ut = one_hot_x_40k[:,:,:batch_size], mask_x[:,:batch_size]

weights_ut,a0= weights_initialisation(cell_size=c_size_ut,x_size=x_size_ut,batch_size=batch_size)
assert weights_ut['gWUx'].shape == (c_size_ut, x_size_ut)

#when i call the main function, c_dict[30] should be numbers with mean ~ 0 and var [0.25,1.25]
zc_dict, zu_dict, č_dict, cupd_dict, c_dict = dict_initialisation_gru()
zc_dict, zu_dict, č_dict, cupd_dict, c_dict, Yz, Ÿ = gfull_fw_prop(a0, weights_ut, zc_dict, zu_dict, č_dict, cupd_dict, c_dict, x_ut, mask_x_ut, Tx)

assert abs(np.mean(c_dict[30])) < 0.1
assert 0.25 < np.var(c_dict[30]) < 1.25

print(f'Mean: {np.mean(c_dict[30])}\nVariance: {np.var(c_dict[30])}')

Mean: -0.02841736242788992
Variance: 0.619418359769762


---
Alright now we're burning diesel.

Time for back prop...

In [144]:
weights_ut.keys()

dict_keys(['gWCx', 'gWCc', 'gWCb', 'gWUx', 'gWUc', 'gWUb', 'gWYa', 'gWYb'])

In [None]:
def gback_prop_one_layer(prev_loss, t, weights, zc_dict, zu_dict, č_dict, cupd_dict, c_dict,  x, mask_x,learning_rate = 0.01,batch_size=400):
    
    gWCx, gWCc, gWCb, gWUx, gWUc, gWUb, gWYa, gWYb = weights.values()
    zc, zu, č, cupd, c = zc_dict[t], zu_dict[t], č_dict[t], cupd_dict[t], c_dict[t]
    c_ = c_dict[t-1] #also take the last c value frm previous round for differentials
    x_ = x[:,t-1,:] #relevant x values

    dl_dc = prev_loss
    '''first backpropogate through to get c previous'''
    dc_dč = cupd
    dc_dcupd = č - c_
    dč_dzc = 1 - np.tanh(zc)**2
    dcupd_dzu = sigmoid(zu) * (1-sigmoid(zu))
    dzc_dc_1 = gWCc
    dzu_dc_1 = gWCu
    dc_dc_1 = (1-cupd) + (dc_dč * dč_dzc @ gWCc) + dc_dcupd * dcupd_dzu @ gWCu
    
#     #get dA and dZ as a base for the layer
#     dL_dC = prev_loss #40000,50
#     dA_dZ = ((1-mask_x[t-1]) * (1-(np.tanh(z_)**2))).T #40000,50
    
#     #get a previous
#     dZ_dAp = WAa #50x50
#     dA_dAp = mask_x[t-1].reshape(-1,1) + np.zeros_like((dL_dA)) + (dA_dZ @ dZ_dAp) #we create a mask of 1s everywhere here.
#     dL_dAp = dL_dA * dA_dAp

#     #differentiate with respect to weights
#     dZ_dWAa = a__ #50 x m
#     dL_dWAa = ((dL_dA * dA_dZ).T @ dZ_dWAa.T)/batch_size #weights dont matter if we are having a 0 on the mask

#     dZ_dWAb = np.zeros((batch_size,1))+1
#     dL_dWAb = ((dL_dA * dA_dZ).T @ dZ_dWAb)/batch_size

#     dZ_dWAx = x_
#     dL_dWAx = ((dL_dA * dA_dZ).T @ dZ_dWAx.T)/batch_size #weights dont matter if we are having a 0 on the mask
    
#     #now update the weights based on the findings here.
#     return dL_dWAa, dL_dWAx, dL_dWAb, dL_dAp