In [77]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [78]:
len(df)

1600000

In [70]:
ds = tfds.load('sentiment140', split='train', shuffle_files=True)
df = tfds.as_dataframe(ds.take(-1))
df.sample(2)

Unnamed: 0,date,polarity,query,text,user
262996,b'Sun May 03 20:45:40 PDT 2009',0,b'NO_QUERY',"b""Man, i'm tired of waiting for the cupcake up...",b'millenniumze'
1102121,b'Fri May 29 01:23:43 PDT 2009',0,b'NO_QUERY',"b""Shit... It's 2am and I'm wide awake """,b'anotherJohn'


In [71]:
df['polarity'].value_counts()

4    800000
0    800000
Name: polarity, dtype: int64

---
##### Note: Binary classification can be used. 

Lets review how long the words are

In [75]:
df['bin_polarity'] = df['polarity'].apply(lambda x: 1 if x == 4 else 0)

In [73]:
display(pd.DataFrame(df['length_of_text'].value_counts(bins=10
                                        ).sort_index()).reset_index().rename(
                                        columns={'index':'word_count','length_of_text':'freq'}).T)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
word_count,"(5.287999999999999, 77.1]","(77.1, 148.2]","(148.2, 219.3]","(219.3, 290.4]","(290.4, 361.5]","(361.5, 432.6]","(432.6, 503.7]","(503.7, 574.8]","(574.8, 645.9]","(645.9, 717.0]"
freq,911101,686081,2448,154,111,66,21,9,6,3


The structure of the RNN is that we take just the output of the final word (unless we're doing a bidirectional format, or a concatenated format where we concatenate the output of all the words)

---

#### Creating X values

The text is in byte form, so we need to convert it to string form, and then use the split functionality to convert to a list. We ignore the first two characters which are a side effect of the conversion, and the last element in the string which is the same

In [176]:
print(f'columns names: {list(df.columns)}')

columns names: ['date', 'polarity', 'query', 'text', 'user', 'length_of_text', 'bin_polarity', 'split_words', 'txt_length']


In [100]:
## separating the words into a list of words
df['split_words'] = df['text'].apply(lambda x: str(x)[2:].split()[:-1])

In [103]:
## Get the length of words
df['txt_length'] = df['split_words'].apply(lambda x: len(x))

In [106]:
df['txt_length'].value_counts(bins=10)

(6.4, 12.8]      509407
(12.8, 19.2]     421913
(-0.065, 6.4]    335661
(19.2, 25.6]     267707
(25.6, 32.0]      65097
(32.0, 38.4]        206
(38.4, 44.8]          6
(57.6, 64.0]          2
(51.2, 57.6]          1
(44.8, 51.2]          0
Name: txt_length, dtype: int64

----
Very few tweets are over 30 words, so we will create an RNN based on 30 words

We now need to create a words dictionary

In [253]:
no_words = 2000
top_2k_words = list(df['split_words'].explode().value_counts()[:no_words].index)

word_dict_2k = {}
for idx, word in enumerate(top_2k_words):
    word_dict_2k[idx+1] = word

In [254]:
#create additional values for 'word is over' and 'unknown'
word_dict_2k[no_words+1] = 'word_over'
word_dict_2k[0] = 'UNKNOWN-WORD'

In [255]:
word_dict_reversed = {}
for key, value in word_dict_2k.items():
    word_dict_reversed[value] = key

---
##### Our X values need to be 2000 x 30 x m, one hot encoded

In [261]:
#working with a smaller set for now
df_size = 40000
reduced_df = df.iloc[:df_size]

In [257]:
#getting the words in numeric index format
x = reduced_df['split_words'].apply(lambda x: np.array([word_dict_reversed[word] if word in word_dict_reversed.keys() else 0 for word in x]))

In [281]:
'''we're going to make all the x values the same length'''
array_x = np.zeros((df_size,30)) #blank array to put them in
for idx,arr in enumerate(x):
    leng = len(arr) 
    #get length, and fork based on current size
    if leng > 30:
        array_x[idx] = arr[:30] #just take first 30 values
    elif leng < 30:
        array_x[idx] = np.append(arr,np.zeros(30-leng)+no_words+1) #append 30 minus the current length -1s
#put examples on the columns
array_x = array_x.T

In [282]:
'''Now one hot encode them'''
one_hot_x_40k = np.zeros((2002,30,df_size))
for row_idx, row in enumerate(array_x):

    for exam_idx, word_val in enumerate(row):

        one_hot_x_40k[int(word_val),row_idx,exam_idx] = 1

In [283]:
array_x[29,7]

2001.0

---
We've used 2002 in order to incorporatate the 2001th word (end of word) and unknown index. We can need to create a mask which tells the machine to skip if the mask is positive

In [285]:
mask_x = np.zeros((30,df_size))
for idx, time_p in enumerate(mask_x):
    for idxx, example in enumerate(time_p):
        if array_x[idx,idxx] == 2001:

            mask_x[idx,idxx] = 1

----
OK so we have our x inputs. Mask x, shaped 30 x 40000, and the actual onehotencoded x values, of shape 2001, 30, 40000

In the basic RNN, ignoring the last stage for a second, each cell has three weights matrixes.

WAa - the weights applied to the previous cells outputs

WAx - the weights applied to the X values

WaB - a bias term. 

We have to make a choice of how large each cell is, and then initialise the weights. We'll use a Xavier initialization for now

In [None]:
#try a cell size of 50
cell_size = 50
WAa = np.zeros(())