# Clark Whitehead
# Sentiment Analysis - LSTM
# 

In [135]:
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [2]:
# Requires extra packages.
!pip install tqdm boto3 requests regex sentencepiece sacremoses
!pip install tokenizers
!pip install huggingface-hub
import pickle
import cloudpickle as cp
import math
import sklearn.metrics # Area Under the ROC calculations.
import matplotlib.pylab as plt # Plotting
from urllib.request import urlopen
import torch
#Downloads a tokenizer that will automatically convert words to indices in a big dictionary.
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')



Using cache found in /home/clark/.cache/torch/hub/huggingface_pytorch-transformers_master


# Tokenize

In [133]:
tweet = "Hello there! How are you?"
indexed_tokens = tokenizer.encode(tweet, padding='max_length', add_special_tokens=True) 
# Create transition matrix as sparse matrix to save memory.
print(indexed_tokens[:50])
n = tokenizer.vocab_size
print(n)

[101, 7592, 2045, 999, 2129, 2024, 2017, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
30522


In [4]:
# print(tokenizer.decode(indexed_tokens, skip_special_tokens=True))

# Store all 35k file titles in a variable

In [5]:
from pathlib import Path
paths = [str(x) for x in Path('./filesplit/').glob('*.json')]
paths[:10]

['filesplit/reddit99917556.json',
 'filesplit/reddit998757.json',
 'filesplit/reddit997973.json',
 'filesplit/reddit99922195.json',
 'filesplit/reddit99909782.json',
 'filesplit/reddit99912979.json',
 'filesplit/reddit99905760.json',
 'filesplit/reddit99911182.json',
 'filesplit/reddit99900043.json',
 'filesplit/reddit99903226.json']

In [6]:
len(paths)

34119

In [15]:
# Python program to read
# json file
 
count = 0
list1 = []

matches = ["xbox", "Xbox", "XBOX"]
# Opening JSON file
for i in range(100):
    f = open(paths[i], 'r')
    data = [json.loads(line) for line in f]
    for item in data:
        if any(x in item["subreddit"] for x in matches):
            list1.append(item)


In [16]:
len(list1)

88

In [18]:
list1[2]["subreddit"]

'DayzXbox'

In [19]:
list2 = []
for item in list1:
    if item["selftext"] != "[removed]":
        list2.append(item)

In [20]:
len(list2)

77

In [21]:
list2[0]["selftext"]

'As the title says, I have not owned an Xbox since the Xbox360, and after seeing the exclusives coming to this console, I had to get my hands on one. Last week Walmart had a drop, and I scored a series X, this console is fantastic, the ultimate gamepass is a game changer. I own a ps5 as well, and I am truly excited for this generation of gaming.\n\nLooking forward to joining the Xbox community online, I don’t have many friends on there, hopefully change that soon. The Bethesda exclusives, Halo, Forza, and gamepass ultimate made me realize I was being one dimensional on my console preferences. \n\nI must say Gears 5 looks fantastic on my 65” OLED. \n\nTo everyone still trying for a console, be patient, you will get one I promise!'

In [22]:
list1[6]["selftext"]

''

In [23]:
len(list1)

88

In [24]:
# for item in data:
#     if "made" in item:
#         print("yes")

print(data[102]["selftext"])

[deleted]


In [47]:
if "hey" in data[102]["selftext"]:
    print("yes")

yes


# Load sentiment analysis training data

In [149]:
df = pd.read_csv("./data/reddit.csv")

In [150]:
X = df.clean_comment

In [151]:
Y = df.category

In [31]:
X.head(10)

0     family mormon have never tried explain them t...
1    buddhism has very much lot compatible with chr...
2    seriously don say thing first all they won get...
3    what you have learned yours and only yours wha...
4    for your own benefit you may want read living ...
5    you should all sit down together and watch the...
6     was teens when discovered zen meditation was ...
7                             jesus was zen meets jew 
8    there are two varieties christians dogmatic th...
9    dont worry about trying explain yourself just ...
Name: clean_comment, dtype: object

In [32]:
len(X)

37249

In [33]:
X.iloc[1]

'buddhism has very much lot compatible with christianity especially considering that sin and suffering are almost the same thing suffering caused wanting things shouldn want going about getting things the wrong way christian this would mean wanting things that don coincide with god will and wanting things that coincide but without the aid jesus buddhism could also seen proof god all mighty will and omnipotence certainly christians are lucky have one such christ there side but what about everyone else well many christians believe god grace salvation and buddhism god way showing grace upon others would also help study the things jesus said and see how buddha has made similar claims such rich man getting into heaven joke basically advocating that should rid ourselves material possessions fact distinctly remembered jesus making someone cry because that someone asked what achieve salvation and jesus replied with live like buddhist very very roughly translated also point out that buddha rare

# Remove all null from X and Y

In [152]:
P = X.isnull()

In [36]:
Z = X[P == False]

In [37]:
Z[:10]

0     family mormon have never tried explain them t...
1    buddhism has very much lot compatible with chr...
2    seriously don say thing first all they won get...
3    what you have learned yours and only yours wha...
4    for your own benefit you may want read living ...
5    you should all sit down together and watch the...
6     was teens when discovered zen meditation was ...
7                             jesus was zen meets jew 
8    there are two varieties christians dogmatic th...
9    dont worry about trying explain yourself just ...
Name: clean_comment, dtype: object

In [140]:
X = Z

In [153]:
Y = Y[P == False]

In [154]:
Y[:10]

0    1
1    1
2   -1
3    0
4    1
5   -1
6    1
7    0
8   -1
9    1
Name: category, dtype: int64

In [145]:
listRemove = []
count = 0
for item in X:
    if len(item) > 512:
        listRemove.append(count)
    count += 1

In [146]:
len(listRemove)

2613

In [147]:
output = X.drop(X.index[listRemove])

In [155]:
outputY = Y.drop(Y.index[listRemove])

In [156]:
len(output)

34536

In [157]:
len(outputY)

34536

In [187]:
output.to_csv('output.csv')

Exception ignored in: <function tqdm.__del__ at 0x7f3a0b69ab80>
Traceback (most recent call last):
  File "/home/clark/anaconda3/lib/python3.8/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/home/clark/anaconda3/lib/python3.8/site-packages/tqdm/notebook.py", line 278, in close
    self.disp(bar_style='success')
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


In [188]:
outputY.to_csv('y.csv')

# Remove all strings longer than 512 words and their matching Y

In [59]:
allSamples = []
with tqdm(total=len(X)) as pbar:
    for i in range(len(X)):
        indexed_tokens = tokenizer.encode(X.iloc[i], padding='max_length', add_special_tokens=True) 
        allSamples.append(np.array(indexed_tokens))
        pbar.update(1)
            
listRemove = []
count = 0
for item in allSamples:
    if len(item) > 512:
        listRemove.append(count)
    count += 1
    
allSamples = []
with tqdm(total=37016) as pbar:
    for i in range(len(X)):
        if i not in listRemove:
            indexed_tokens = tokenizer.encode(X.iloc[i], padding='max_length', add_special_tokens=True) 
            allSamples.append(np.array(indexed_tokens))
            pbar.update(1)

  0%|          | 0/37149 [00:00<?, ?it/s]

  0%|          | 0/37016 [00:00<?, ?it/s]

In [66]:
len(Y)

37149

In [67]:
Y = Y.drop(Y.index[listRemove])

In [68]:
len(Y)

37016

In [69]:
len(allSamples[914])

512

In [70]:
len(listRemove)

133

In [71]:
len(allSamples)

37016

In [72]:
type(Y)

pandas.core.series.Series

In [75]:
df = pd.DataFrame(allSamples)

In [76]:
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,101,2155,15111,2031,2196,2699,4863,2068,2027,2145,...,0,0,0,0,0,0,0,0,0,0
1,101,11388,2038,2200,2172,2843,11892,2007,7988,2926,...,0,0,0,0,0,0,0,0,0,0
2,101,5667,2123,2360,2518,2034,2035,2027,2180,2131,...,0,0,0,0,0,0,0,0,0,0
3,101,2054,2017,2031,4342,6737,1998,2069,6737,2054,...,0,0,0,0,0,0,0,0,0,0
4,101,2005,2115,2219,5770,2017,2089,2215,3191,2542,...,0,0,0,0,0,0,0,0,0,0
5,101,2017,2323,2035,4133,2091,2362,1998,3422,1996,...,0,0,0,0,0,0,0,0,0,0
6,101,2001,13496,2043,3603,16729,13804,2001,2059,6151,...,0,0,0,0,0,0,0,0,0,0
7,101,4441,2001,16729,6010,16522,102,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,101,2045,2024,2048,9903,8135,3899,12644,2008,23120,...,0,0,0,0,0,0,0,0,0,0
9,101,2123,2102,4737,2055,2667,4863,4426,2074,19960,...,0,0,0,0,0,0,0,0,0,0


# Normalize (scale) data for ML model

In [99]:
Xscaler = MinMaxScaler(feature_range=(0, 1)) # scale so that all the X data will range from 0 to 1
Xscaler.fit(df)
scaled_X = Xscaler.transform(df)

In [87]:
j = np.array(Y)

In [91]:
j = j.reshape(-1, 1)

In [92]:
j.shape

(37016, 1)

In [100]:
Yscaler = MinMaxScaler(feature_range=(0, 1)) # scale so that all the X data will range from 0 to 1
Yscaler.fit(j)
scaled_Y = Yscaler.transform(j)

In [101]:
scaled_Y_train[:10]

array([[1. ],
       [1. ],
       [0. ],
       [0.5],
       [1. ],
       [0. ],
       [1. ],
       [0.5],
       [0. ],
       [1. ]])

In [102]:
scaled_Y_train.shape

(37016, 1)

In [103]:
scaled_X_train.shape

(37016, 512)

In [104]:
x_train, x_test, y_train, y_test = train_test_split(
...     scaled_X, scaled_Y, test_size=0.33, random_state=42)

In [105]:
type(x_train)

numpy.ndarray

In [120]:
x_train[1][:30]

array([0.        , 0.06659659, 0.14368977, 0.29301182, 0.18925203,
       0.68345705, 0.30724388, 0.11063309, 0.11635918, 0.0665054 ,
       0.10411149, 0.10222266, 0.06995462, 0.06788918, 0.10391914,
       0.08183005, 0.06680501, 0.11420428, 0.06719211, 0.06611242,
       0.20015899, 0.06730291, 0.09620035, 0.07380661, 0.07303334,
       0.26705956, 0.0672873 , 0.06606428, 0.45551248, 0.00337625])

In [121]:
x_train.shape

(24800, 512)

In [117]:
y_train.shape

(24800, 1)

# Transform data to timeseries for LSTM

In [122]:
generator = TimeseriesGenerator(x_train, y_train, length=25, batch_size=100)

In [123]:
type(generator)

keras.preprocessing.sequence.TimeseriesGenerator

In [125]:
generator[0][:50]

(array([[[0.        , 0.40564918, 0.070305  , ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.06659659, 0.14368977, ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.2046387 , 0.19846342, ..., 0.        ,
          0.        , 0.        ],
         ...,
         [0.        , 0.0646977 , 0.07745803, ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.07273405, 0.88147829, ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.09331661, 0.11679968, ..., 0.        ,
          0.        , 0.        ]],
 
        [[0.        , 0.06659659, 0.14368977, ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.2046387 , 0.19846342, ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.06680004, 0.23085075, ..., 0.        ,
          0.        , 0.        ],
         ...,
         [0.        , 0.07273405, 0.88147829, ..., 0.        ,
          0.        , 0.

In [126]:
print(generator[0][0].shape)

(100, 25, 512)


In [128]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM#, CuDNNLSTM


# mnist = tf.keras.datasets.mnist  # mnist is a dataset of 28x28 images of handwritten digits and their labels
# (x_train, y_train),(x_test, y_test) = mnist.load_data()  # unpacks images to x_train/x_test and labels to y_train/y_test

# x_train = x_train/255.0
# x_test = x_test/255.0

# print(x_train.shape)
# print(x_train[0].shape)

model = Sequential()

# IF you are running with a GPU, try out the CuDNNLSTM layer type instead (don't pass an activation, tanh is required)
model.add(LSTM(512, input_shape=(25,512), activation='relu', return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))



model.add(Dense(1, activation='softmax'))



opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

# Compile model
model.compile(
    loss='mse',
    optimizer='adam',
    metrics=['accuracy'],
)

model.fit_generator(generator,
          epochs=3)

  super(Adam, self).__init__(name, **kwargs)
  model.fit_generator(generator,


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f39b839a220>