In [1]:
# removes unnecessary logs
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# imports required for the training algorithm
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import json
import pickle
import random

In [2]:
# Hyper-parameters shared by the tokenizer, the padding step and the model.
vocabSize = 10_000             # tokenizer num_words and embedding input size
outputDim = 16                 # width of each embedding vector
maxInput = 200                 # every sequence is padded/truncated to this length
truncType = padType = 'post'   # truncate and pad at the end of a sequence
oov = '<OOV>'                  # placeholder token for out-of-vocabulary words

In [3]:
# Load the labelled tweets. JSON is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform default (which is not
# UTF-8 on Windows and can fail or garble non-ASCII tweet text).
with open("datasets/test.json", 'r', encoding="utf-8") as f:
    tweets = json.load(f)

# Shuffle in place, then split 80/20 into a training part and a held-out part.
# NOTE(review): no random seed is set, so the split differs on every run.
random.shuffle(tweets)
splitIndex = int(round(4*len(tweets)/5))
train = tweets[:splitIndex]
test = tweets[splitIndex:]

# Separate the text from its label; every record is read through its
# 'content' and 'label' keys.
xtrain = [tweet['content'] for tweet in train]
ytrain = [tweet['label'] for tweet in train]
xtest = [tweet['content'] for tweet in test]
ytest = [tweet['label'] for tweet in test]

In [4]:
# Fit the vocabulary on the training texts only; the held-out texts are
# encoded with that same vocabulary further down.
tokenizer = Tokenizer(num_words=vocabSize, oov_token=oov)
tokenizer.fit_on_texts(xtrain)

wordIndex = tokenizer.word_index

# Padding settings shared by both splits.
padOptions = dict(maxlen=maxInput, padding=padType, truncating=truncType)

# Texts -> integer sequences.
xtrainencoded = tokenizer.texts_to_sequences(xtrain)
xtestencoded = tokenizer.texts_to_sequences(xtest)

# Integer sequences -> fixed-length rows of maxInput entries.
xtrainpadded = pad_sequences(xtrainencoded, **padOptions)
xtestpadded = pad_sequences(xtestencoded, **padOptions)

# Final float32 arrays handed to the network. From here on xtrain/xtest hold
# arrays rather than the raw texts, so this cell is not meant to be re-run
# without re-running the loading cell first.
xtrain = np.asarray(xtrainpadded).astype(np.float32)
ytrain = np.asarray(ytrain).astype(np.float32)
xtest = np.asarray(xtestpadded).astype(np.float32)
ytest = np.asarray(ytest).astype(np.float32)

In [5]:
# Architecture: embedding -> mean over the sequence axis -> small dense
# hidden layer -> single sigmoid unit.
networkLayers = [
    tf.keras.layers.Embedding(vocabSize, outputDim, input_length=maxInput),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(networkLayers)

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# NOTE(review): the saved cell output reports "Epoch N/20", so it appears to
# come from a run with a different numEpochs; re-run the cell to refresh it.
numEpochs = 10
print("training the model")

# The held-out split is passed as validation data, so the val_* metrics are
# computed on the same data the evaluation cell uses later.
history = model.fit(
    xtrain,
    ytrain,
    epochs=numEpochs,
    validation_data=(xtest, ytest),
    verbose=2,
)

training the model
Epoch 1/20
750/750 - 8s - loss: 0.4683 - accuracy: 0.8378 - val_loss: 0.2432 - val_accuracy: 0.9172
Epoch 2/20
750/750 - 3s - loss: 0.1705 - accuracy: 0.9435 - val_loss: 0.1382 - val_accuracy: 0.9527
Epoch 3/20
750/750 - 3s - loss: 0.1071 - accuracy: 0.9641 - val_loss: 0.1056 - val_accuracy: 0.9647
Epoch 4/20
750/750 - 3s - loss: 0.0793 - accuracy: 0.9737 - val_loss: 0.0941 - val_accuracy: 0.9713
Epoch 5/20
750/750 - 3s - loss: 0.0647 - accuracy: 0.9793 - val_loss: 0.0836 - val_accuracy: 0.9720
Epoch 6/20
750/750 - 3s - loss: 0.0549 - accuracy: 0.9818 - val_loss: 0.0798 - val_accuracy: 0.9732
Epoch 7/20
750/750 - 3s - loss: 0.0466 - accuracy: 0.9850 - val_loss: 0.0783 - val_accuracy: 0.9735
Epoch 8/20
750/750 - 4s - loss: 0.0403 - accuracy: 0.9873 - val_loss: 0.0786 - val_accuracy: 0.9743
Epoch 9/20
750/750 - 3s - loss: 0.0360 - accuracy: 0.9882 - val_loss: 0.0806 - val_accuracy: 0.9737
Epoch 10/20
750/750 - 3s - loss: 0.0317 - accuracy: 0.9900 - val_loss: 0.0819 - v

In [None]:
%matplotlib inline
# graphs
import matplotlib.pyplot as plt

def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch values (here, the return value of ``model.fit``).
        string: metric name to plot; the ``'val_' + string`` series is drawn
            on the same axes.
    """
    val_key = 'val_' + string
    # Training curve first, validation curve second, matching the legend order.
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()
  
# One figure per tracked metric: accuracy first, then loss.
for metricName in ("accuracy", "loss"):
    plot_graphs(history, metricName)

import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import pandas as pd
plt.figure()

# Round the sigmoid outputs to hard 0/1 predictions.
pred = tf.round(model.predict(xtestpadded))

# The matrix is built in the order [1, 0]; scikit-learn puts the true labels
# on the rows and the predicted labels on the columns.
classLabels = [1, 0]
array = confusion_matrix(tf.round(ytest), pred, labels=classLabels)

# Index the frame with the same label order used to build the matrix, so the
# heatmap ticks name the class each row/column really holds (range(2) would
# print 0, 1 over data that is ordered 1, 0).
df_cm = pd.DataFrame(array, index=classLabels, columns=classLabels)

sn.set(font_scale=1.4) # for label size
# fmt="d" prints the integer counts in full; the default format switches to
# scientific notation for counts of three or more digits.
ax = sn.heatmap(df_cm, annot=True, fmt="d", annot_kws={"size": 16}) # font size
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")

plt.show()

In [6]:
# The tokenizer file is written before model.save() runs, so the target
# directory has to exist already; create it (and any parents) if needed.
os.makedirs('savedModel/nn/basic', exist_ok=True)

# saving the tokenizer
with open('savedModel/nn/basic/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# saving trained model
model.save("savedModel/nn/basic/model")

INFO:tensorflow:Assets written to: savedModel/nn/basic/model\assets


In [None]:
# removes unnecessary logs
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# importing necessary libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Padding settings for inference; they repeat the values used when the
# training sequences were padded.
maxInput = 200                 # length every input sequence is padded/truncated to
truncType = padType = 'post'   # truncate and pad at the end of a sequence

In [None]:
# Restore the trained network from its saved-model directory.
model = tf.keras.models.load_model("savedModel/nn/basic/model")

# Restore the fitted tokenizer that was pickled next to the model.
# NOTE: pickle.load executes code embedded in the file, so only load a
# tokenizer.pickle that comes from a trusted source.
with open('savedModel/nn/basic/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
# Ask how many sentences to classify, then read them one by one.
n = int(input("no of sentences: "))
sentences = []
for sentenceNo in range(n):
    sentences.append(input("Enter sentence:"))

# Encode with the loaded tokenizer, then pad/truncate every sequence to
# maxInput entries.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(
    sequences,
    maxlen=maxInput,
    padding=padType,
    truncating=truncType,
)

In [None]:
# model.predict returns one row per sentence; the evaluation code in this
# notebook turns those scores into classes with tf.round. A bare int() would
# truncate instead, turning every score below 1.0 into 0, so round to the
# nearest class here as well. float() ensures round() yields a plain Python
# int whatever numeric scalar type the row holds.
pred = [
    int(round(float(score)))
    for row in model.predict(padded)
    for score in row
]

print(pred)