In [1]:
from IPython.core.debugger import set_trace

#%load_ext nb_black

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

plt.style.use(style="dark_background")

In [2]:
SENTIMENT_LABELLED_DATA_FILEPATH = '..\Data\Sentiment Analysis Dataset.csv'
DATA_PREPROCESSING_FILEPATH = '../Data Preprocessed'

TOKENIZER_OUTPUT_FILEPATH = 'tokenizer_100K.pickle'
MODEL_OUTPUT_FILEPATH = 'model_100K_glove'

Import Dataset

In [3]:
txt_data = pd.read_csv(SENTIMENT_LABELLED_DATA_FILEPATH , sep='|', names=['col1'])

#split data into columns with ','
txt_data = txt_data.col1.str.split(',',  3, expand=True)
txt_data.columns = list(txt_data.iloc[0])
txt_data = txt_data.drop(0)
txt_data.index = np.subtract(txt_data.index, 1)
# x = txt_data.groupby('Sentiment')
# l=[x.get_group(i)['SentimentText'] for i in x.groups]


In [4]:
dataset = pd.concat([txt_data['SentimentText'], txt_data['Sentiment']], axis = 1)
dataset.columns = ['text', 'target']

In [5]:
dataset

Unnamed: 0,text,target
0,is so sad for my APL frie...,0
1,I missed the New Moon trail...,0
2,omg its already 7:30 :O,1
3,.. Omgaga. Im sooo im gunna CRy. I'...,0
4,i think mi bf is cheating on me!!! ...,0
...,...,...
1577833,Zzzzzz.... Finally! Night tweeters!,1
1577834,"""Zzzzzzz, sleep well people """,1
1577835,ZzzZzZzzzZ... wait no I have homework.,0
1577836,"""ZzZzzzZZZZzzz meh, what am I doing up again? """,0


In [6]:
#suffle
dataset = dataset.sample(frac = 1)
dataset

Unnamed: 0,text,target
209621,"""@jackbakerrr My God, you freak. Good gooooooo...",1
1306135,"""watching twilight for no reson, and listening...",1
1204091,So my birthday is on Friday and STILL no plans,0
1447387,I need my favorite cuddle buddy right now.,0
425130,"""@sonnyjoeflangan goddammit, i missed it what...",0
...,...,...
796383,I don't really know what I'm doing...tryin out...,1
1440481,I hate the fact that I can't upload my photo ...,0
643448,"""Days of wonder going so quickly...trying to j...",1
856372,I don't get or feel sick but I don't feel 100%...,0


In [9]:
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.append(DATA_PREPROCESSING_FILEPATH)
from data_preprocess import *

import pickle

In [10]:
text = NLP_preprocess(dataset)
text.preprocess_data()

text.set_tokenizer()
import pickle

# saving
with open(TOKENIZER_OUTPUT_FILEPATH, 'wb') as handle:
    pickle.dump(text.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
train_dataset = dataset[:100000]
#train_dataset = dataset[:1000000]
train_labels = train_dataset.target.astype(float)
train_padded = text.tokenize_and_pad(train_dataset, train = True)
print(train_padded)
# saving
with open(TOKENIZER_OUTPUT_FILEPATH, 'wb') as handle:
    pickle.dump(text.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

[[ 27852    201   2019 ...      0      0      0]
 [    66    833  17317 ...      0      0      0]
 [   190    244     24 ...      0      0      0]
 ...
 [     9     11    105 ...      0      0      0]
 [  7872    236   1162 ...      0      0      0]
 [107119      2     31 ...      0      0      0]]


In [12]:
text.set_embedding_matrix()

In [13]:
text.embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.67391002,  0.54449999, -0.44868001, ...,  0.49522001,
         1.27960002, -0.75579   ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.37011999,  0.066369  ,  0.23375   , ..., -0.13676   ,
         0.18174   , -0.14895999],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [14]:
test_dataset = dataset[:20000]
test_labels = test_dataset.target.astype(float)
test_padded = text.tokenize_and_pad(test_dataset, train = False)
print(test_padded)

[[27852   201  2019 ...     0     0     0]
 [   66   833 17317 ...     0     0     0]
 [  190   244    24 ...     0     0     0]
 ...
 [   38     5    11 ...     0     0     0]
 [43697   284   259 ...     0     0     0]
 [ 3188   607  4653 ...     0     0     0]]


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

model = Sequential()

# trainable = False
#bidirectional

#Glove
model.add(
    Embedding(
        text.num_words,
        100,
        embeddings_initializer = Constant(text.embedding_matrix),
        input_length = text.max_length,
        trainable = False,
    )
)

#basic
for i in range(0,200):
#model.add(Embedding(text.num_words, 8, input_length=text.max_length))
    model.add(Dense(1000, activation='relu'))
model.add(LSTM(100, dropout=0.1))
model.add(Dense(1, activation="sigmoid"))

optimizer = Adam(learning_rate=3e-4)

model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['accuracy'])

In [32]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 50, 100)           10712200  
_________________________________________________________________
dense_1235 (Dense)           (None, 50, 1000)          101000    
_________________________________________________________________
dense_1236 (Dense)           (None, 50, 1000)          1001000   
_________________________________________________________________
dense_1237 (Dense)           (None, 50, 1000)          1001000   
_________________________________________________________________
dense_1238 (Dense)           (None, 50, 1000)          1001000   
_________________________________________________________________
dense_1239 (Dense)           (None, 50, 1000)          1001000   
_________________________________________________________________
dense_1240 (Dense)           (None, 50, 1000)         

In [33]:
history = model.fit(
    train_padded, train_labels, epochs=20, validation_data=(test_padded, test_labels),
)

Epoch 1/20
 263/3125 [=>............................] - ETA: 12:42 - loss: 0.6933 - accuracy: 0.4948

KeyboardInterrupt: 

In [18]:
model.save(MODEL_OUTPUT_FILEPATH)



INFO:tensorflow:Assets written to: model_100K_glove\assets


INFO:tensorflow:Assets written to: model_100K_glove\assets


In [19]:
import random
#test_neg = random.choice(neg.values)
i = random.choice(test_dataset.index)
if test_dataset.loc[i]['target'] == '0':
    print('negative: ', test_dataset.loc[i]['text'])
else:
    print('positive: ', test_dataset.loc[i]['text'])
test_sequences = text.tokenizer.texts_to_sequences([test_dataset.loc[i]['text']])
test_padded2 = pad_sequences(
    test_sequences, maxlen=text.max_length, padding="post", truncating="post"
)
model.predict(test_padded2)

negative:  ohmymandy lucky met


array([[0.8493518]], dtype=float32)

In [31]:
# import keras
# model = keras.models.load_model('C:/Users/Konst/Desktop/PythonNotebooks')

In [None]:
# model.summary()

In [None]:
#model.save('')