In [1]:
from IPython.core.debugger import set_trace

#%load_ext nb_black

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time

plt.style.use(style="dark_background")

In [2]:
SENTIMENT_LABELLED_DATA_FILEPATH = '..\Data\Sentiment Analysis Dataset.csv'
DATA_PREPROCESSING_FILEPATH = '../Data Preprocessed'

TOKENIZER_OUTPUT_FILEPATH = 'tokenizer_100K.pickle'
MODEL_OUTPUT_FILEPATH = 'model_100K_glove'

Import Dataset

In [3]:
txt_data = pd.read_csv(SENTIMENT_LABELLED_DATA_FILEPATH , sep='|', names=['col1'])

#split data into columns with ','
txt_data = txt_data.col1.str.split(',',  3, expand=True)
txt_data.columns = list(txt_data.iloc[0])
txt_data = txt_data.drop(0)
txt_data.index = np.subtract(txt_data.index, 1)
# x = txt_data.groupby('Sentiment')
# l=[x.get_group(i)['SentimentText'] for i in x.groups]

In [4]:
dataset = pd.concat([txt_data['SentimentText'], txt_data['Sentiment']], axis = 1)
dataset.columns = ['text', 'target']

In [5]:
dataset

Unnamed: 0,text,target
0,is so sad for my APL frie...,0
1,I missed the New Moon trail...,0
2,omg its already 7:30 :O,1
3,.. Omgaga. Im sooo im gunna CRy. I'...,0
4,i think mi bf is cheating on me!!! ...,0
...,...,...
1577833,Zzzzzz.... Finally! Night tweeters!,1
1577834,"""Zzzzzzz, sleep well people """,1
1577835,ZzzZzZzzzZ... wait no I have homework.,0
1577836,"""ZzZzzzZZZZzzz meh, what am I doing up again? """,0


In [5]:
#suffle
dataset = dataset.sample(frac = 1)
dataset

Unnamed: 0,text,target
1024003,@madilovesmerder me 2 My MacBook is packed wi...,1
285751,@lodossheros no @bhs3133 isn't coming back unt...,0
32670,@agnsrms aww i left you pennies..as a joke tho...,0
1209250,today was fun. and it's going to get even be...,1
413343,"""@officialjman hey jordan, so im addicted to g...",1
...,...,...
1539266,Stupid icky churning tummy...what is your prob...,0
1332733,@unifex. Documentation often does suck. But I ...,0
993420,just got back from the beach &amp; smells like...,1
1223809,Suddenly I see this is what I wanna be. Sudden...,1


In [6]:
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.append(DATA_PREPROCESSING_FILEPATH)
from data_preprocess import *

import pickle

In [7]:
text = NLP_preprocess(dataset)
text.preprocess_data()

text.set_tokenizer()
import pickle

# saving
with open(TOKENIZER_OUTPUT_FILEPATH, 'wb') as handle:
    pickle.dump(text.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
train_dataset = dataset[:100000]
#train_dataset = dataset[:1000000]
train_labels = train_dataset.target.astype(float)
train_padded = text.tokenize_and_pad(train_dataset, train = True)
print(train_padded)
# saving
with open(TOKENIZER_OUTPUT_FILEPATH, 'wb') as handle:
    pickle.dump(text.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

[[ 13244     32   1458 ...      0      0      0]
 [ 27855  27856    209 ...      0      0      0]
 [ 17296    230    147 ...      0      0      0]
 ...
 [106972 106973  19662 ...      0      0      0]
 [106974      8   1310 ...      0      0      0]
 [ 16763     50    702 ...      0      0      0]]


In [9]:
text.set_embedding_matrix()

In [10]:
text.embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.67391002,  0.54449999, -0.44868001, ...,  0.49522001,
         1.27960002, -0.75579   ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [11]:
test_dataset = dataset[:20000]
test_labels = test_dataset.target.astype(float)
test_padded = text.tokenize_and_pad(test_dataset, train = False)
print(test_padded)

[[13244    32  1458 ...     0     0     0]
 [27855 27856   209 ...     0     0     0]
 [17296   230   147 ...     0     0     0]
 ...
 [  201     9   286 ...     0     0     0]
 [43631 43632  3769 ...     0     0     0]
 [ 1747  1665 10509 ...     0     0     0]]


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

model = Sequential()

# trainable = False
#bidirectional

#Glove
model.add(
    Embedding(
        text.num_words,
        100,
        embeddings_initializer = Constant(text.embedding_matrix),
        input_length = text.max_length,
        trainable = False,
    )
)

#basic
#model.add(Embedding(text.num_words, 8, input_length=text.max_length))
model.add(LSTM(100, dropout=0.1))
model.add(Dense(1, activation="sigmoid"))

optimizer = Adam(learning_rate=3e-4)

model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['accuracy'])

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 100)           10697600  
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 10,778,101
Trainable params: 80,501
Non-trainable params: 10,697,600
_________________________________________________________________


In [14]:
history = model.fit(
    train_padded, train_labels, epochs=20, validation_data=(test_padded, test_labels),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [16]:
model.save(MODEL_OUTPUT_FILEPATH)



INFO:tensorflow:Assets written to: model_100K_glove\assets


INFO:tensorflow:Assets written to: model_100K_glove\assets


In [30]:
import random
#test_neg = random.choice(neg.values)
i = random.choice(test_dataset.index)
if test_dataset.loc[i]['target'] == '0':
    print('negative: ', test_dataset.loc[i]['text'])
else:
    print('positive: ', test_dataset.loc[i]['text'])
test_sequences = text.tokenizer.texts_to_sequences([test_dataset.loc[i]['text']])
test_padded2 = pad_sequences(
    test_sequences, maxlen=text.max_length, padding="post", truncating="post"
)
model.predict(test_padded2)

negative:  thi m4 v cï¿½i u b iï¿½n


array([[0.50002813]], dtype=float32)

In [31]:
# import keras
# model = keras.models.load_model('C:/Users/Konst/Desktop/PythonNotebooks')

In [None]:
# model.summary()

In [None]:
model.save('')