In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import string

from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Input, Flatten
from keras import regularizers
from keras import optimizers

from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
def wordprep(report):
    puncs = "".join(string.punctuation)
    tokens = report.split(' ')
    words = []
    for token in tokens:
        for punc in puncs:
            while punc in token:
                token = token.replace(punc, "")
        if (token != " "):
            words.append(token)
    stop_words = list(set(stopwords.words('english')))
    out = [x for x in words if x.lower() not in stop_words]
    df_out = " ".join(out)
    
    return ' '.join(df_out.split())

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
df_train = pd.read_csv("/content/drive/My Drive/kaggle_sentiment_data/train.tsv",delimiter="\t")

In [6]:
df_train

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [0]:
df_test = pd.read_csv("/content/drive/My Drive/kaggle_sentiment_data/test.tsv",delimiter="\t")

In [8]:
df_test

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine
...,...,...,...
66287,222348,11855,"A long-winded , predictable scenario ."
66288,222349,11855,"A long-winded , predictable scenario"
66289,222350,11855,"A long-winded ,"
66290,222351,11855,A long-winded


In [0]:
df_train["cleansed_txt"] = df_train["Phrase"].apply(lambda x: wordprep(x))
df_test["cleansed_txt"] = df_test["Phrase"].apply(lambda x: wordprep(x))

In [10]:
df_train

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,cleansed_txt
0,1,1,A series of escapades demonstrating the adage ...,1,series escapades demonstrating adage good goos...
1,2,1,A series of escapades demonstrating the adage ...,2,series escapades demonstrating adage good goose
2,3,1,A series,2,series
3,4,1,A,2,
4,5,1,series,2,series
...,...,...,...,...,...
156055,156056,8544,Hearst 's,2,Hearst
156056,156057,8544,forced avuncular chortles,1,forced avuncular chortles
156057,156058,8544,avuncular chortles,3,avuncular chortles
156058,156059,8544,avuncular,2,avuncular


In [11]:
df_train = df_train[df_train.cleansed_txt != '']
print(df_train)

        PhraseId  ...                                       cleansed_txt
0              1  ...  series escapades demonstrating adage good goos...
1              2  ...    series escapades demonstrating adage good goose
2              3  ...                                             series
4              5  ...                                             series
5              6  ...           escapades demonstrating adage good goose
...          ...  ...                                                ...
156055    156056  ...                                             Hearst
156056    156057  ...                          forced avuncular chortles
156057    156058  ...                                 avuncular chortles
156058    156059  ...                                          avuncular
156059    156060  ...                                           chortles

[154906 rows x 5 columns]


In [0]:
#To change lexicon size
nword1 = 128
nword2 = 256
nword3 = 512
nwords = nword1
tokenizer = Tokenizer(num_words=nwords, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True,split=' ')
tokenizer.fit_on_texts(df_train["cleansed_txt"].values)
tokenizer.fit_on_texts(df_test["cleansed_txt"].values)

In [13]:
df_train

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,cleansed_txt
0,1,1,A series of escapades demonstrating the adage ...,1,series escapades demonstrating adage good goos...
1,2,1,A series of escapades demonstrating the adage ...,2,series escapades demonstrating adage good goose
2,3,1,A series,2,series
4,5,1,series,2,series
5,6,1,of escapades demonstrating the adage that what...,2,escapades demonstrating adage good goose
...,...,...,...,...,...
156055,156056,8544,Hearst 's,2,Hearst
156056,156057,8544,forced avuncular chortles,1,forced avuncular chortles
156057,156058,8544,avuncular chortles,3,avuncular chortles
156058,156059,8544,avuncular,2,avuncular


In [0]:
X_train = tokenizer.texts_to_sequences(df_train["cleansed_txt"].values)
X_test = tokenizer.texts_to_sequences(df_test["cleansed_txt"].values)

In [15]:
X_train

[[8, 88, 8, 10, 6],
 [8],
 [],
 [],
 [8],
 [8],
 [],
 [8],
 [],
 [],
 [],
 [],
 [8],
 [8],
 [8],
 [8],
 [8],
 [],
 [],
 [],
 [88, 8, 10, 6],
 [88, 8, 10, 6],
 [88],
 [88],
 [8, 10, 6],
 [10, 6],
 [10, 6],
 [],
 [],
 [],
 [10, 6],
 [10, 6],
 [],
 [10, 6],
 [],
 [10, 6],
 [10, 6],
 [],
 [10, 6],
 [10, 6],
 [10, 6],
 [],
 [10, 6],
 [10, 6],
 [10],
 [6],
 [6],
 [6],
 [108],
 [108],
 [108],
 [108],
 [],
 [108],
 [108],
 [],
 [],
 [108],
 [],
 [],
 [],
 [],
 [],
 [],
 [14, 26, 34, 116, 12, 4],
 [14, 26],
 [14],
 [14],
 [],
 [26],
 [26],
 [],
 [],
 [],
 [],
 [26],
 [34, 116, 12, 4],
 [],
 [],
 [],
 [],
 [34, 116, 12, 4],
 [34, 116, 12, 4],
 [34],
 [116, 12, 4],
 [116, 12, 4],
 [116, 12],
 [116, 12],
 [116],
 [12],
 [4],
 [],
 [4],
 [4],
 [4],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [80, 40],
 []

In [0]:
#min_size = min([len(a) for a in X_train])
min_size=5

In [17]:
min_size

5

In [0]:

X_train = pad_sequences(X_train, padding="post")
X_test = pad_sequences(X_test, padding="post")
X_train = X_train[:, range(min_size*1)]
X_test = X_test[:, range(min_size*1)]

In [19]:
X_train

array([[ 8, 88,  8, 10,  6],
       [ 8,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       ...,
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0]], dtype=int32)

In [20]:
X_test[1]

array([0, 0, 0, 0, 0], dtype=int32)

In [0]:
embeddings_index = dict()
f = open("/content/drive/My Drive/Thesis/glove.6B.50d.txt", encoding="utf-8")
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()

In [0]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((nwords, 50))

In [0]:
for i in range(1, nwords):
    word = tokenizer.index_word[i]
    if word in embeddings_index.keys():
        embedding_vector = embeddings_index[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [0]:
#To change batch size
embed_dim = 50
lstm_out = 8
batch_size01 = 8
batch_size02 = 16
batch_size03 = 32
batch_size = batch_size01

In [52]:
#opt = optimizers.Adam(lr=1e-4, beta_1=0.8, beta_2=0.9, epsilon=0.2, decay=1e-2)
model = Sequential()
model.add(Embedding(nwords, embed_dim,input_length = X_train.shape[1], weights=[embedding_matrix], trainable=False))
model.add(LSTM(lstm_out,
               dropout = 0.3,
               recurrent_dropout = 0.3,
               kernel_initializer="lecun_uniform",
               recurrent_regularizer=regularizers.l2(0.001)))
model.add(Dense(5, kernel_initializer="random_uniform", activation="softmax"))
adam01 = optimizers.adam(lr=0.1)
model.compile(loss="binary_crossentropy", optimizer="adadelta",metrics = ["accuracy"])
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 5, 50)             6400      
_________________________________________________________________
lstm_2 (LSTM)                (None, 8)                 1888      
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 45        
Total params: 8,333
Trainable params: 1,933
Non-trainable params: 6,400
_________________________________________________________________
None


In [26]:
df_train

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,cleansed_txt
0,1,1,A series of escapades demonstrating the adage ...,1,series escapades demonstrating adage good goos...
1,2,1,A series of escapades demonstrating the adage ...,2,series escapades demonstrating adage good goose
2,3,1,A series,2,series
4,5,1,series,2,series
5,6,1,of escapades demonstrating the adage that what...,2,escapades demonstrating adage good goose
...,...,...,...,...,...
156055,156056,8544,Hearst 's,2,Hearst
156056,156057,8544,forced avuncular chortles,1,forced avuncular chortles
156057,156058,8544,avuncular chortles,3,avuncular chortles
156058,156059,8544,avuncular,2,avuncular


In [0]:
Y_train = pd.get_dummies(df_train["Sentiment"]).values
#Y_test = pd.get_dummies(df_test["TNM-stage"]).values

In [0]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=100)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)
rlrop = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=20)

In [29]:
X_train.shape

(154906, 5)

In [30]:
Y_train.shape

(154906, 5)

In [31]:
X_test.shape

(66292, 5)

In [32]:
history = model.fit(X_train, Y_train, batch_size =batch_size, epochs = 200, verbose = 1, shuffle=False, callbacks=[es])

Epoch 1/200
Epoch 2/200
   432/154906 [..............................] - ETA: 58s - loss: 0.3813 - accuracy: 0.8361 



Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 

In [34]:
score1,acc1 = model.evaluate(X_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score1))
print("acc: %.2f" % (acc1))

IndexError: ignored

In [0]:
My Drive/kaggle_sentiment_data/test.tsv

In [0]:
model.save('/content/drive/My Drive/kaggle_sentiment_data/model1.h5')

In [49]:
predictions1 = model.predict_classes(X_test)
print(predictions1[50])

2


In [46]:
predictions1.size

66292

In [45]:
predictions1.shape

(66292,)

In [42]:
predictions1_prob = model.predict_proba(X_test)
print(predictions1_prob)

[[0.03653471 0.19098471 0.5593346  0.17703134 0.03611458]
 [0.03653471 0.19098471 0.5593346  0.17703134 0.03611458]
 [0.03653471 0.19098471 0.5593346  0.17703134 0.03611458]
 ...
 [0.03653471 0.19098471 0.5593346  0.17703134 0.03611458]
 [0.03653475 0.19098493 0.55933523 0.17703153 0.03611352]
 [0.03653475 0.19098493 0.55933523 0.17703152 0.03611352]]
