-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
107 lines (90 loc) · 3.7 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#%%
import pandas as pd
import numpy as np
# Load the labelled corpus; later cells read its `content` and `search_word` columns.
df = pd.read_csv("FINAL_DATA.csv")
#%%
# Whitespace-delimited word count of every document.
content_column = df['content']
df['word_length'] = content_column.map(lambda text: len(text.split()))
#%%
from sklearn.model_selection import train_test_split
# 80/20 split, stratified on the label column so both parts keep the same class mix.
labels = df['search_word']
X_train, X_test, y_train, y_test = train_test_split(
    df['content'],
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
#%%
from keras.preprocessing.text import Tokenizer
# Upper bound on the vocabulary the tokenizer uses when converting texts to
# integer sequences; it is also the number of rows in the embedding matrix.
num_words = 10000
tokenizer = Tokenizer(num_words=num_words, lower=True)
# Fit on the training split only: building the vocabulary from train + test
# would let word frequencies of the held-out texts influence which words are kept.
tokenizer.fit_on_texts(X_train)
#%%
# Size of the fitted word -> index mapping.
len(tokenizer.word_index)
#%%
# Longest document, measured in whitespace-separated words.
df.word_length.max()
# %%
from keras.preprocessing.sequence import pad_sequences
# Fixed length every sequence is padded (at the end) or cut to.
MAX_SEQUENCE_LENGTH = 171

# Texts -> lists of integer word indices.
X_train_ = tokenizer.texts_to_sequences(X_train)
X_test_ = tokenizer.texts_to_sequences(X_test)

# Integer lists -> rectangular arrays of width MAX_SEQUENCE_LENGTH.
X_train_pad = pad_sequences(X_train_, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
X_test_pad = pad_sequences(X_test_, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
# %%
print(X_train_pad.shape, X_test_pad.shape)
# %%
import gensim.downloader as api
# Pretrained word vectors fetched through the gensim downloader.
GLOVE_MODEL_NAME = 'glove-wiki-gigaword-300'
glove_gensim = api.load(GLOVE_MODEL_NAME)
# %%
# Length of a single word vector (sanity check on one known word).
len(glove_gensim['cat'])
#%%
# Dump the word -> row-index mapping of the loaded vectors.
print(glove_gensim.key_to_index)
#%%
# Length of each pretrained word vector.
vector_size = 300
# One row per tokenizer index, initialised to zero. Row 0 is never assigned
# because the tokenizer's word indices start at 1 (0 is the padding value), and
# words without a pretrained vector simply keep their all-zero row.
gensim_weight_matrix = np.zeros((num_words, vector_size))
for word, index in tokenizer.word_index.items():
    # Indices at or beyond num_words have no row in the matrix, so skip them.
    if index < num_words and word in glove_gensim.key_to_index:
        gensim_weight_matrix[index] = glove_gensim[word]
#%%
gensim_weight_matrix.shape
#%%from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding,Bidirectional
import tensorflow
from keras.models import Sequential
from tensorflow.compat.v1.keras.layers import CuDNNLSTM,CuDNNGRU
from tensorflow.keras.layers import Dropout
#%%
# Width of each embedding vector, read from the pretrained matrix so the layer
# always agrees with the weights it is initialised with.
EMBEDDING_DIM = gensim_weight_matrix.shape[1]

model_gensim = Sequential()
# Frozen embedding lookup initialised from gensim_weight_matrix.
model_gensim.add(Embedding(input_dim=num_words,  # vocabulary size (rows of the matrix)
                           output_dim=EMBEDDING_DIM,  # vector space dimension
                           input_length=X_train_pad.shape[1],  # padded sequence length
                           weights=[gensim_weight_matrix],
                           trainable=False))
model_gensim.add(Dropout(0.2))
# Three stacked bidirectional recurrent layers; only the last one collapses the
# sequence into a single vector (return_sequences=False) for the classifier head.
# NOTE(review): CuDNNLSTM is documented as GPU-only — confirm a GPU is available
# wherever this script runs.
model_gensim.add(Bidirectional(CuDNNLSTM(300, return_sequences=True)))
model_gensim.add(Dropout(0.2))
model_gensim.add(Bidirectional(CuDNNLSTM(400, return_sequences=True)))
model_gensim.add(Dropout(0.2))
model_gensim.add(Bidirectional(CuDNNLSTM(300, return_sequences=False)))
# Three-way softmax output.
# NOTE(review): the sparse loss presumably needs y_train to hold integer class
# ids 0..2 — confirm against the `search_word` column.
model_gensim.add(Dense(3, activation='softmax'))
# `metrics` is given as a list, the form the Keras compile() API specifies;
# a bare string is rejected by newer Keras releases.
model_gensim.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#%%
model_gensim.summary()
#%%
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Stop once validation loss has not improved for 5 consecutive epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,
    verbose=1,
)
# Keep on disk only the weights with the best validation accuracy so far.
mc = ModelCheckpoint(
    './model_gensim.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
#%%
# NOTE(review): the held-out test split is passed as validation data, so early
# stopping and checkpoint selection are both driven by test-set performance.
fit_options = dict(
    epochs=25,
    batch_size=120,
    validation_data=(X_test_pad, y_test),
    verbose=1,
    callbacks=[es, mc],
)
history_gensim = model_gensim.fit(X_train_pad, y_train, **fit_options)
#%%
print(X_train_pad.shape, y_train.shape)
#%%
import seaborn as sns
import matplotlib.pyplot as plt
# %%
from keras.callbacks import EarlyStopping, ModelCheckpoint
# Further training run on the same model object, validating on a 10% slice
# taken from the training data.
epochs, batch_size = 5, 64
early_stop = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
history = model_gensim.fit(
    X_train_pad,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[early_stop],
)
# %%
# Accuracy curves recorded in history_gensim: training in blue, validation in red.
for metric_key, colour, legend_label in (
    ('accuracy', 'b', 'train accuracy'),
    ('val_accuracy', 'r', 'validation accuracy'),
):
    plt.plot(history_gensim.history[metric_key], c=colour, label=legend_label)
plt.legend(loc='lower right')
plt.show()