# Mount Google Drive to the Dataset
## If run locally, do not need to run this section

In [5]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [6]:
cd /content/gdrive/My Drive/Project/Media

/content/gdrive/My Drive/Project/Media


In [11]:
ls

content_seg.txt  med250.model.bin  Sntlst.txt   wordMatrix.npz
[0m[01;34mDataset[0m/         Preprocess.ipynb  train.ipynb


# LSTM Text Classsifier by Keras

In [0]:
# -*- coding: utf-8 -*-
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction import text
from sklearn import decomposition, ensemble
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import LSTM, Embedding
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from gensim.models import Word2Vec


import pandas as pd
import numpy as np

###  Hyper Parameters



In [0]:
MAX_SEQUENCE_LENGTH = 60 # max length of each weibo
EMBEDDING_DIM = 100 # dimensions of word embedding
TEST_SPLIT = 0.2 # split the data to train and test data

In [0]:
# Load Word2Vec Model & Word Embedding Matrix

w2v_model = Word2Vec.load("Model/med250.model.bin")
word_file = np.load("Model/wordMatrix.npz")
embedding_matrix = word_file['arr_0']

tokenizer = Tokenizer()

# Get texts of aimed file
def getTexts():
    Texts = []
    with open("Sntlst.txt", "r", encoding='utf8') as f:
        lines = f.readlines()
        for line in lines:
            Texts.append(line.strip())
    return Texts
all_texts = getTexts()

# Make texts to sequences
tokenizer.fit_on_texts(all_texts)
sequences = tokenizer.texts_to_sequences(all_texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)

In [17]:
# Split Train and Test Data
y_0 = [[0] * 7907]
y_1 = [[1] * 7880]
y = np.append(y_0, y_1)
labels = to_categorical(np.asarray(y))

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=42)
print(X_train.shape, X_test.shape)

(10577, 60) (5210, 60)


In [18]:
# Get Embedding Matrix of Pretrained Word2Vec Model
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items(): 
    if word in w2v_model.wv:
        embedding_matrix[i] = np.asarray(w2v_model.wv[word],
                                         dtype='float32')
print(embedding_matrix.shape)

(59128, 100)


In [20]:
# Defined Embedding Layer
embedding_layer = Embedding(input_dim = len(word_index) + 1,
                            output_dim = EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# LSTM Model
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.fit(X_train, y_train, epochs=2, batch_size=128)
model.save('Model/word_vector.h5')
print(model.evaluate(X_test, y_test))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 60, 100)           5912800   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 202       
Total params: 5,993,402
Trainable params: 80,602
Non-trainable params: 5,912,800
_________________________________________________________________
Epoch 1/2
Epoch 2/2
[0.004930232965489965, 0.9990403071017274]
