In [1]:
# Install gwpy; -q and --quiet are both passed to keep pip's output short.
!pip install -q gwpy --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m


In [2]:
# Pin matplotlib to 3.1.3.
# NOTE(review): the recorded output of this cell shows pip reporting that
# gwpy 2.1.3 requires matplotlib>=3.3.0, so this pin conflicts with the gwpy
# install earlier in the notebook -- confirm which of the two is needed.
!pip install matplotlib==3.1.3

Collecting matplotlib==3.1.3
  Using cached matplotlib-3.1.3-cp37-cp37m-manylinux1_x86_64.whl (13.1 MB)
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.5.1
    Uninstalling matplotlib-3.5.1:
      Successfully uninstalled matplotlib-3.5.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gwpy 2.1.3 requires matplotlib>=3.3.0, but you have matplotlib 3.1.3 which is incompatible.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
Successfully installed matplotlib-3.1.3


### Load Data

In [3]:
%%capture
# Execute the companion notebook; %%capture (which must stay on the first
# line of the cell) suppresses everything it prints.
# NOTE(review): later cells read df_train and df_eval without defining them;
# presumably load_data.ipynb creates them -- confirm.
%run load_data.ipynb

## FastText Embedding

In [4]:
# Install the fasttext Python package, which is imported further down to load
# the skipgram model.
# NOTE(review): no version is pinned here, unlike the matplotlib install.
!pip install fasttext



### Import Libraries

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf 

from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping

### Download Skipgram Model

In [7]:
# Install gdown, the command-line downloader invoked below.
! pip install gdown

# Google Drive file id of the skipgram model; IPython expands
# $SKIPGRAM_MODEL_FILE_ID into the shell command on the next line.
SKIPGRAM_MODEL_FILE_ID = '1wPnMG9_GNUVdSgbznQziQc5nMWI3QKNz'
# The recorded output shows a 4.37 GB file saved as farsi-dedup-skipgram.bin.
# NOTE(review): nothing here checks whether that file already exists before
# invoking gdown again.
!gdown --id $SKIPGRAM_MODEL_FILE_ID 

Downloading...
From: https://drive.google.com/uc?id=1wPnMG9_GNUVdSgbznQziQc5nMWI3QKNz
To: /content/farsi-dedup-skipgram.bin
100% 4.37G/4.37G [01:07<00:00, 64.5MB/s]


### FastText Constants

In [8]:
# EMBEDDING_LEN: number of columns in the embedding matrix (one word vector per row).
# SENT_MAX_LEN: fixed length every encoded comment is padded to.
EMBEDDING_LEN, SENT_MAX_LEN = 100, 50

### Load FastText Model

In [9]:
import fasttext

# Load the pretrained skipgram binary from the working directory (the file
# name matches the one shown in the gdown download output).
skipgram_model_path = 'farsi-dedup-skipgram.bin'
model_skipgram = fasttext.load_model(skipgram_model_path)

In [10]:
# Create a Keras tokenizer (num_words=3000) and fit it on the training
# comments; its word_index is what every later encoding step relies on.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=3000)
comments = df_train['clean_comment'].values
tokenizer.fit_on_texts(comments)

In [11]:
# One extra slot on top of the fitted words, so that index len(word_index)
# is a valid row when vocab_size is used to size the embedding matrix.
# word_index covers every word seen during fitting: the recorded output
# (4679) is larger than the tokenizer's num_words=3000.
vocab_size = 1 + len(tokenizer.word_index)
vocab_message = 'Vocabulary Size : {}'
print(vocab_message.format(vocab_size))

Vocabulary Size : 4679


In [12]:
# Convert each training comment into a list of integer word indices using the
# fitted tokenizer.
encoded_comments = tokenizer.texts_to_sequences(comments)

In [15]:
# Show the second training comment next to its integer encoding.
for template, value in (
    ("Comment : {}", comments[1]),
    ("Corresponding Encoding : {}", encoded_comments[1]),
):
    print(template.format(value))

Comment : ['سلام', 'به', 'دوستای', 'عزیزم', 'عزاداری', 'هاتون', 'قبول', 'باشه']
Corresponding Encoding : [79, 2, 1840, 831, 1841, 832, 525, 60]


In [16]:
# Bring every encoded training comment to exactly SENT_MAX_LEN entries;
# padding='post' appends the fill values after the real tokens.
padded_sequence = pad_sequences(
    sequences=encoded_comments,
    padding='post',
    maxlen=SENT_MAX_LEN,
)

In [17]:
# Sanity check on the padded array: one row per comment, SENT_MAX_LEN columns
# (the recorded output is (800, 50)).
print('Padding Shape: {}'.format(padded_sequence.shape))

Padding Shape: (800, 50)


In [19]:
# Initial embedding matrix: row i holds the fastText vector of the word whose
# tokenizer index is i. Rows that are never assigned (index 0, which
# word_index does not use) stay all zeros.
# NOTE(review): `np` is not imported in the visible cells -- presumably it is
# brought into scope by load_data.ipynb; confirm.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_LEN))

for word, i in tokenizer.word_index.items():
  # get_word_vector always returns an array (fastText builds vectors for
  # unseen words from subword n-grams), so no "word not found" branch is needed.
  embedding_matrix[i] = model_skipgram.get_word_vector(word)

## LSTM Model Architecture

In [21]:
# LSTM constants
# Number of hidden units intended for the LSTM layer of the model.
LSTM_UNITS = 50

In [None]:
# Binary classifier: embedding lookup (initialised from embedding_matrix and
# fine-tuned during training) -> LSTM -> single sigmoid output.
# All sizes come from the notebook constants EMBEDDING_LEN, SENT_MAX_LEN and
# LSTM_UNITS so they stay in sync with the padding and embedding-matrix cells.
model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_LEN, input_length=SENT_MAX_LEN, weights=[embedding_matrix], trainable=True))
model.add(SpatialDropout1D(0.7))
model.add(LSTM(LSTM_UNITS, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# The evaluation comments are encoded with the tokenizer that was fitted on
# the training comments. Fitting a fresh tokenizer on df_eval would assign
# different integer ids to the same words, so the ids fed to the model would
# no longer match the rows of the embedding matrix built from the training
# word_index.
eval_comments = df_eval['clean_comment'].values

In [None]:
# Vocabulary size of the tokenizer used for the evaluation comments; report
# the value computed here rather than the training-side `vocab_size`.
eval_vocab_size = len(tokenizer.word_index) + 1
print('vocab size: {}'.format(eval_vocab_size))

In [None]:
# Encode the evaluation comments as integer sequences with whatever
# `tokenizer` is currently bound to.
eval_encoded_comments = tokenizer.texts_to_sequences(eval_comments)

In [None]:
# Pad the encoded evaluation comments exactly like the training ones
# (same SENT_MAX_LEN, padding appended after the tokens) so both arrays have
# the input width the model expects.
eval_padded_sequence = pad_sequences(eval_encoded_comments, maxlen=SENT_MAX_LEN, padding='post')
print('Padding Sequence Shape : {}'.format(eval_padded_sequence.shape))

In [None]:
# Embedding matrix for the evaluation vocabulary: row i holds the fastText
# vector of the word with tokenizer index i; unassigned rows stay zero.
# NOTE(review): no later visible cell reads eval_embedding_matrix -- the model
# is built from `embedding_matrix` only; confirm whether this cell is needed.
eval_embedding_matrix = np.zeros((eval_vocab_size, EMBEDDING_LEN))

for word, i in tokenizer.word_index.items():
  # get_word_vector always returns an array (subword-based for unseen words),
  # so every indexed word gets a row.
  eval_embedding_matrix[i] = model_skipgram.get_word_vector(word)

### Creating Labels based on data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to the evaluation labels. Re-fitting on df_eval could map a
# label to a different integer whenever the two sets do not contain exactly the
# same classes; transform() instead raises if df_eval holds an unseen label.
label_encoder = LabelEncoder()
train_y = label_encoder.fit_transform(df_train['label'])
eval_y = label_encoder.transform(df_eval['label'])

### Fit LSTM Model

In [None]:
# Train for 5 epochs in batches of 32, validating on the evaluation set after
# each epoch.
# NOTE: the value returned by fit() is stored back under the name `model`, so
# from here on `model` is the training-history object (the plotting cell reads
# `model.history[...]` from it), not the network itself.
model = model.fit(
    x=padded_sequence,
    y=train_y,
    validation_data=(eval_padded_sequence, eval_y),
    epochs=5,
    batch_size=32,
)

### Plot Training History

In [None]:
plt.style.use('ggplot')

def plot_history(model):
    """Plot training vs. validation accuracy and loss, one point per epoch.

    Args:
        model: Either the object returned by ``fit`` (its ``history``
            attribute is a dict of per-epoch metric lists) or a fitted Keras
            model (its ``history`` attribute is that returned object).
            The dict must contain the keys 'accuracy', 'val_accuracy',
            'loss' and 'val_loss'.

    Returns:
        None. Draws a 1x2 figure: accuracy on the left, loss on the right.
    """
    metrics = model.history
    if not isinstance(metrics, dict):
        # A fitted model was passed: unwrap the history object it carries.
        metrics = metrics.history

    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(metrics['accuracy']) + 1)

    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))

    ax_acc.plot(epochs, metrics['accuracy'], 'b', label='Training acc')
    ax_acc.plot(epochs, metrics['val_accuracy'], 'r', label='Validation acc')
    ax_acc.set_title('Training and validation accuracy')
    ax_acc.set_xlabel('Epoch')
    ax_acc.legend()

    ax_loss.plot(epochs, metrics['loss'], 'b', label='Training loss')
    ax_loss.plot(epochs, metrics['val_loss'], 'r', label='Validation loss')
    ax_loss.set_title('Training and validation loss')
    ax_loss.set_xlabel('Epoch')
    ax_loss.legend()

plot_history(model)