## Before You Run
Create a `data` directory and upload the data files (the eval, test, and train CSVs) into it.

In [6]:
# ! mkdir data

mkdir: cannot create directory ‘data’: File exists


In [None]:
# install fasttext
! pip install fasttext

# install gdown to download fasttext model from google drive (with maximum speed!)
! pip install gdown

# install matplotlib to prevent unwelcome errors
! pip install matplotlib==3.1.3

! pip install hazm

### Import Libraries

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf 

from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import GlobalMaxPool1D, MaxPooling1D, GlobalMaxPooling1D, Conv1D
from sklearn.metrics import classification_report, confusion_matrix


from tensorflow.keras.callbacks import EarlyStopping
import fasttext

from hazm import word_tokenize, Normalizer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import re
import numpy as np

### Load Data

In [3]:
# Directory that holds train.csv, eval.csv and test.csv (trailing slash stripped
# so the paths below can be built with a single '/').
PATH = 'data/'
PATH = PATH.rstrip('/')

# Each split is read from CSV and its three columns are renamed to a common schema.
# Train
df_train = pd.read_csv(PATH + '/train.csv')
df_train.columns = ['index', 'comment', 'rate']

# Evaluation
df_eval = pd.read_csv(PATH + '/eval.csv')
df_eval.columns = ['index', 'comment', 'rate']

# Test
df_test = pd.read_csv(PATH + '/test.csv')
df_test.columns = ['index', 'comment', 'rate']

# Create labels: a comment is labelled 1 when its rate is >= 0, otherwise 0.
label_encoder = LabelEncoder()

# Fit the encoder on the training labels only and reuse that mapping for the
# other splits. Re-fitting per split would silently produce a different
# encoding for a split that happens to contain a single class.
train_y = label_encoder.fit_transform((df_train['rate'] >= 0).astype(int))
eval_y = label_encoder.transform((df_eval['rate'] >= 0).astype(int))
test_y = label_encoder.transform((df_test['rate'] >= 0).astype(int))

### Preprocess

In [4]:
# NOTE(review): `normalizer` is not referenced anywhere else in the visible notebook.
normalizer = Normalizer() # Hazm normalizer
# Pattern of the characters that clean_comment deletes from a comment: digits
# (\d), quotation marks, punctuation and Arabic-script diacritic marks.
# clean_comment substitutes every match with the empty string.
symbols_complete_reg = re.compile(r"(\d|\"|'ٍ|¬|[؛“،,”‘۔’’‘–]|[|\.÷+\]\[\)\(\:\-\?»\=\{}\*«»_…\؟!/ـ]|[۰'ٓ۫'ٔ]|[ٓٔ]|[ًٌٍْﹼ،َُِّ«ٰ»ٖء])")

# Replacement table: each key is a non-Persian Arabic-script character (or a
# presentation form) and each value is the text substituted for it. One entry
# maps to the empty string, i.e. that character is deleted.
_ARABIC_TO_PERSIAN = {
    "ۀ": "ه",
    "ة": "ت",
    "ي": "ی",
    "ؤ": "و",
    "إ": "ا",
    "ٹ": "ت",
    "ڈ": "د",
    "ئ": "ی",
    "ﻨ": "ن",
    "ﺠ": "ج",
    "ﻣ": "م",
    "ﷲ": "",
    "ﻳ": "ی",
    "ٻ": "ب",
    "ٱ": "ا",
    "ڵ": "ل",
    "ﭘ": "پ",
    "ﻪ": "ه",
    "ں": "ن",
    "ٶ": "و",
    "ٲ": "ا",
    "ہ": "ه",
    "ﻩ": "ه",
    "ك": "ک",
    "ﺆ": "و",
    "أ": "ا",
    "ﺪ": "د",
}

# Built once instead of on every call; keys are escaped so that none of them can
# ever be interpreted as regex syntax.
_ARABIC_KEYS_RE = re.compile("|".join(map(re.escape, _ARABIC_TO_PERSIAN)))


def remeove_arabic(text):
    """Replace the characters listed in ``_ARABIC_TO_PERSIAN`` with their mapped text.

    Args:
        text: The string to convert.

    Returns:
        A new string in which every occurrence of a key of the replacement
        table has been replaced by its value; all other characters are kept
        unchanged.
    """
    return _ARABIC_KEYS_RE.sub(lambda match: _ARABIC_TO_PERSIAN[match.group()], text)


# clean_text function
def clean_comment(text, allspace=True, punc=True, sentence=True, only_persian=True):
    """Normalize one raw comment before tokenization.

    Steps, in order: turn half-spaces (zero-width non-joiner), newlines and
    carriage returns into spaces; delete every match of
    ``symbols_complete_reg``; map characters through ``remeove_arabic``;
    collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The raw comment. ``None`` and float NaN (missing values) are
            treated as an empty comment; any other non-string is converted
            with ``str``.
        allspace, punc, sentence, only_persian: Accepted for backward
            compatibility; they are currently ignored.

    Returns:
        The cleaned comment as a string.
    """
    # Missing values would otherwise raise AttributeError on .replace below.
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    # Line breaks become spaces (not empty strings) so that the last word of a
    # line is not glued to the first word of the next one.
    text = text.replace('\u200c', ' ').replace('\n', ' ').replace('\r', ' ')
    # remove punctuations
    text = re.sub(symbols_complete_reg, "", text)
    # remove arabic letters
    text = remeove_arabic(text)
    # convert spaces to a one space and delete leading and trailing spaces
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [5]:
# Add a 'clean_comment' column holding the cleaned text to every split.
for split_df in (df_train, df_eval, df_test):
    split_df['clean_comment'] = split_df['comment'].apply(clean_comment)

In [6]:
# Display one cleaned training comment as a sanity check.
example_id = 500
example = df_train.loc[example_id, 'clean_comment']
example

'خیلی عالیه'

## FastText Embedding

### Download Skipgram Model

In [None]:
# Model 1: Dimension: 100 from https://github.com/taesiri/PersianWordVectors
# SKIPGRAM_MODEL_FILE_ID_1 = '1wPnMG9_GNUVdSgbznQziQc5nMWI3QKNz'
# !gdown --id $SKIPGRAM_MODEL_FILE_ID_1

# Model 2: Dimension: 300 from https://fasttext.cc/docs/en/pretrained-vectors.html
# Download and extract the archive, then remove wiki.fa.zip and wiki.fa.vec.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fa.zip
! unzip wiki.fa.zip
! rm -rf wiki.fa.zip
! rm -rf wiki.fa.vec

In [14]:
# Width of each row of the embedding matrix; it has to equal the vector size of
# the fastText model whose vectors are copied into that matrix.
EMBEDDING_LEN = 300 # 100 for Model 1 and 300 for Model 2

### Load FastText Model

In [7]:
# Load the pretrained fastText binary model; its word vectors are used to
# initialise the embedding matrix.
# Model 1:
# model_skipgram = fasttext.load_model('farsi-dedup-skipgram.bin')
# Model 2:
model_skipgram = fasttext.load_model('wiki.fa.bin')



In [8]:
# Build the word index from the cleaned training comments.
MAX_NUM_WORDS = 3000  # value passed to the Tokenizer as num_words

comments = df_train['clean_comment'].values
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(comments)

# One more than the number of indexed words.
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size : {}'.format(vocab_size))

Vocabulary Size : 4323


In [9]:
# Turn every training comment into its sequence of word indices.
encoded_comments = tokenizer.texts_to_sequences(comments)

# Print one comment next to its encoding.
sample_idx = 1
print("Comment : {}".format(comments[sample_idx]))
print("Corresponding Encoding : {}".format(encoded_comments[sample_idx]))

Comment : سلام به دوستای عزیزم عزاداری هاتون قبول باشه
Corresponding Encoding : [94, 2, 1716, 817, 1717, 818, 526, 68]


In [10]:
# Post-pad every encoded training comment to the length of the longest one.
SENT_MAX_LEN = max(map(len, encoded_comments))
padded_sequence = pad_sequences(
    encoded_comments,
    maxlen=SENT_MAX_LEN,
    padding='post',
)
print('Padding Shape: {}'.format(padded_sequence.shape))

Padding Shape: (800, 616)


In [16]:
# Initial embedding matrix: row i receives the fastText vector of the word
# whose tokenizer index is i. Rows the loop does not assign stay all-zero.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_LEN))

for word, i in tokenizer.word_index.items():
    # get_word_vector always returns a vector (fastText builds one from
    # sub-word n-grams for words it has not seen), so no None check is needed.
    embedding_matrix[i] = model_skipgram.get_word_vector(word)

print(f"Embedding Matrix Shape is: {embedding_matrix.shape}")

Embedding Matrix Shape is: (4323, 300)


In [17]:
# Encode the evaluation comments with the tokenizer that was fitted on the
# training comments, then pad them to the training sequence length.
# (The former texts_to_matrix call was dropped: its result was discarded.)
eval_comments = df_eval['clean_comment'].values
eval_encoded_comments = tokenizer.texts_to_sequences(eval_comments)
eval_padded_sequence = pad_sequences(eval_encoded_comments, maxlen=SENT_MAX_LEN, padding='post')

In [18]:
# Encode the test comments with the tokenizer that was fitted on the training
# comments, then pad them to the training sequence length.
# (The former texts_to_matrix call was dropped: its result was discarded.)
test_comments = df_test['clean_comment'].values
test_encoded_comments = tokenizer.texts_to_sequences(test_comments)
test_padded_sequence = pad_sequences(test_encoded_comments, maxlen=SENT_MAX_LEN, padding='post')

## LSTM Model Architecture

In [19]:
# LSTM constants
# Number of units of the second LSTM layer of model_1 (the one that does not
# return sequences).
LSTM_UNITS = 32

In [20]:
# Bidirectional-LSTM binary classifier on top of the pretrained embeddings.
# NOTE(review): the padding index 0 is not masked (mask_zero is not set on the
# Embedding layer) — confirm this is intended.
model_1 = Sequential()
# Embedding initialised from embedding_matrix and fine-tuned during training.
model_1.add(Embedding(vocab_size, EMBEDDING_LEN, input_length=SENT_MAX_LEN, weights=[embedding_matrix], trainable=True))
# No input_shape here: this is not the first layer, so its input shape is
# determined by the Embedding output.
model_1.add(Bidirectional(LSTM(EMBEDDING_LEN, return_sequences=True)))
model_1.add(Dropout(0.2))
model_1.add(Bidirectional(LSTM(LSTM_UNITS)))
model_1.add(Dropout(0.2))
model_1.add(Dense(EMBEDDING_LEN, activation='relu'))
model_1.add(Dropout(0.1))
# Single sigmoid unit: output is compared against the 0/1 labels.
model_1.add(Dense(1, activation='sigmoid'))
model_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 616, 300)          1296900   
                                                                 
 bidirectional (Bidirectiona  (None, 616, 600)         1442400   
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 616, 600)          0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               162048    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 300)               1

### Fit LSTM Model
You can run the cell below as many times as you want. Keep track of the validation accuracy and adjust `epochs` as needed. In most runs, I got my best results at the 5th and 10th epochs.

In [21]:
# Train the LSTM model on the training split, validating on the evaluation split.
model_1.fit(
    x=padded_sequence,
    y=train_y,
    validation_data=(eval_padded_sequence, eval_y),
    epochs=5,
    batch_size=32,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f81e01f0e10>

In [22]:
# Score the LSTM model on the test split (loss first, then accuracy).
test_metrics_1 = model_1.evaluate(test_padded_sequence, test_y, verbose=0)
loss_1, acc_1 = test_metrics_1
print(f'Test Accuracy: {acc_1}')

Test Accuracy: 0.7352941036224365


In [29]:
# Threshold the LSTM model's sigmoid outputs at 0.5 and report test metrics.
pred_1 = model_1.predict(test_padded_sequence)
y_pred_1 = (pred_1[:, 0] > 0.5).astype(int)
print(confusion_matrix(y_true=test_y, y_pred=y_pred_1))
print(classification_report(y_true=test_y, y_pred=y_pred_1))

[[ 21  31]
 [ 14 104]]
              precision    recall  f1-score   support

           0       0.60      0.40      0.48        52
           1       0.77      0.88      0.82       118

    accuracy                           0.74       170
   macro avg       0.69      0.64      0.65       170
weighted avg       0.72      0.74      0.72       170



## CNN Model Architecture

In [24]:
## CNN Constants
# Window width of the Conv1D layer of model_2.
KERNEL_SIZE = 3
# Number of Conv1D filters; also used as the size of model_2's hidden Dense layer.
FILTERS = 256

In [25]:
# 1-D CNN binary classifier on top of the frozen pretrained embeddings.
model_2 = Sequential([
    # Embedding initialised from embedding_matrix and kept fixed.
    Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix], trainable=False),
    Conv1D(filters=FILTERS, kernel_size=KERNEL_SIZE, activation='relu'),
    # Keep the maximum activation of every filter over the sequence.
    GlobalMaxPooling1D(),
    Dense(FILTERS, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model_2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 300)         1296900   
                                                                 
 conv1d (Conv1D)             (None, None, 256)         230656    
                                                                 
 global_max_pooling1d (Globa  (None, 256)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense_2 (Dense)             (None, 256)               65792     
                                                                 
 dense_3 (Dense)             (None, 1)                 257       
                                                                 
Total params: 1,593,605
Trainable params: 296,705
Non-trainable params: 1,296,900
______________________________________

### Fit CNN Model
You can run the cell below as many times as you want. Keep track of the validation accuracy and adjust `epochs` as needed. In most runs, I got my best result at the 5th epoch.

In [26]:
# Train the CNN model on the training split, validating on the evaluation split.
model_2.fit(
    x=padded_sequence,
    y=train_y,
    validation_data=(eval_padded_sequence, eval_y),
    epochs=5,
    batch_size=32,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f81776c0850>

In [27]:
# Score the CNN model on the test split and print the accuracy as a percentage.
test_metrics_2 = model_2.evaluate(test_padded_sequence, test_y, verbose=0)
loss_2, acc_2 = test_metrics_2
print('Test Accuracy: %f' % (acc_2*100))

Test Accuracy: 77.647060


In [31]:
# Threshold the CNN model's sigmoid outputs at 0.5 and report test metrics.
pred_2 = model_2.predict(test_padded_sequence)
y_pred_2 = (pred_2[:, 0] > 0.5).astype(int)
print(confusion_matrix(y_true=test_y, y_pred=y_pred_2))
print(classification_report(y_true=test_y, y_pred=y_pred_2))

[[ 18  34]
 [  4 114]]
              precision    recall  f1-score   support

           0       0.82      0.35      0.49        52
           1       0.77      0.97      0.86       118

    accuracy                           0.78       170
   macro avg       0.79      0.66      0.67       170
weighted avg       0.78      0.78      0.74       170

