## Before You Run
Create a `data` directory and upload the dataset files into it (`train.csv`, `eval.csv` and `test.csv`).

In [77]:
# ! mkdir data

In [None]:
# install fasttext
! pip install fasttext

# install gdown to download fasttext model from google drive (with maximum speed!)
! pip install gdown

# install matplotlib to prevent unwelcome errors
! pip install matplotlib==3.1.3

! pip install hazm

### Import Libraries

In [57]:
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf 

from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import GlobalMaxPool1D, MaxPooling1D, GlobalMaxPooling1D

from tensorflow.keras.callbacks import EarlyStopping
import fasttext

from hazm import word_tokenize, Normalizer
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import re
import numpy as np

### Load Data

In [5]:
# Directory holding train.csv / eval.csv / test.csv (trailing slash removed
# so the paths below can be joined with a single '/').
PATH = 'data/'
PATH = PATH.rstrip('/')

# Train
df_train = pd.read_csv(PATH + '/train.csv')
df_train.columns = ['index', 'comment', 'rate']

# Evaluation
df_eval = pd.read_csv(PATH + '/eval.csv')
df_eval.columns = ['index', 'comment', 'rate']

# Test
df_test = pd.read_csv(PATH + '/test.csv')
df_test.columns = ['index', 'comment', 'rate']

# Create labels: 1 when rate >= 0, otherwise 0.
# The encoder is fitted once, on the training labels, and that same mapping is
# reused for the other splits. Re-fitting per split would map the same class
# to a different integer whenever a split happens to contain only one class.
label_encoder = LabelEncoder()

train_y = label_encoder.fit_transform((df_train['rate'] >= 0).astype(int))
eval_y = label_encoder.transform((df_eval['rate'] >= 0).astype(int))
test_y = label_encoder.transform((df_test['rate'] >= 0).astype(int))

### Preprocess

In [8]:
# NOTE(review): `normalizer` is created here but is not referenced in the cells shown in this notebook.
normalizer = Normalizer() # Hazm normalizer
# One capturing group of alternatives: a digit (\d), quote characters, and
# several character classes of punctuation and diacritic marks.
# clean_comment() deletes every match of this pattern from the text.
symbols_complete_reg = re.compile(r"(\d|\"|'ٍ|¬|[؛“،,”‘۔’’‘–]|[|\.÷+\]\[\)\(\:\-\?»\=\{}\*«»_…\؟!/ـ]|[۰'ٓ۫'ٔ]|[ٓٔ]|[ًٌٍْﹼ،َُِّ«ٰ»ٖء])")

# Character substitutions applied by remeove_arabic(): each key is replaced by
# its value. "ﷲ" maps to the empty string, i.e. it is removed.
# (Duplicate keys from the earlier literal were dropped; the resulting dict is
# the same.)
_ARABIC_TO_PERSIAN = {
    u"ۀ" : u"ه",
    u"ة" : u"ت",
    u"ي" : u"ی",
    u"ؤ" : u"و",
    u"إ" : u"ا",
    u"ٹ" : u"ت",
    u"ڈ" : u"د",
    u"ئ" : u"ی",
    u"ﻨ" : u"ن",
    u"ﺠ" : u"ج",
    u"ﻣ" : u"م",
    u"ﷲ" : u"",
    u"ﻳ" : u"ی",
    u"ٻ" : u"ب",
    u"ٱ" : u"ا",
    u"ڵ" : u"ل",
    u"ﭘ" : u"پ",
    u"ﻪ" : u"ه",
    u"ں" : u"ن",
    u"ٶ" : u"و",
    u"ٲ" : u"ا",
    u"ہ" : u"ه",
    u"ﻩ" : u"ه",
    u"ك" : u"ک",
    u"ﺆ" : u"و",
    u"أ" : u"ا",
    u"ﺪ" : u"د"
}

# Built once when the cell runs instead of on every call.
_ARABIC_KEYS_RE = re.compile(r"(" + "|".join(_ARABIC_TO_PERSIAN.keys()) + r")")


def remeove_arabic(text):
    """Replace the characters listed in ``_ARABIC_TO_PERSIAN`` with their mapped values.

    Args:
        text: input string.

    Returns:
        A copy of ``text`` in which every occurrence of a mapping key has been
        substituted by the corresponding value; all other characters are left
        untouched.
    """
    return _ARABIC_KEYS_RE.sub(lambda match: _ARABIC_TO_PERSIAN[match.group()], text)


# clean_comment function
def clean_comment(text, allspace=True, punc=True, sentence=True, only_persian=True):
    """Clean one raw comment string before tokenisation.

    Steps, in order:
      1. replace zero-width non-joiners with a space and delete newline and
         carriage-return characters;
      2. delete every match of ``symbols_complete_reg``;
      3. substitute characters via ``remeove_arabic``;
      4. collapse each run of whitespace to one space and strip both ends.

    Args:
        text: the raw comment (str).
        allspace, punc, sentence, only_persian: kept so existing callers keep
            working; the body does not read them.

    Returns:
        The cleaned string.
    """
    # half-space (ZWNJ) becomes a real space; '\n' and '\r' are deleted outright
    # NOTE(review): deleting '\n' joins the words on either side of a line
    # break -- confirm that this is intended.
    text = text.replace('\u200c', ' ').replace('\n', '').replace('\r', '')
    # remove punctuation / digits / diacritics
    text = re.sub(symbols_complete_reg, "", text)
    # replace the characters handled by remeove_arabic
    text = remeove_arabic(text)
    # raw string: "\s" inside a plain literal is an invalid escape sequence
    text = re.sub(r"(\s)+", " ", text)
    return text.strip()

In [9]:
# Add a 'clean_comment' column to each split by cleaning its raw 'comment' column
for split_df in (df_train, df_eval, df_test):
    split_df['clean_comment'] = split_df['comment'].apply(clean_comment)

In [10]:
# Sanity check: show one cleaned training comment (row label 500)
example_id = 500
example = df_train.loc[example_id, 'clean_comment']
example

'خیلی عالیه'

## FastText Embedding

### Download Skipgram Model

In [19]:
# Model 1: Dimension: 100 from # https://github.com/taesiri/PersianWordVectors
# SKIPGRAM_MODEL_FILE_ID_1 = '1wPnMG9_GNUVdSgbznQziQc5nMWI3QKNz'
# !gdown --id $SKIPGRAM_MODEL_FILE_ID 

# Model 2: Dimension: 300 from https://fasttext.cc/docs/en/pretrained-vectors.html
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fa.zip
! unzip wiki.fa.zip
! rm -rf wiki.fa.zip
! rm -rf wiki.fa.vec
EMBEDDING_LEN = 300 # 100 for Model 1 and 300 for Model 2

--2022-01-10 13:14:16--  https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fa.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.74.142, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3631863356 (3.4G) [application/zip]
Saving to: ‘wiki.fa.zip’


2022-01-10 13:15:57 (34.3 MB/s) - ‘wiki.fa.zip’ saved [3631863356/3631863356]



### Load FastText Model

In [22]:
# Model 1 (alternative): fasttext.load_model('farsi-dedup-skipgram.bin')
# Model 2 (active): the wiki.fa.bin file extracted in the download cell
skipgram_model_path = 'wiki.fa.bin'
model_skipgram = fasttext.load_model(skipgram_model_path)



In [23]:
# Build the word index from the cleaned training comments only
comments = df_train['clean_comment'].to_numpy()
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=3000)
tokenizer.fit_on_texts(comments)

# NOTE(review): word_index holds every word seen, so vocab_size (4323 in the
# recorded output) is larger than num_words=3000 -- confirm this is intended.
vocab_size = 1 + len(tokenizer.word_index)
print(f'Vocabulary Size : {vocab_size}')

Vocabulary Size : 4323


In [24]:
# Turn every training comment into its list of word indices
encoded_comments = tokenizer.texts_to_sequences(comments)

# Show one comment next to its encoding
print(f"Comment : {comments[1]}")
print(f"Corresponding Encoding : {encoded_comments[1]}")

Comment : سلام به دوستای عزیزم عزاداری هاتون قبول باشه
Corresponding Encoding : [94, 2, 1716, 817, 1717, 818, 526, 68]


In [25]:
# Pad every encoded training comment (at the end) up to the longest one
SENT_MAX_LEN = max(map(len, encoded_comments))
padded_sequence = pad_sequences(encoded_comments, padding='post', maxlen=SENT_MAX_LEN)
print(f'Padding Shape: {padded_sequence.shape}')

Padding Shape: (800, 616)


In [29]:
# initial embedding matrix: one row per word index, EMBEDDING_LEN columns
embedding_matrix = np.zeros((vocab_size, EMBEDDING_LEN))

# fastText's get_word_vector returns a NumPy vector for any input string
# (out-of-vocabulary words are composed from sub-word n-grams), so the former
# `is not None` check could never be false and has been removed.
for word, i in tokenizer.word_index.items():
    embedding_matrix[i] = model_skipgram.get_word_vector(word)

print(f"Embedding Matrix Shape is: {embedding_matrix.shape}")

Embedding Matrix Shape is: (4323, 300)


In [36]:
# Encode the evaluation comments with the tokenizer that was fitted on the
# training comments, then pad to the training length.
# (A texts_to_matrix() call whose result was discarded has been removed.)
eval_comments = df_eval['clean_comment'].values
eval_encoded_comments = tokenizer.texts_to_sequences(eval_comments)
eval_padded_sequence = pad_sequences(eval_encoded_comments, maxlen=SENT_MAX_LEN, padding='post')

In [37]:
# Encode the test comments with the tokenizer that was fitted on the training
# comments, then pad to the training length.
# (A texts_to_matrix() call whose result was discarded has been removed.)
test_comments = df_test['clean_comment'].values
test_encoded_comments = tokenizer.texts_to_sequences(test_comments)
test_padded_sequence = pad_sequences(test_encoded_comments, maxlen=SENT_MAX_LEN, padding='post')

## LSTM Model Architecture

In [38]:
# LSTM constants
# Number of units passed to the second Bidirectional LSTM layer of the model
LSTM_UNITS = 32

In [67]:
# Bidirectional-LSTM binary classifier on top of the fastText-initialised embedding
model = Sequential()
# Embedding rows start from embedding_matrix and are fine-tuned (trainable=True)
model.add(Embedding(vocab_size, EMBEDDING_LEN, input_length=SENT_MAX_LEN, weights=[embedding_matrix], trainable=True))
# No input_shape here: this is not the first layer, and the previous (None, 1)
# hint contradicted the EMBEDDING_LEN-wide input the layer actually receives.
model.add(Bidirectional(LSTM(EMBEDDING_LEN, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(LSTM_UNITS)))
model.add(Dropout(0.2))
model.add(Dense(EMBEDDING_LEN, activation='relu'))
model.add(Dropout(0.1))
# Single sigmoid unit paired with binary cross-entropy for the 0/1 labels
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 616, 300)          1296900   
                                                                 
 bidirectional_7 (Bidirectio  (None, 616, 600)         1442400   
 nal)                                                            
                                                                 
 dropout_13 (Dropout)        (None, 616, 600)          0         
                                                                 
 bidirectional_8 (Bidirectio  (None, 64)               162048    
 nal)                                                            
                                                                 
 dropout_14 (Dropout)        (None, 64)                0         
                                                                 
 dense_9 (Dense)             (None, 300)             

### Fit LSTM Model

In [68]:
# Train for 5 epochs, validating on the evaluation split after each epoch.
# NOTE(review): this rebinds `model` to the object returned by fit(), so the
# cell cannot be re-run without rebuilding the network first; the evaluation
# cell relies on this by calling `model.model`.
model = model.fit(
    x=padded_sequence,
    y=train_y,
    validation_data=(eval_padded_sequence, eval_y),
    epochs=5,
    batch_size=32,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [70]:
# `model` holds the return value of fit(); the network is reached through `.model`
loss_lstm, acc_lstm = model.model.evaluate(x=test_padded_sequence, y=test_y, verbose=0)
print(f'Test Accuracy: {acc_lstm * 100:f}')

Test Accuracy: 75.882351


In [None]:
# 75.882351