## Before You Run
Make a `data` directory and upload the data files (the eval, test, and train CSVs) into it.

In [1]:
! mkdir data

In [None]:
! pip install hazm
! pip install ktrain # ktrain is a lightweight wrapper for the deep learning library TensorFlow Keras

### Import Libraries

In [3]:
from tensorflow.keras.layers import GlobalMaxPool1D, MaxPooling1D, GlobalMaxPooling1D, Conv1D
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns

import re
import pandas as pd
import numpy as np

from hazm import word_tokenize, Normalizer

import ktrain
from ktrain import text

### Load Data

In [9]:
# Directory that holds train.csv, eval.csv and test.csv.
# The trailing slash is stripped so the file names can be appended with '/'.
PATH = 'data/'
PATH = PATH.rstrip('/')

# Classes
# ktrain uses the position of a name in this list as the integer class id.
# The label cell encodes `rate >= 0` as 1 and everything else as 0, so index 0
# has to be 'Negative' and index 1 'Positive'; the previous order swapped the
# names shown in the validation report.
# NOTE(review): assumes a negative `rate` marks a negative comment - confirm against the dataset.
class_names = ['Negative', 'Positive']

# Every split is expected to have exactly three columns, renamed here to a common schema.

# Train
df_train = pd.read_csv(PATH + '/train.csv')
df_train.columns = ['index', 'comment', 'rate']

# Evaluation
df_eval = pd.read_csv(PATH + '/eval.csv')
df_eval.columns = ['index', 'comment', 'rate']

# Test
df_test = pd.read_csv(PATH + '/test.csv')
df_test.columns = ['index', 'comment', 'rate']

### Preprocess

In [11]:
# Hazm normalizer.
# NOTE(review): not referenced by the cleaning functions defined in this cell - confirm it is used elsewhere.
normalizer = Normalizer()
# One compiled pattern for every character that clean_comment deletes:
# digits (\d), quotes, Latin and Arabic punctuation, the tatweel/kashida,
# Arabic diacritics and the standalone hamza. Each match is replaced by ''.
symbols_complete_reg = re.compile(r"(\d|\"|'ٍ|¬|[؛“،,”‘۔’’‘–]|[|\.÷+\]\[\)\(\:\-\?»\=\{}\*«»_…\؟!/ـ]|[۰'ٓ۫'ٔ]|[ٓٔ]|[ًٌٍْﹼ،َُِّ«ٰ»ٖء])")

# Arabic-script letter variants and presentation forms, each mapped to the
# Persian letter that replaces it (the ligature "ﷲ" is removed outright).
# Built once at definition time rather than on every call.
_ARABIC_TO_PERSIAN = {
    "ۀ": "ه",
    "ة": "ت",
    "ي": "ی",
    "ؤ": "و",
    "إ": "ا",
    "ٹ": "ت",
    "ڈ": "د",
    "ئ": "ی",
    "ﻨ": "ن",
    "ﺠ": "ج",
    "ﻣ": "م",
    "ﷲ": "",
    "ﻳ": "ی",
    "ٻ": "ب",
    "ٱ": "ا",
    "ڵ": "ل",
    "ﭘ": "پ",
    "ﻪ": "ه",
    "ں": "ن",
    "ٶ": "و",
    "ٲ": "ا",
    "ہ": "ه",
    "ﻩ": "ه",
    "ك": "ک",
    "ﺆ": "و",
    "أ": "ا",
    "ﺪ": "د",
}
# Alternation over all keys; re.escape keeps the pattern literal even if a
# regex metacharacter is ever added to the mapping.
_ARABIC_KEYS_REG = re.compile("|".join(map(re.escape, _ARABIC_TO_PERSIAN)))


def remeove_arabic(text):
    """Replace Arabic-script letter variants with their Persian equivalents.

    Args:
        text (str): input string.

    Returns:
        str: `text` with every key of the mapping replaced by its mapped
        value; all other characters are left unchanged.
    """
    return _ARABIC_KEYS_REG.sub(lambda match: _ARABIC_TO_PERSIAN[match.group()], text)


# clean_text function
def clean_comment(text, allspace=True, punc=True, sentence=True, only_persian=True):
    """Clean a single raw comment before it is fed to the model.

    Steps, in order: replace zero-width non-joiners with a space and drop
    line breaks, delete everything matched by `symbols_complete_reg`,
    map Arabic-script letter variants to Persian via `remeove_arabic`,
    then collapse whitespace runs and trim both ends.

    Args:
        text: the raw comment. Non-string values (e.g. the NaN that pandas
            produces for an empty CSV cell) yield an empty string.
        allspace, punc, sentence, only_persian: currently unused; kept so
            existing calls that pass them keep working.

    Returns:
        str: the cleaned comment.
    """
    # Missing values arrive as float NaN / None and have no str methods
    if not isinstance(text, str):
        return ""
    # turn half-spaces (ZWNJ) into regular spaces; drop '\n' and '\r'
    text = text.replace('\u200c', ' ').replace('\n', '').replace('\r', '')
    # remove punctuations, digits and diacritics
    text = symbols_complete_reg.sub("", text)
    # map arabic letters to persian ones
    text = remeove_arabic(text)
    # collapse whitespace runs to one space (raw string: "\s" in a plain
    # literal is an invalid escape sequence) and trim both ends
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [12]:
# Add a cleaned copy of the raw comments to each of the three splits
for split_df in (df_train, df_eval, df_test):
    split_df['clean_comment'] = split_df['comment'].apply(clean_comment)

In [13]:
# Create Labels
label_encoder = LabelEncoder()

# X
x_train = df_train['clean_comment'].values
x_eval = df_eval['clean_comment'].values
x_test = df_test['clean_comment'].values

# Y
# Binary target: 1 when rate >= 0, otherwise 0.
# The encoder is fitted on the training split only and that same mapping is
# reused for eval/test. Refitting per split would silently remap the labels
# of any split that happens to contain a single class.
y_train = label_encoder.fit_transform((df_train['rate'] >= 0).astype(int))
y_eval = label_encoder.transform((df_eval['rate'] >= 0).astype(int))
y_test = label_encoder.transform((df_test['rate'] >= 0).astype(int))

In [None]:
# Build, fine-tune and validate the classifier (ktrain's Transformer wraps the transformers library)

MODEL_NAME = 'HooshvareLab/distilbert-fa-zwnj-base'  # swap in any other checkpoint name here

# Hyperparameters, gathered in one place so they are easy to tune
MAX_LEN = 500      # maximum sequence length passed to the preprocessor
BATCH_SIZE = 6
MAX_LR = 5e-5      # peak learning rate of the one-cycle schedule
EPOCHS = 4

t = text.Transformer(MODEL_NAME, maxlen=MAX_LEN, class_names=class_names)

# Preprocess the training and evaluation splits
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_eval, y_eval)

# Model and learner
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=BATCH_SIZE)

# Train, then report metrics on the validation data
learner.fit_onecycle(MAX_LR, EPOCHS)
learner.validate(class_names=t.get_classes())  # class_names must be string values

Downloading:   0%|          | 0.00/500 [00:00<?, ?B/s]

preprocessing train...
language: fa
train sequence lengths:
	mean : 23
	95percentile : 59
	99percentile : 126


Downloading:   0%|          | 0.00/292 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Is Multi-Label? False
preprocessing test...
language: fa
test sequence lengths:
	mean : 24
	95percentile : 49
	99percentile : 184


Downloading:   0%|          | 0.00/434M [00:00<?, ?B/s]



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/4
  7/134 [>.............................] - ETA: 34:30 - loss: 0.5532 - accuracy: 0.8095