## Before You Run
make a `data` drectory and upload data (eval, test and train csvs)

In [None]:
! mkdir data

In [62]:
# install hazm
! pip install hazm



### Import Libraries

In [66]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from hazm import word_tokenize, Normalizer, Lemmatizer, Stemmer
from sklearn.preprocessing import LabelEncoder
import re

from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline


## Data

### Load Data

In [67]:
PATH = 'data/'
PATH = PATH.rstrip('/')

# Train
df_train = pd.read_csv(PATH + '/train.csv')
df_train.columns = ['index', 'comment', 'rate']

# Evaluation
df_eval = pd.read_csv(PATH + '/eval.csv')
df_eval.columns = ['index', 'comment', 'rate']

# Test
df_test = pd.read_csv(PATH + '/test.csv')
df_test.columns = ['index', 'comment', 'rate']

# Create Lables
label_encoder = LabelEncoder()
# Y
train_y = label_encoder.fit_transform((df_train['rate'] >= 0).astype(int))
eval_y = label_encoder.fit_transform((df_eval['rate'] >= 0).astype(int))
test_y = label_encoder.fit_transform((df_test['rate'] >= 0).astype(int))

### Preprocess

In [68]:
normalizer = Normalizer() # Hazm normlizer
lemmatize = Lemmatizer().lemmatize # Hazm lemmatizer
stem = Stemmer().stem # Hazm Stemmer
clean_lemmatize = lambda comment:[lemmatize(word) if '#' not in lemmatize(word) else word for word in comment]
clean_stem = lambda comment:[stem(word) for word in comment]

symbols_complete_reg = re.compile(r"(\d|\"|'ٍ|¬|[؛“،,”‘۔’’‘–]|[|\.÷+\]\[\)\(\:\-\?»\=\{}\*«»_…\؟!/ـ]|[۰'ٓ۫'ٔ]|[ٓٔ]|[ًٌٍْﹼ،َُِّ«ٰ»ٖء])")

def remeove_arabic(text):
    # remove arabic alphabet
    mapping = {
        u"ۀ" : u"ه",
        u"ة" : u"ت",
        u"ي" : u"ی",
        u"ؤ" : u"و",
        u"إ" : u"ا",
        u"ٹ" : u"ت",
        u"ڈ" : u"د",
        u"ئ" : u"ی",
        u"ﻨ" : u"ن",
        u"ﺠ" : u"ج",
        u"ﻣ" : u"م",
        u"ﷲ" : u"",
        u"ﻳ" : u"ی",
        u"ٻ" : u"ب",
        u"ٱ" : u"ا",
        u"ڵ" : u"ل",
        u"ﭘ" : u"پ",
        u"ﻪ" : u"ه",
        u"ﻳ" : u"ی",
        u"ٻ" : u"ب",
        u"ں" : u"ن",
        u"ٶ" : u"و",
        u"ٲ" : u"ا",
        u"ہ" : u"ه",
        u"ﻩ" : u"ه",
        u"ﻩ" : u"ه",
        u"ك" : u"ک",
        u"ﺆ" : u"و",
        u"أ" : u"ا",
        u"ﺪ" : u"د"
    }
    arabic_keys =  re.compile(r"(" + "|".join(mapping.keys()) + r")")
    return arabic_keys.sub(lambda x: mapping[x.group()], text)


# clean_text function
def clean_comment(text, allspace=True, punc=True, sentence=True, only_persian=True):
    #remove halph space, new line ('\n') and '\r'
    text = text.replace('\u200c', ' ').replace('\n', '').replace('\r', '')
    # remove punctuations
    text = re.sub(symbols_complete_reg, "", text)
    # remove arabic letters
    text = remeove_arabic(text)
    # convert spaces to a one space and delete leading and trailing spaces
    text = re.sub("(\s)+", " ", text)
    text = text.strip()
    #lemmatize
    " ".join(clean_lemmatize(text.split(" ")))
    #stemming
    " ".join(clean_stem(text.split(" ")))
    # convert spaces to a one space and delete leading and trailing spaces
    text = re.sub("(\s)+", " ", text)
    text = text.strip()
    return text

In [69]:
# X : Features
df_train['clean_comment'] = df_train['comment'].apply(lambda comment:clean_comment(comment))
df_eval['clean_comment'] = df_eval['comment'].apply(lambda comment:clean_comment(comment))
df_test['clean_comment'] = df_test['comment'].apply(lambda comment:clean_comment(comment))

In [70]:
example_id = 500
example = df_train['clean_comment'][example_id]
example

'خیلی عالیه'

## TFIDF and Logistic Regression

### Base Line Accuracy

In [None]:
# accuracy base line
np.sum(test_y)/len(test_y)

0.6941176470588235

### Pipeline 
This pipeline is designed for finding the best parameters in both tfidf and lr model. Finding best parameter is based on accuracy of both models in the validation data.
To aim this we use `RandomizedSearchCV` method. 


In [93]:
# solvers= ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# multi_classes = ['multinomial', 'ovr']

pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(analyzer='word')),
        ('lr', LogisticRegression(random_state=0, solver='liblinear', max_iter=1000, multi_class='ovr'))
    ])

In [107]:
parameters = {
    'lr__C': (0.01, 0.1, 2, 5, 10, 15, 20),
    'lr__penalty': ('l1', 'l2'),
    'tfidf__min_df': (0, 1, 3, 5),
    'tfidf__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'tfidf__max_features': (None, 2000, 8000, 12000, 15000)
}

In [82]:
split_index = [-1] *len(df_train['clean_comment']) + [0] *len(df_eval['clean_comment'])
X_train_eval_joint = list(df_train['clean_comment']) + list(df_eval['clean_comment'])
Y_train_eval_joint = list(train_y) + list(eval_y)

In [108]:
ps = PredefinedSplit(test_fold=split_index)

In [113]:
grid = RandomizedSearchCV(pipeline, parameters, scoring='accuracy', cv=ps)

In [114]:
grid.fit(X_train_eval_joint, Y_train_eval_joint)

RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
                   estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                             ('lr',
                                              LogisticRegression(max_iter=1000,
                                                                 multi_class='ovr',
                                                                 random_state=0,
                                                                 solver='liblinear'))]),
                   param_distributions={'lr__C': (0.01, 0.1, 2, 5, 10, 15, 20),
                                        'lr__penalty': ('l1', 'l2'),
                                        'tfidf__max_features': (None, 2000,
                                                                8000, 12000,
                                                                15000),
                                        'tfidf__min_df': (0, 1, 3, 5),
               

In [140]:
print(f"Best accuracy in the pipline: {grid.best_score_}")

Best accuracy in the pipline: 0.73


In [142]:
""" best f1: 
{'lr__C': 0.1,
 'lr__penalty': 'l1',
 'tfidf__max_features': 12000,
 'tfidf__min_df': 3,
 'tfidf__ngram_range': (1, 1)}
"""

"""
best accuracy:
{'lr__C': 15,
 'lr__penalty': 'l2',
 'tfidf__max_features': 12000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1)}
"""

print("Best parameters in pipeline (accuracy):")
grid.best_params_

Best parameters in pipeline (accuracy):


{'lr__C': 15,
 'lr__penalty': 'l2',
 'tfidf__max_features': 12000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1)}

### TFIDF

In [121]:
vectorizer = TfidfVectorizer(min_df=1, ngram_range = (1,1), max_features=12000)
train_data_features = vectorizer.fit_transform(df_train['clean_comment'])
print(train_data_features.shape)

(800, 4273)


In [122]:
## data snooping ALERT: we should transforom not fit again
eval_data_features = vectorizer.transform(df_eval['clean_comment'])
test_data_features = vectorizer.transform(df_test['clean_comment'])

In [123]:
# show
vectorizer.get_feature_names()[200:210]



['آدمو',
 'آذرماه',
 'آذرگانم',
 'آرام',
 'آردن',
 'آرزو',
 'آروستو',
 'آری',
 'آز',
 'آزاد']

### Logistic Regression

In [124]:
# Load model

model = LogisticRegression(C=15, penalty='l2', random_state=0, solver='liblinear', max_iter=1000, multi_class='ovr')
# Train model
model.fit(train_data_features, train_y)


LogisticRegression(C=15, max_iter=1000, multi_class='ovr', random_state=0,
                   solver='liblinear')

### Evaluation

In [125]:
## evaluation on test data
y_test_pred = model.predict(test_data_features)

In [126]:
# On test data
print('----- Accuracy Score ----- ')
print(accuracy_score(test_y, y_test_pred))
print('----- Confusion Matrix ----- ')
print(confusion_matrix(test_y, y_test_pred))
print('----- Classification Report ----- ')
print(classification_report(test_y, y_test_pred))


----- Accuracy Score ----- 
0.7352941176470589
----- Confusion Matrix ----- 
[[ 18  34]
 [ 11 107]]
----- Classification Report ----- 
              precision    recall  f1-score   support

           0       0.62      0.35      0.44        52
           1       0.76      0.91      0.83       118

    accuracy                           0.74       170
   macro avg       0.69      0.63      0.64       170
weighted avg       0.72      0.74      0.71       170



## Key Features

In [143]:
def top_key_features(vectorizer, model, n_top=30):
    weights = model.coef_
    feature_names = vectorizer.get_feature_names()
    sorted_features = weights[0].argsort()[::-1]
    most_important = sorted_features[:n_top]
    least_important = sorted_features[-n_top:]

    print('Most important words in the class 1: \n')
    for i in most_important:
        print(f"{feature_names[i]}: {weights[0, i]}")

    print('Most important words in the class 2: \n')
    for i in least_important:
        print(f"{feature_names[i]}: {weights[0, i]}")


### Most Important Features (Top 30)

In [144]:
top_key_features(vectorizer, model)

Most important words in the class 1: 

دوربین: 3.23638261564184
نیز: 3.2268381428187576
digikala: 2.5309529271587725
تو: 2.479308604417539
اینکه: 2.4694945869725706
واقعا: 2.4462823368506523
وجود: 2.3987426101027483
همین: 2.3792914670190344
می: 2.326414357430155
هست: 2.282688498825936
نسبت: 2.259165864957199
قشنگ: 2.233477102291063
دستگاه: 2.218611980912459
کفش: 2.1748800026603132
رینگ: 2.0940869189893143
نصب: 2.092673687316176
نبود: 2.086794772131762
شده: 2.0393511467273053
پیش: 2.0336723271515424
نظر: 2.013301657967153
تقریبا: 2.0045345996962887
طعمش: 1.976633655219301
گوشفیل: 1.9611449450543894
کاملا: 1.9409312724655396
شارژ: 1.9182286786984186
موجود: 1.8853972547772857
کنه: 1.8686890072866533
جعبه: 1.8662110840563089
مونده: 1.862958366737985
کار: 1.8509578934361548
Most important words in the class 2: 

ممنون: -2.585981917605968
پیشنهاد: -2.601150890635267
اصن: -2.6590737895816665
ارزه: -2.6865214236215222
میتونست: -2.700873505236188
زود: -2.7349909580529834
قیمتی: -2.7367487787341

