In [49]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [76]:
# Data directory. The trailing slash is stripped so that later cells can
# append '/<file name>' without producing a double slash.
PATH = 'data/'.rstrip('/')

In [77]:
# Read the training and evaluation splits from the data directory.
df_train, df_eval = (
    pd.read_csv(PATH + csv_name) for csv_name in ('/train.csv', '/eval.csv')
)

# Give both frames the same three column names so the cells below can
# address the comment text and its rating uniformly.
for frame in (df_train, df_eval):
    frame.columns = ['index', 'comment', 'rate']

df_train.head(5)

Unnamed: 0,index,comment,rate
0,2587,پردازنده های Core i5 و Core i3 نیز ذاتا دو هست...,0.0
1,22591,سلام به دوستای عزیزم \nعزاداری هاتون قبول باشه,1.0
2,141037,کلا پولتون رو دور نریزیزد,-1.0
3,58593,از صمیم قلب امیدوارم دایانا با کارن بمونه و پو...,1.0
4,5712,آنطور که اپل ادعا می کند آیپاد شافل دارای طراح...,1.0


In [78]:
# Summarise the training frame: column dtypes, non-null counts and memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 3 columns):
index      800 non-null int64
comment    800 non-null object
rate       800 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 18.8+ KB


In [79]:
# Summary statistics (count, mean, std, quartiles) of the training frame's numeric columns.
df_train.describe()

Unnamed: 0,index,rate
count,800.0,800.0
mean,35701.64625,0.278625
std,41159.897273,0.712302
min,20.0,-1.0
25%,5162.0,0.0
50%,16922.0,0.5
75%,53861.75,1.0
max,171764.0,1.0


In [94]:
## Label
# Binarise the rating: rate >= 0 (so a neutral 0.0 is included) becomes 1,
# a negative rate becomes 0.
y_train = df_train['rate'].ge(0).astype(int)
y_eval = df_eval['rate'].ge(0).astype(int)
y_train.head(2)

0    1
1    1
Name: rate, dtype: int32

In [95]:
# Accuracy baseline: the score obtained by always predicting the most frequent
# training label. Taking the largest class share (instead of the share of
# label 1) keeps the figure correct even when label 0 is the majority class.
y_train.value_counts(normalize=True).max()

0.7575

In [96]:
## Text cleaning (TODO: none of the steps listed below is implemented yet)
### lemmatize
### arabic  (NOTE(review): presumably normalising Arabic character variants; confirm)
### punctuation
### stop words

# For now the raw comment column is passed through unchanged.
cleaned_train_data = df_train['comment']
cleaned_eval_data = df_eval['comment']

In [97]:
# Preview the first two training comments that will be vectorised.
cleaned_train_data.head(2)

0    پردازنده های Core i5 و Core i3 نیز ذاتا دو هست...
1       سلام به دوستای عزیزم \nعزاداری هاتون قبول باشه
Name: comment, dtype: object

In [98]:
# TF-IDF over single-word tokens. Terms that occur in more than 95% of the
# training comments are ignored and the vocabulary is capped at 10,000 terms.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    min_df=1,
    max_df=0.95,
    max_features=10000,
)

# Learn the vocabulary and IDF weights from the training comments only and
# turn them into a document-term matrix.
train_data_features = vectorizer.fit_transform(cleaned_train_data)

# (number of training comments, vocabulary size)
print(train_data_features.shape)


(800, 4522)


In [99]:
## Data-snooping alert: only transform here, never fit again. The vectorizer
## must keep the vocabulary and IDF weights it learned from the training
## comments, otherwise information from the evaluation set leaks into the features.

eval_data_features = vectorizer.transform(cleaned_eval_data)

In [100]:
# Peek at the first 1000 vocabulary terms learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out(); use the new accessor when it exists
# and fall back to the old one on older installations. `.tolist()` keeps the
# displayed value a plain Python list, as before.
(
    vectorizer.get_feature_names_out().tolist()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)[:1000]

['10',
 '100',
 '1000',
 '1080',
 '1080p',
 '114gr',
 '12',
 '122x87x93',
 '125',
 '13',
 '1300',
 '1332',
 '1345',
 '14',
 '1440پیکسل',
 '15',
 '1600',
 '165',
 '1650',
 '170',
 '179',
 '17gr',
 '18',
 '1920x1200',
 '1ghz',
 '20',
 '200',
 '2000',
 '2005',
 '23',
 '25',
 '297ppi',
 '2و3و4',
 '2گيگ',
 '30',
 '300',
 '3170',
 '32',
 '36',
 '3g',
 '3mm',
 '3mos',
 '3ماهه',
 '40',
 '41',
 '45',
 '4540s',
 '480x854',
 '4g',
 '4k',
 '4mm',
 '4s',
 '4نفری',
 '50',
 '5100',
 '512',
 '520mx',
 '55',
 '580',
 '5ماه',
 '5مگاپيکسلي',
 '60',
 '600',
 '6000',
 '60d',
 '63',
 '6300',
 '64',
 '650d',
 '700',
 '720p',
 '77',
 '78000',
 '80',
 '800',
 '8000',
 '800x480',
 '808',
 '820',
 '83',
 '87',
 '8mgp',
 '920',
 '97',
 '98',
 'aalii',
 'accent',
 'accutype',
 'acer',
 'android',
 'aperture',
 'apple',
 'asus',
 'attack',
 'audio',
 'auto',
 'aux',
 'avi',
 'back',
 'batoning',
 'bay',
 'beats',
 'besiar',
 'black',
 'blade',
 'bood',
 'bordam',
 'bravia',
 'bridge',
 'bsi',
 'camera',
 'canon',
 

In [101]:
# Build and train the classifier.

# Candidate settings kept for experimentation; neither list is consumed in this cell.
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
multi_classes = ['multinomial', 'ovr']

# L2-regularised logistic regression with the liblinear solver and a fixed seed.
# `multi_class` is deliberately not passed: the argument is deprecated in recent
# scikit-learn releases, and with solver='liblinear' on binary labels the fit is
# one-vs-rest regardless, so the trained model is the same as with
# multi_class='ovr'.
# NOTE(review): the recorded evaluation shows recall 0.05 for class 0; consider
# trying class_weight='balanced' to counter the class imbalance.
model = LogisticRegression(
    penalty='l2',
    solver='liblinear',
    max_iter=1000,
    random_state=0,
)

# Train on the TF-IDF features of the training comments; the fitted estimator
# is the cell's displayed value.
model.fit(train_data_features, y_train)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [102]:
## Evaluation
# Predict a label for every evaluation comment from its TF-IDF features.
# The name `y_eval_perd` is kept as spelled because the metrics cell reads it.

y_eval_perd = model.predict(eval_data_features)

In [103]:
# Print each evaluation metric under its banner, comparing the true
# evaluation labels with the model's predictions.
for banner, metric in (
    ('----- Accuracy Score ----- ', accuracy_score),
    ('----- Confusion Matrix ----- ', confusion_matrix),
    ('----- Classification Report ----- ', classification_report),
):
    print(banner)
    print(metric(y_eval, y_eval_perd))


----- Accuracy Score ----- 
0.73
----- Confusion Matrix ----- 
[[  3  52]
 [  2 143]]
----- Classification Report ----- 
             precision    recall  f1-score   support

          0       0.60      0.05      0.10        55
          1       0.73      0.99      0.84       145

avg / total       0.70      0.73      0.64       200

