In [33]:
import pandas as pd
from nested_lookup import nested_lookup
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,classification,accuracy_score

### function

In [34]:
stopwords = []
file = open('stopwords.txt', encoding='utf-8').read()
[stopwords.append(x) for x in file.split()]
stopwords = set(stopwords)

In [35]:
def remove_general_stopwords(text):
    filtered_tokens = [token for token in text.split() if token not in stopwords]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

### vectorize

In [36]:
def vectorize_tfidf(df_X):
    tf_idf_vectorizer = TfidfVectorizer(lowercase=False)
    tf_idf_X = tf_idf_vectorizer.fit_transform(df_X)
    tf_idf_df = pd.DataFrame(data=tf_idf_X.toarray(),columns=[tf_idf_vectorizer.get_feature_names()])
    return tf_idf_df, tf_idf_vectorizer

### train_test

In [37]:
def split_data(X,y,test_size):
    return train_test_split(X, y, test_size=test_size)

### define and run model

In [38]:
def define_and_run_model(model,X_train,X_test,y_train):
    clf = model
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return y_pred

### check the result

In [39]:
def show_result(y_test,y_pred):
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test, y_pred))
    print(accuracy_score(y_test,y_pred))

### loading data

In [42]:
with open('sample_dataset.json',encoding='utf-8-sig', errors='ignore') as file:
    json_file = json.load(file)

In [43]:
lst_cmnts = nested_lookup('comment', json_file)
lst_sentiments = nested_lookup('sentiment', json_file)

In [44]:
dic = {}
for i,cmnt in enumerate(lst_cmnts):
    dic[cmnt] = lst_sentiments[i]

In [61]:
df = pd.DataFrame(list(dic.items()), columns=['comment','sentiment'])


In [46]:
dic ={0:'مثبت',1:'منفی'}

In [47]:
dic

{0: 'مثبت', 1: 'منفی'}

In [48]:
df["sentiment"].value_counts()

0    2306
1     822
Name: sentiment, dtype: int64

### remove stopwords

In [49]:
df['comment']=df['comment'].apply(remove_general_stopwords)
df.head()

Unnamed: 0,comment,sentiment
0,بشوری پاک بشه بویه ملایمی داره ... پوست مختلطه...,1
1,خوبه موهای تاثیر زیادی,0
2,نخرید رنگش میاد دست,1
3,بوی داره تمیز میکنه اسکراب قوی,0
4,هفته گرفتم.ظاهرش خوبه بعده دوره لبش شکست.۵ کار...,1


In [50]:
tf_idf_df, tf_idf_model = vectorize_tfidf(df['comment'])
X_train, X_test, y_train, y_test = split_data(tf_idf_df,df['sentiment'],.3)

### handle imbalancde data with random forest

In [51]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=1000).fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
show_result(rfc_pred, y_test)

[[655 148]
 [ 35 101]]
              precision    recall  f1-score   support

           0       0.95      0.82      0.88       803
           1       0.41      0.74      0.52       136

    accuracy                           0.81       939
   macro avg       0.68      0.78      0.70       939
weighted avg       0.87      0.81      0.83       939

0.805111821086262


In [58]:
text = "ممنون از دیجیکالا بابت محصولات خوبشون"

In [59]:
sample = tf_idf_model.transform([text]).toarray()
sentiment = rfc.predict(sample)

In [60]:
print (text,'\n','نتیجه: ',dic[sentiment[0]])


ممنون از دیجیکالا بابت محصولات خوبشون 
 نتیجه:  مثبت
