In [94]:
import pandas as pd
import numpy as np
# Load the raw reviews: a tab-separated file without a header row, so pandas
# assigns integer column labels (0, 1, 2, ...).
data = pd.read_csv(
    "reviews.csv",
    sep="\t",
    header=None,
)

In [95]:
# Replace the integer column labels with descriptive names, listed in
# positional order.
data = data.rename(
    columns={
        0: "Name",
        1: "Rating",
        2: "Date",
        3: "Review",
    }
)

In [96]:
# Sanity check: preview the first five rows of the renamed frame.
data.head(n=5)

Unnamed: 0,Name,Rating,Date,Review
0,Skype,3,2021-01-12T17:06:39+00:00,It shows my credit in Danish Corona but I live...
1,Skype,1,2021-01-10T16:58:00+00:00,Disgusting. Skype effectively operated as a sc...
2,Skype,3,2020-12-28T11:57:59+00:00,"Outdated.\n\n Simply an outdated platform, it ..."
3,Skype,1,2020-12-27T14:29:41+00:00,When Skype came out I thought it was a great a...
4,Skype,5,2020-12-20T21:05:07+00:00,My favorite app for business calls.


In [97]:
# Collapse the rating into three sentiment classes:
#   0 = negative (ratings 1-2), 1 = neutral (rating 3), 2 = positive (ratings 4-5).
# Values that are not keys of the mapping are left unchanged by `replace`.
rating_to_sentiment = {
    1: 0,
    2: 0,
    3: 1,
    4: 2,
    5: 2,
}
data['Sentiment'] = data['Rating'].replace(rating_to_sentiment)

In [149]:
# Keep only the two columns the model needs (review text and its sentiment
# label), then preview the result.
data = data.loc[:, ['Review', 'Sentiment']]
data.head()

Unnamed: 0,Review,Sentiment
0,It shows my credit in Danish Corona but I live...,1
1,Disgusting. Skype effectively operated as a sc...,0
2,"Outdated.\n\n Simply an outdated platform, it ...",1
3,When Skype came out I thought it was a great a...,0
4,My favorite app for business calls.,2


In [150]:
# Random ~80/20 train/validation split.
# Each row independently lands in the training set with probability 0.8, so
# the split sizes are approximate and the original row order is preserved
# (nothing is actually shuffled).
# The generator is seeded inside the cell so the split -- and every score
# derived from it -- is identical on each run, including after a kernel restart.
rng = np.random.default_rng(42)
msk = rng.random(len(data)) < 0.8
train = data[msk]
test = data[~msk]
# Persist both splits; the DataFrame index is written as the first CSV column.
train.to_csv('training.csv')
test.to_csv('valid.csv')

In [151]:
# Separate the inputs (review text) from the targets (sentiment label)
# for the training and the held-out split.
X_train, y_train = train['Review'], train['Sentiment']
X_test, y_test = test['Review'], test['Sentiment']

In [152]:
# Count 
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train['Review'])
X_train_counts.shape

(1022, 4613)

In [193]:
# Re-weight the raw term counts with TF-IDF.
# TfidfTransformer is imported here because no earlier cell brings it into
# scope; without this import the cell raises NameError on a fresh kernel.
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the IDF weights on the training counts only; the same fitted
# transformer must be reused (transform, not fit) on any held-out data.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape  # same shape as the count matrix; only the values change

(1022, 4613)

In [194]:
# Grid-search the Multinomial Naive Bayes hyperparameters on the TF-IDF
# training matrix, using GridSearchCV's default cross-validation and scoring.
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Single-step pipeline; the step name 'mnb' is the prefix used in the
# parameter grid keys below ('mnb__<param>').
mnb_pipeline = Pipeline([
    ('mnb', MultinomialNB()),
])
grid_params = {
    # Five evenly spaced smoothing values: 0.5, 0.75, 1.0, 1.25, 1.5.
    # The previous call passed a stray fourth positional argument (10), which
    # numpy binds to `endpoint`; being truthy it produced this same grid, so
    # dropping it changes nothing except readability.
    'mnb__alpha': np.linspace(0.5, 1.5, num=5),
    # Whether to learn class priors from the data (True) or use a uniform prior.
    'mnb__fit_prior': [True, False],
}
clf = GridSearchCV(mnb_pipeline, grid_params)
clf.fit(X_train_tfidf, y_train)
# best_score_ is the mean cross-validated score of the best parameter setting.
print("Best Score: ", clf.best_score_)
print("Best Params: ", clf.best_params_)

Best Score:  0.8415064562410329
Best Params:  {'mnb__alpha': 0.5, 'mnb__fit_prior': False}


In [195]:
# Train the final classifier on the whole training matrix.
# NOTE(review): alpha=0.5 / fit_prior=False are hard-coded -- presumably copied
# from the grid-search printout; re-check them if the data or split changes.
# `fit` returns the estimator itself, so construction and training are chained.
clf = MultinomialNB(alpha=0.5, fit_prior=False).fit(X_train_tfidf, y_train)
clf


MultinomialNB(alpha=0.5, fit_prior=False)

In [196]:
# Evaluate on the held-out split.
# Ground-truth labels as a plain array.
y_true = test['Sentiment'].values

# Encode the held-out reviews with the vectorizer and TF-IDF transformer that
# were fitted on the training data (transform only -- nothing is refitted).
X_test_counts = count_vect.transform(test['Review'])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Predicted sentiment class for every held-out review.
y_pred = clf.predict(X_test_tfidf)

In [197]:
# Report hold-out performance: overall accuracy, support-weighted F1 across
# the three sentiment classes, and the confusion matrix.
from dmba import classificationSummary
from sklearn.metrics import accuracy_score, f1_score

accuracy = accuracy_score(y_true, y_pred)
weighted_f1 = f1_score(y_true, y_pred, average='weighted')

print ('Accuracy:', accuracy)
print ('F1 score:', weighted_f1)

# Prints the confusion matrix (rows = actual class, columns = predicted class).
classificationSummary(y_true, y_pred)

Accuracy: 0.9296296296296296
F1 score: 0.920088835121515
Confusion Matrix (Accuracy 0.9296)

       Prediction
Actual   0   1   2
     0 178   0   2
     1   9   4   1
     2   7   0  69
