In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

# Importing dataset
dataset = pd.read_csv("ratings_small.csv")

# Convert categorical variable to numeric
dataset["ratingNew"]=np.where(dataset["rating"]<=2.5,0,1)
dataset=dataset[[
    "userId",
    "movieId",
    "ratingNew"
]].dropna(axis=0, how='any') 

X = dataset.iloc[:, 0:2].values  
y = dataset.iloc[:, 2].values

print("Values of rating after conversion: ")
print(y)
    
# Split dataset in training and test datasets
X_train, X_test = train_test_split(dataset, test_size=0.3, random_state=int(time.time()))

Values of rating after conversion: 
[0 1 1 ... 1 0 1]


In [2]:
# Instantiate the classifier
gnb = GaussianNB()
features =[
     "userId",
     "movieId",
     "ratingNew"
]

# Train classifier
gnb.fit(
    X_train[features].values,
    X_train["ratingNew"]
)
predict_y = gnb.predict(X_test[features])

# Print results
print("Number of mislabeled points out of a total {}\n points : {}\n performance {:05.2f}%"
      .format(
          X_test.shape[0],
          (X_test["ratingNew"] != predict_y).sum(),
          100*(1-(X_test["ratingNew"] != predict_y).sum()/X_test.shape[0])
))

Number of mislabeled points out of a total 30002
 points : 5127
 performance 82.91%


In [3]:
print("Showing the predicted values for ratingNew: ")
print(predict_y)

Showing the predicted values for ratingNew: 
[1 1 1 ... 1 1 1]


In [6]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.30)

# precision, recall, f1-score, support
print(confusion_matrix(y_test,predict_y.round()))  
print(classification_report(y_test,predict_y.round()))  
print("Accuracy: ")
print(round(accuracy_score(y_test,predict_y.round())*100,2),"%")

[[  129  5310]
 [  524 24039]]
              precision    recall  f1-score   support

           0       0.20      0.02      0.04      5439
           1       0.82      0.98      0.89     24563

   micro avg       0.81      0.81      0.81     30002
   macro avg       0.51      0.50      0.47     30002
weighted avg       0.71      0.81      0.74     30002

Accuracy: 
80.55 %
