In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Show every column when a DataFrame is displayed (None = no column limit).
pd.set_option("display.max_columns", None)

### Read dataset file

In [2]:
# Load the movie table and the ratings table.
# A comma is already read_csv's default separator, so it is not passed explicitly.
data = pd.read_csv('dataset/movies.csv')
data_ratings = pd.read_csv('dataset/ratings.csv')

### Convert the pipe-separated genre list into one 0/1 column per genre

In [3]:
# Genres that each become one 0/1 indicator column. "IMAX" and
# "(no genres listed)" are intentionally absent, so they never get a column.
allGenres = ["Action", "Adventure", "Animation", "Children", "Comedy", "Crime",
             "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
             "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]

# Split the pipe-separated `genres` strings into indicator columns in a single
# vectorized pass instead of a Python loop over every row.
# reindex() keeps exactly the columns listed in allGenres: tags outside the
# list are discarded (the old loop would have silently created a new, mostly
# NaN column for them), and listed genres that never occur become all-zero.
genre_flags = (
    data["genres"]
    .str.get_dummies(sep="|")
    .reindex(columns=allGenres, fill_value=0)
)

# Assign column by column; the assignment aligns on the row index, so it does
# not rely on `data` having a default 0..n-1 index like the positional loop did.
for genre in allGenres:
    data[genre] = genre_flags[genre]

In [4]:
# The raw genre string is now redundant with the indicator columns, and the
# rating timestamp is not used as a feature.
data = data.drop(columns=['genres'])
data_ratings = data_ratings.drop(columns=['timestamp'])

### Convert string columns into integer codes
The original values of each encoded column are stored in the `labels` dict under the column's name (`None` for columns that were left unchanged).

In [5]:
# Integer-encode every string column and remember how to decode it.
# `labels` maps column name -> the unique original values returned by
# pd.factorize (labels[col][code] is the original string), or None when the
# column was not encoded.
labels = {}
for column in data.columns:
    # Decide by dtype rather than by peeking at the element labelled 0, which
    # fails on an empty frame or on a frame without a 0 index label.
    if pd.api.types.is_string_dtype(data[column]):
        codes, uniques = pd.factorize(data[column])
        # Assign by column name so the column is replaced outright; writing
        # through data.iloc[:, x] may set values in place on recent pandas and
        # leave the column with object dtype.
        data[column] = codes
        labels[column] = uniques
    else:
        labels[column] = None
        

In [6]:
# Preview the first five rows of the encoded movie table.
data.head(5)

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
1,2,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,3,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
4,5,4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


### Select the ratings given by user 547

In [7]:
# Keep only the rows rated by user 547; the rest of the notebook models this
# single user's preferences.
is_target_user = data_ratings['userId'] == 547
data_ratings = data_ratings[is_target_user]
data_ratings.head()

Unnamed: 0,userId,movieId,rating
78433,547,1,1
78434,547,6,0
78435,547,7,0
78436,547,11,1
78437,547,14,1


In [8]:
# removing userId column since all the data is from one user
data_ratings = data_ratings.drop('userId', axis=1)

# Attach the user's rating to each rated movie. The join key is spelled out:
# without `on=`, merge joins on every column name the two frames happen to
# share, so adding or renaming a column would silently change the result.
# The inner join keeps only movies this user has rated.
data_merge = pd.merge(data, data_ratings, on='movieId', how='inner')
data_merge.head()

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,rating
0,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,6,5,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
2,7,6,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,11,10,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1
4,14,13,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1


### Choose training and testing set
Randomly split the dataset so the model is evaluated on data it was not trained on; this exposes overfitting and gives a more realistic accuracy estimate.

In [9]:
from sklearn.model_selection import train_test_split
# Target vector: the user's rating for each merged movie row.
y = data_merge.loc[:, 'rating']

In [10]:
# Build the feature matrix: remove the target and the raw movie identifier.
# NOTE(review): the factorized `title` code stays in as a feature; presumably
# it behaves like a near-unique row id rather than a meaningful signal —
# confirm whether it should be dropped as well.
data_merge = data_merge.drop(['rating', 'movieId'], axis=1)

In [11]:
# Sanity check: number of rated movies available for training and testing.
y.shape

(2391,)

In [12]:
# Hold out 40% of the rated movies for testing; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_merge,
    y,
    test_size=0.4,
    random_state=0,
)

# Train a k-nearest neighbors (KNN) classifier

In [13]:
from sklearn.neighbors import KNeighborsClassifier
# Classifier that votes among the 3 nearest training samples.
knn = KNeighborsClassifier(n_neighbors=3)

In [14]:
# Fit the KNN model on the training split.
knn.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [15]:
# Predicted rating class for every held-out movie.
knn.predict(X=X_test)

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,

In [16]:
from sklearn.metrics import accuracy_score
# Share of held-out movies whose rating class KNN predicted correctly.
knn_test_predictions = knn.predict(X_test)
accuracy_score(y_test, knn_test_predictions)

0.722048066875653

In [17]:
from sklearn.model_selection import cross_val_predict

# 3-fold cross-validated predictions over the full feature matrix: every row
# is predicted by a model that did not see it during fitting.
# NOTE(review): the recorded score (~0.45) is far below the hold-out score
# (~0.72); presumably the folds are not shuffled and interact badly with the
# ordered `title` code feature — confirm.
predicted = cross_val_predict(estimator=knn, X=data_merge, y=y, cv=3)
accuracy_score(y_true=y, y_pred=predicted)

0.45420326223337515

In [18]:
# Hand-built feature vector for one hypothetical movie.
# `title` is a factorized title code; the remaining names are the 0/1 genre
# flags (Film_Noir / Sci_Fi stand in for the "Film-Noir" / "Sci-Fi" columns).
title = Action = Adventure = Animation = Children = Comedy = Crime = 0
Documentary = Drama = Fantasy = Film_Noir = Horror = Musical = 1
Mystery = Romance = Sci_Fi = Thriller = War = Western = 0

# The order must match the column order of the training matrix.
infos = [
    title,
    Action, Adventure, Animation, Children, Comedy, Crime,
    Documentary, Drama, Fantasy, Film_Noir, Horror, Musical,
    Mystery, Romance, Sci_Fi, Thriller, War, Western,
]

# predict() expects a 2-D input, so wrap the single sample in a list and
# unwrap the single prediction.
knn.predict([infos])[0]

1

# Applying Bernoulli Naive Bayes

In [19]:
from sklearn.naive_bayes import BernoulliNB
# Bernoulli Naive Bayes classifier, with the alpha parameter set explicitly to 1.0.
bnb = BernoulliNB(alpha=1.0)

In [20]:
# Fit the Naive Bayes model on the same training split used for KNN.
bnb.fit(X=X_train, y=y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [None]:
# Predict the rating class for every held-out movie.
# The model must be given the feature matrix (X_test); the previous call
# passed the label vector y_test, which is not valid input for predict().
bnb.predict(X_test)

In [21]:
# Share of held-out movies whose rating class Naive Bayes predicted correctly.
bnb_test_predictions = bnb.predict(X_test)
accuracy_score(y_test, bnb_test_predictions)

0.7638453500522466

In [22]:
# 3-fold cross-validated accuracy for Naive Bayes over the full feature matrix.
predicted = cross_val_predict(estimator=bnb, X=data_merge, y=y, cv=3)
accuracy_score(y_true=y, y_pred=predicted)

0.7461313258051024

# Tests with Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
# Logistic regression classifier with the library's default settings.
lr = LogisticRegression()

In [25]:
# Fit the logistic regression model on the training split.
lr.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
# Keep the held-out predictions so the accuracy cell can reuse them.
# NOTE(review): the recorded output is all 1s, so the model presumably always
# predicts the majority class — confirm before trusting its accuracy.
lr_results = lr.predict(X=X_test)
lr_results

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [27]:
# Share of held-out movies whose rating class logistic regression predicted correctly.
accuracy_score(y_true=y_test, y_pred=lr_results)

0.7669801462904912