In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
pd.options.display.max_columns = None

### Read dataset file

In [2]:
data = pd.read_csv('dataset/movies.csv', sep=",")
data_ratings = pd.read_csv('dataset/ratings.csv', sep=",")

### Converts the list of genres to columns

In [3]:
allGenres = ["Action","Adventure","Animation","Children","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir",
          "Horror","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western"]
for x in range(0, len(allGenres)):
    data[allGenres[x]] = 0
    
for i in range(0, len(data.iloc[:, 2])):
    genres = data.iloc[i, 2].split("|")
    for j in range(0, len(genres)):
        if(genres[j] != "IMAX" and genres[j] != "(no genres listed)"):
            data.at[i, genres[j]] = 1

In [4]:
data = data.drop('genres', axis=1)
data_ratings = data_ratings.drop('timestamp', axis=1)

 Converts columns that are strings into numbers and store the labels in the labels array
 the index of the labels array corresponds to the column number

In [5]:
labels = {}
for x in range(0, len(data.columns)):
    if type(data.iloc[:, x][0]) == str:
        data.iloc[:, x], labels[data.columns[x]] = pd.factorize(data.iloc[:, x])
    else:
        labels[data.columns[x]] = None
        

In [7]:
data.head()

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
1,2,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,3,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
4,5,4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
data_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,25
1,1,1029,30
2,1,1061,30
3,1,1129,20
4,1,1172,40


In [6]:
data_merge = pd.merge(data,data_ratings)
data_merge.head()

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,userId,rating
0,1,0,0,5,5,5,5,0,0,0,5,0,0,0,0,0,0,0,0,0,7,30
1,1,0,0,5,5,5,5,0,0,0,5,0,0,0,0,0,0,0,0,0,9,40
2,1,0,0,5,5,5,5,0,0,0,5,0,0,0,0,0,0,0,0,0,13,50
3,1,0,0,5,5,5,5,0,0,0,5,0,0,0,0,0,0,0,0,0,15,20
4,1,0,0,5,5,5,5,0,0,0,5,0,0,0,0,0,0,0,0,0,19,30


### Choose training and testing set
Randomly split dataset in order to avoid overfitting and obtain a more realistc precision metric

In [7]:
from sklearn.model_selection import train_test_split
y = data_merge['rating']

In [8]:
data_merge = data_merge.drop('rating', axis=1)

In [9]:
y.shape

(100004,)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(data_merge, y, test_size=0.4, random_state=0)

# Apply the KNNeighbors to train the machine

In [42]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)

In [43]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [44]:
from sklearn.metrics import accuracy_score
knn.predict(X_test)

array([40, 20, 15, ..., 25, 10, 20])

In [45]:
accuracy_score(y_test, knn.predict(X_test))

0.19624018799060047

In [46]:
from sklearn.model_selection import cross_val_predict

predicted = cross_val_predict(knn,data_merge,y,cv=10)
accuracy_score(y,predicted)

0.024879004839806408

In [None]:
title = 0
genres = 0
Action = 0
Adventure = 0
Animation = 0
Children = 0
Comedy = 0
Crime = 0
Documentary = 0
Drama = 0
Fantasy = 0
Film_Noir = 0
Horror = 0
Musical = 0
Mystery = 0
Romance = 0
Sci_Fi = 0
Thriller = 0
War = 0
Western = 0
infos = [title,genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western]
knn.predict([infos])[0]

# Testes com Logistical Regression

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
lr = LogisticRegression()

In [18]:
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
lr_results = lr.predict(X_test)
lr_results[:5]

array([40, 40, 40, 40, 40])

In [None]:
accuracy_score(y_test, lr_results)