# 1. Aquisição dos Dados

In [68]:
import pandas as pd
import numpy as np

In [69]:
# Load the raw MovieLens-style tables from the working directory.
# NOTE: these are type annotations (`name: type = value`). The previous
# chained assignment (`name = pd.DataFrame = value`) rebound `pd.DataFrame`
# itself to a DataFrame instance, breaking any later `pd.DataFrame(...)` call.
movies: pd.DataFrame = pd.read_csv('movies.csv')
ratings: pd.DataFrame = pd.read_csv('ratings.csv')

# 2. Pré-processamento dos Dados

In [70]:
# Rename the three raw columns positionally and index the table by movie_id.
# The guard keeps the cell idempotent: once `movie_id` is the index, the frame
# no longer has the three raw columns, so re-assigning the names would raise a
# length-mismatch error on re-run.
if movies.index.name != 'movie_id':
    movies.columns = ['movie_id', 'title', 'genres']
    movies = movies.set_index('movie_id')
movies.head()

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [71]:
# Rename the four ratings columns positionally; `movie_id` matches the key
# used to index `movies`.
ratings.columns = [
    'user_id',
    'movie_id',
    'rating',
    'timestamp',
]

In [72]:
# Per-movie rating statistics. Both series are indexed by movie_id, so the
# assignments below align them with the `movies` index; movies that have no
# ratings end up with NaN in both columns.
ratings_by_movie = ratings.groupby("movie_id")["rating"]
movies["watched"] = ratings_by_movie.count()      # number of ratings received
movies["mean_rating"] = ratings_by_movie.mean()   # average rating received

In [73]:
# Preview the first rows of `movies`, now including `watched` and `mean_rating`.
movies.head()

Unnamed: 0_level_0,title,genres,watched,mean_rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0,3.92093
2,Jumanji (1995),Adventure|Children|Fantasy,110.0,3.431818
3,Grumpier Old Men (1995),Comedy|Romance,52.0,3.259615
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0,2.357143
5,Father of the Bride Part II (1995),Comedy,49.0,3.071429


# 3. Modelagem (KNN)

## Obter a distância entre o usuário-alvo e os demais usuários

In [74]:
def get_distances(user_id, min_common=10):
    """Rank every other user by rating distance to ``user_id``.

    The distance between two users is the Euclidean norm of the difference
    between their ratings over the movies both of them rated. Reads the
    module-level ``ratings`` frame (columns ``user_id``, ``movie_id``,
    ``rating``).

    Args:
        user_id: Id of the user everyone else is compared against.
        min_common: Minimum number of co-rated movies required for a user to
            be considered; users sharing fewer movies are skipped.

    Returns:
        A list of ``[user_id, other_user_id, distance]`` entries sorted by
        ascending distance. Empty when ``user_id`` has no ratings.

    Note:
        The norm is not normalised by the number of co-rated movies, so users
        with many movies in common tend to get larger distances.
    """
    distances = []

    # Loop-invariant: the target user's ratings are extracted exactly once.
    target_ratings = ratings.loc[ratings["user_id"] == user_id, ["movie_id", "rating"]]
    if target_ratings.empty:
        return distances

    # One pass splits `ratings` per user instead of a full boolean scan for
    # every user. sort=False keeps users in order of first appearance, which
    # together with the stable sort below fixes the order of ties.
    for other_id, other_rows in ratings.groupby("user_id", sort=False):
        if other_id == user_id:
            continue

        movies_in_common = pd.merge(
            target_ratings,
            other_rows[["movie_id", "rating"]],
            on="movie_id",
            suffixes=("_user1", "_user2"),
        )

        # Too few shared movies makes the distance unreliable; skip the user.
        if len(movies_in_common) < min_common:
            continue

        distance = np.linalg.norm(
            movies_in_common["rating_user1"] - movies_in_common["rating_user2"]
        )
        distances.append([user_id, other_id, distance])

    distances.sort(key=lambda entry: entry[2])
    return distances

## Obter as recomendações com base nos K usuários mais próximos

In [75]:
def get_recommendations(user_id, k=15, min_popularity=5):
    """Collect movies rated by the ``k`` nearest users that ``user_id`` has not rated.

    Neighbours come from ``get_distances(user_id)`` (closest first). Reads the
    module-level ``ratings`` frame and the ``watched`` column of ``movies``
    (indexed by movie id).

    Args:
        user_id: Id of the user to recommend for.
        k: Number of nearest neighbours to take movies from.
        min_popularity: Movies whose ``watched`` count is below this value
            are skipped.

    Returns:
        A list of ``[movie_id, neighbor_id]`` pairs, in neighbour order. A
        movie appears once for every neighbour that rated it, so the list may
        contain the same movie several times. The neighbour's rating value is
        not taken into account.
    """
    # Built once: the original re-derived this for every candidate movie.
    seen_movie_ids = set(ratings.loc[ratings["user_id"] == user_id, "movie_id"])

    recommendations = []

    for _, neighbor_id, _ in get_distances(user_id)[:k]:
        neighbor_movie_ids = ratings.loc[
            ratings["user_id"] == neighbor_id, "movie_id"
        ].unique()

        for movie_id in neighbor_movie_ids:
            if movie_id in seen_movie_ids:
                continue

            # Scalar lookup; a NaN count compares False and is therefore kept.
            if movies.at[movie_id, "watched"] < min_popularity:
                continue

            recommendations.append([movie_id, neighbor_id])

    return recommendations

## Transformar as recomendações em um DataFrame

In [82]:
# Recommend for user 3: each entry is a [movie_id, neighbor_id] pair.
recommended_pairs = get_recommendations(3)

# Look the recommended ids up in `movies`. Ids are kept as returned, so a
# movie suggested by several neighbours shows up once per neighbour.
recommended_ids = [movie_id for movie_id, _neighbor_id in recommended_pairs]
recomendations = movies.loc[recommended_ids]
recomendations.head(10)

Unnamed: 0_level_0,title,genres,watched,mean_rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0,3.92093
2,Jumanji (1995),Adventure|Children|Fantasy,110.0,3.431818
6,Heat (1995),Action|Crime|Thriller,102.0,3.946078
10,GoldenEye (1995),Action|Adventure|Thriller,132.0,3.496212
12,Dracula: Dead and Loving It (1995),Comedy|Horror,19.0,2.421053
16,Casino (1995),Crime|Drama,82.0,3.926829
18,Four Rooms (1995),Comedy,20.0,3.7
19,Ace Ventura: When Nature Calls (1995),Comedy,88.0,2.727273
21,Get Shorty (1995),Comedy|Crime|Thriller,89.0,3.494382
32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,177.0,3.983051
