# 1. Aquisição dos Dados

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw movie catalogue and the user ratings from CSV files in the
# working directory.
# NOTE: the previous form `movies = pd.DataFrame = pd.read_csv(...)` was a
# chained assignment that overwrote the `pd.DataFrame` class with a DataFrame
# instance, breaking any later `pd.DataFrame(...)` call. Type annotations are
# used instead.
movies: pd.DataFrame = pd.read_csv('movies.csv')
ratings: pd.DataFrame = pd.read_csv('ratings.csv')

# 2. Pré-processamento dos Dados

In [3]:
# Give the movie table snake_case column names (assigned by position) and
# index it by movie id so per-movie statistics can be aligned on that index.
movies = (
    movies
    .set_axis(['movie_id', 'title', 'genres'], axis=1)
    .set_index('movie_id')
)
movies.head()

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
# Rename the ratings columns (by position) to snake_case names; the rest of
# the notebook refers to "user_id", "movie_id" and "rating".
ratings_column_names = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings.columns = ratings_column_names

In [5]:
# Per-movie rating statistics, computed with a single group-by instead of two.
rating_stats = ratings.groupby("movie_id")["rating"].agg(["count", "mean"])

# Number of ratings each movie received. Movies nobody rated get 0 rather than
# NaN, which also keeps the column an integer count instead of a float.
movies["watched"] = rating_stats["count"].reindex(movies.index, fill_value=0)

# Average rating; aligned on the movie_id index, so unrated movies stay NaN
# (there is no meaningful mean for them).
movies["mean_rating"] = rating_stats["mean"]

In [6]:
# Preview the first five rows, now including the "watched" and "mean_rating" columns.
movies.head(5)

Unnamed: 0_level_0,title,genres,watched,mean_rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0,3.92093
2,Jumanji (1995),Adventure|Children|Fantasy,110.0,3.431818
3,Grumpier Old Men (1995),Comedy|Romance,52.0,3.259615
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0,2.357143
5,Father of the Bride Part II (1995),Comedy,49.0,3.071429


# 3. Modelagem (KNN)

## Obter distância entre o usuário e os outros

In [7]:
def get_distances(user_id, min_common=10, ratings_df=None):
    """Compute the rating distance between ``user_id`` and every other user.

    The distance is the Euclidean norm of the difference between the two
    users' ratings over the movies both of them rated. It is not normalised
    by the number of shared movies, so pairs with many movies in common tend
    to get larger distances.

    Parameters
    ----------
    user_id
        Id of the user to compare against everyone else.
    min_common : int, default 10
        Minimum number of movies in common required for a neighbour to be
        considered; users sharing fewer movies are skipped.
    ratings_df : pandas.DataFrame, optional
        Table with ``user_id``, ``movie_id`` and ``rating`` columns. Defaults
        to the notebook-level ``ratings`` DataFrame.

    Returns
    -------
    list of [user_id, other_user_id, distance]
        One entry per eligible neighbour, sorted by ascending distance. Ties
        keep the order in which users first appear in the ratings table.
        Empty if the user has no ratings.
    """
    if ratings_df is None:
        ratings_df = ratings  # fall back to the notebook-level table

    # Loop-invariant: the target user's ratings are filtered only once.
    target_ratings = ratings_df.loc[
        ratings_df["user_id"] == user_id, ["movie_id", "rating"]
    ]

    distances = []
    if target_ratings.empty:
        # Nothing to compare against, so no neighbour can be ranked.
        return distances

    # groupby(sort=False) yields users in order of first appearance, matching
    # the iteration order of ratings["user_id"].unique(), while avoiding a
    # full boolean scan of the table for every single user.
    for other_id, other_rows in ratings_df.groupby("user_id", sort=False):
        if other_id == user_id:
            continue

        movies_in_common = pd.merge(
            target_ratings,
            other_rows[["movie_id", "rating"]],
            on="movie_id",
            suffixes=("_user1", "_user2"),
        )

        # Too few shared movies makes the distance unreliable.
        if len(movies_in_common) < min_common:
            continue

        distance = np.linalg.norm(
            movies_in_common["rating_user1"] - movies_in_common["rating_user2"]
        )
        distances.append([user_id, other_id, distance])

    # list.sort is stable, so equal distances keep first-appearance order.
    distances.sort(key=lambda entry: entry[2])
    return distances

## Obter as recomendações com base nos K usuários mais próximos

In [33]:
def get_recommendations(user_id, k=15, min_popularity=5, min_recomendations=3):
    """Recommend movies to ``user_id`` based on the K nearest users.

    The ``k`` closest users (as ranked by ``get_distances``) are scanned and,
    for every movie the target user has not rated, the neighbours' ratings
    are averaged.

    Parameters
    ----------
    user_id
        Id of the user receiving recommendations.
    k : int, default 15
        Number of nearest neighbours to take into account.
    min_popularity : int, default 5
        Minimum value of the movie's ``watched`` column (in the notebook-level
        ``movies`` table) for the movie to be eligible.
    min_recomendations : int, default 3
        Minimum number of neighbours that must have rated a movie for it to
        be recommended.

    Returns
    -------
    list of [movie_id, expected_rating]
        Sorted by descending expected rating, where the expected rating is
        the plain (unweighted) mean of the neighbours' ratings.

    Notes
    -----
    Reads the notebook-level ``ratings`` and ``movies`` DataFrames. Movies
    that are missing from the ``movies`` index are skipped instead of raising
    ``KeyError``.
    """
    distances = get_distances(user_id)

    # Set membership is O(1); the ids are collected once instead of
    # rebuilding a unique() array for every candidate movie.
    seen_movies = set(ratings.loc[ratings["user_id"] == user_id, "movie_id"])
    popularity = movies["watched"]

    movie_neighbors_count = {}
    movie_neighbors_score = {}

    for neighbor in distances[:k]:
        neighbor_id = neighbor[1]
        # Keep only the first rating per movie so a neighbour is counted at
        # most once for each movie.
        neighbor_movies = ratings.loc[
            ratings["user_id"] == neighbor_id, ["movie_id", "rating"]
        ].drop_duplicates("movie_id")

        # Walk the rows once rather than masking the frame for every movie.
        for movie_id, neighbor_movie_score in zip(
            neighbor_movies["movie_id"], neighbor_movies["rating"]
        ):
            if movie_id in seen_movies:
                continue

            watched = popularity.get(movie_id)
            if watched is None or watched < min_popularity:
                # Not in the catalogue, or not popular enough.
                continue

            movie_neighbors_count[movie_id] = movie_neighbors_count.get(movie_id, 0) + 1
            movie_neighbors_score[movie_id] = (
                movie_neighbors_score.get(movie_id, 0) + neighbor_movie_score
            )

    recommendations = [
        [movie_id, movie_neighbors_score[movie_id] / count]
        for movie_id, count in movie_neighbors_count.items()
        if count >= min_recomendations
    ]

    # Stable sort: equal scores keep the order in which movies were first seen.
    recommendations.sort(key=lambda item: item[1], reverse=True)
    return recommendations

## Transformar num dataframe

In [36]:
# Recommendations for user 1 as [movie_id, expected_rating] pairs.
recomendations = get_recommendations(1, k=15, min_popularity=50, min_recomendations=5)

recommended_ids = [movie_id for movie_id, _ in recomendations]
expected_ratings = [score for _, score in recomendations]

# Look up the catalogue rows of the recommended movies and attach the expected
# rating. The chain builds a new frame, avoiding both `inplace=True` and
# assigning a column onto the result of a `.loc[...]` selection.
recomendacoes = (
    movies.loc[recommended_ids]
    .assign(expected_rating=expected_ratings)
    .sort_values("expected_rating", ascending=False)
)

recomendacoes.head(10)


Unnamed: 0_level_0,title,genres,watched,mean_rating,expected_rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022,4.833333
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0,3.970982,4.7
1221,"Godfather: Part II, The (1974)",Crime|Drama,129.0,4.25969,4.6
858,"Godfather, The (1972)",Crime|Drama,192.0,4.289062,4.5
32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,177.0,3.983051,4.1
