# 1. Aquisição dos Dados

In [28]:
import pandas as pd
import numpy as np

In [29]:
# Load the movie catalog and the user ratings from CSV files in the working
# directory. The annotations document the expected type; the previous chained
# form (`movies = pd.DataFrame = ...`) rebound pandas' DataFrame class itself.
movies: pd.DataFrame = pd.read_csv('movies.csv')
ratings: pd.DataFrame = pd.read_csv('ratings.csv')

# 2. Pré-processamento dos Dados

In [30]:
# Rename the catalog columns positionally and index the catalog by movie id.
# The guard makes the cell safe to re-run: once `movie_id` is the index the
# frame no longer has exactly these three columns, so repeating the rename
# would raise a length-mismatch error.
if movies.index.name != 'movie_id':
    movies.columns = ['movie_id', 'title', 'genres']
    movies = movies.set_index('movie_id')
movies.head()

Unnamed: 0_level_0,title,genres
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [31]:
# Rename the ratings columns positionally to the snake_case names used by the
# rest of the notebook.
ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']

In [32]:
# Per-movie rating statistics, computed in a single grouped pass and aligned
# onto the catalog by movie id (movies without ratings get NaN in both columns).
rating_stats = ratings.groupby("movie_id")["rating"].agg(["count", "mean"])
movies["watched"] = rating_stats["count"]
movies["mean_rating"] = rating_stats["mean"]

In [33]:
# Preview the catalog again to check the newly added `watched` and
# `mean_rating` columns.
movies.head()

Unnamed: 0_level_0,title,genres,watched,mean_rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0,3.92093
2,Jumanji (1995),Adventure|Children|Fantasy,110.0,3.431818
3,Grumpier Old Men (1995),Comedy|Romance,52.0,3.259615
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0,2.357143
5,Father of the Bride Part II (1995),Comedy,49.0,3.071429


# 3. Modelagem (KNN)

## Obter distância entre o usuário e os outros

In [34]:
def get_distances(user_id, min_common=10, ratings_df=None):
    """Rank every other user by rating distance to ``user_id``.

    The distance between two users is the Euclidean norm of the differences
    between their ratings on the movies both have rated. It is not normalised
    by the number of shared movies, so pairs with more movies in common tend
    to have larger distances.

    Parameters
    ----------
    user_id
        Id of the user to compare against everyone else.
    min_common : int, default 10
        Minimum number of movies two users must share; users with fewer
        movies in common are left out of the result.
    ratings_df : pandas.DataFrame, optional
        Table with ``user_id``, ``movie_id`` and ``rating`` columns. Defaults
        to the notebook-level ``ratings`` frame.

    Returns
    -------
    list of [user_id, other_user_id, distance]
        One entry per comparable user, sorted by ascending distance. Empty
        when ``user_id`` has no ratings or nobody shares enough movies.
    """
    data = ratings if ratings_df is None else ratings_df
    triples = data[["user_id", "movie_id", "rating"]]

    # The target user's ratings never change inside the loop: filter them once.
    target_ratings = triples.loc[triples["user_id"] == user_id, ["movie_id", "rating"]]
    if target_ratings.empty:
        return []

    distances = []
    # A single groupby pass replaces re-scanning the whole table for each user;
    # sort=False keeps users in order of first appearance.
    for other_id, other_rows in triples.groupby("user_id", sort=False):
        if other_id == user_id:
            continue

        movies_in_common = pd.merge(
            target_ratings,
            other_rows[["movie_id", "rating"]],
            on="movie_id",
            suffixes=("_user1", "_user2"),
        )
        if len(movies_in_common) < min_common:
            continue

        distance = np.linalg.norm(
            movies_in_common["rating_user1"] - movies_in_common["rating_user2"]
        )
        distances.append([user_id, other_id, distance])

    distances.sort(key=lambda entry: entry[2])
    return distances

## Obter as recomendações com base nos K usuários mais próximos

In [38]:
def get_recommendations(user_id, k=15, min_popularity=5, min_recomendations=3):
    """Recommend unseen movies to ``user_id`` from its k nearest neighbours.

    Neighbours come from ``get_distances(user_id)`` (closest first). Every
    movie rated by one of the first ``k`` neighbours is a candidate unless the
    user has already rated it, it is missing from the ``movies`` catalog, or
    its ``watched`` count in ``movies`` is below ``min_popularity``.

    Parameters
    ----------
    user_id
        Id of the user to recommend for.
    k : int, default 15
        Number of nearest neighbours to consult.
    min_popularity : int, default 5
        Minimum ``watched`` value a movie needs to be considered.
    min_recomendations : int, default 3
        Minimum number of consulted neighbours that must have rated a movie.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``movies`` plus ``expected_rating`` (mean rating
        given by the neighbours who rated the movie) and ``recomendations``
        (how many neighbours rated it), sorted by ``expected_rating``
        descending.
    """
    distances = get_distances(user_id)

    # Built once: O(1) membership tests instead of rebuilding the user's list
    # of movies for every candidate.
    seen_movies = set(ratings.loc[ratings["user_id"] == user_id, "movie_id"])
    popularity = movies["watched"]

    neighbor_count = {}
    neighbor_score_sum = {}

    for neighbor in distances[:k]:
        neighbor_id = neighbor[1]
        # Keep the first rating per movie, matching a lookup of the first
        # matching row for each distinct movie id.
        neighbor_ratings = ratings.loc[
            ratings["user_id"] == neighbor_id, ["movie_id", "rating"]
        ].drop_duplicates("movie_id")

        for movie_id, score in zip(neighbor_ratings["movie_id"], neighbor_ratings["rating"]):
            if movie_id in seen_movies:
                continue

            # Ratings may reference movies absent from the catalog; skip them
            # instead of failing with a KeyError.
            if movie_id not in popularity.index:
                continue

            if popularity.at[movie_id] < min_popularity:
                continue

            neighbor_count[movie_id] = neighbor_count.get(movie_id, 0) + 1
            neighbor_score_sum[movie_id] = neighbor_score_sum.get(movie_id, 0.0) + score

    recommendations = [
        [movie_id, neighbor_score_sum[movie_id] / count, count]
        for movie_id, count in neighbor_count.items()
        if count >= min_recomendations
    ]
    recommendations.sort(key=lambda entry: entry[1], reverse=True)

    # Explicit copy so the new columns are added to an independent frame and
    # never to a view of `movies`.
    recommendations_df = movies.loc[[entry[0] for entry in recommendations]].copy()
    recommendations_df["expected_rating"] = [entry[1] for entry in recommendations]
    recommendations_df["recomendations"] = [entry[2] for entry in recommendations]

    # Same final sort as before (default algorithm) so tie ordering is unchanged.
    return recommendations_df.sort_values("expected_rating", ascending=False)

## Gerar as recomendações para um usuário e exibir as 10 melhores

In [39]:
# Recommend for user 9 from its 10 nearest neighbours: a movie must have been
# rated by at least 3 of them and have a `watched` count of at least 5.
recommendations = get_recommendations(
    user_id=9,
    k=10,
    min_popularity=5,
    min_recomendations=3,
)
recommendations.head(10)


Unnamed: 0_level_0,title,genres,watched,mean_rating,expected_rating,recomendations
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022,4.928571,7
1247,"Graduate, The (1967)",Comedy|Drama|Romance,79.0,4.063291,4.833333,3
778,Trainspotting (1996),Comedy|Crime|Drama,102.0,4.039216,4.7,5
2288,"Thing, The (1982)",Action|Horror|Sci-Fi|Thriller,45.0,3.933333,4.7,5
2951,"Fistful of Dollars, A (Per un pugno di dollari...",Action|Western,39.0,3.935897,4.7,5
1222,Full Metal Jacket (1987),Drama|War,102.0,4.098039,4.6875,8
1201,"Good, the Bad and the Ugly, The (Buono, il bru...",Action|Adventure|Western,72.0,4.145833,4.666667,6
1732,"Big Lebowski, The (1998)",Comedy|Crime,106.0,3.924528,4.666667,6
3000,Princess Mononoke (Mononoke-hime) (1997),Action|Adventure|Animation|Drama|Fantasy,48.0,3.958333,4.666667,3
31658,Howl's Moving Castle (Hauru no ugoku shiro) (2...,Adventure|Animation|Fantasy|Romance,40.0,4.075,4.666667,3
