In [6]:
import pandas as pd
import numpy as np
from typing import *
from IPython.display import display, HTML, Markdown

import warnings
# Silences every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas warnings raised by the helpers below —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')


def display_best_and_worse_recommendations(recommendations: pd.DataFrame, count: int = 10):
    """Render the highest- and lowest-scored recommendations as two tables.

    Args:
        recommendations: Frame containing an 'Estimated Prediction' column.
            The display labels are applied by position, so the frame is
            expected to hold exactly two columns: the prediction first and
            the movie title second. The frame itself is never modified.
        count: Number of rows shown in each table (defaults to 10).

    Raises:
        KeyError: If 'Estimated Prediction' is not a column of the frame.
        ValueError: If the frame does not have exactly two columns.
    """
    # Sort into a new frame: an in-place sort would silently reorder the
    # caller's data as a side effect of merely displaying it.
    ranked = recommendations.sort_values('Estimated Prediction', ascending=False)

    # Copy each slice before relabelling so the two tables share no state
    # with `ranked` or with each other.
    top_recommendations = ranked.head(count).copy()
    top_recommendations.columns = ['Prediction (sorted by best)', 'Movie Title']

    # Rows stay in descending order, so the very worst prediction is the last row.
    worse_recommendations = ranked.tail(count).copy()
    worse_recommendations.columns = ['Prediction (sorted by worse)', 'Movie Title']

    display(HTML("<h1>Recommendations your user will love</h1>"))
    display(top_recommendations)

    display(HTML("<h1>Recommendations your user will hate</h1>"))
    display(worse_recommendations)
    

def load_movies_dataset(path: str = 'datasets/ml-100k/u.item') -> pd.DataFrame:
    """Load the pipe-separated movie catalogue into a DataFrame.

    Args:
        path: Location of the headerless, '|'-separated item file. Defaults
            to the copy under ``datasets/ml-100k``.

    Returns:
        A frame indexed by ``movie_id`` with the title, release dates, url
        and one column per genre flag. ``release_date`` is converted to
        datetimes.
    """
    movie_data_columns = [
        'movie_id', 'title', 'release_date', 'video_release_date', 'url',
        'unknown', 'Action', 'Adventure', 'Animation', "Children's",
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
        'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
        'War', 'Western',
    ]

    movie_data = pd.read_csv(
        path,
        sep='|',
        # The file is read as Latin-1 text rather than pandas' default UTF-8.
        encoding="ISO-8859-1",
        header=None,
        names=movie_data_columns,
        index_col='movie_id',
    )
    movie_data['release_date'] = pd.to_datetime(movie_data['release_date'])
    return movie_data

def load_ratings(path: str = 'datasets/ml-100k/u.data') -> pd.DataFrame:
    """Load the tab-separated ratings file as (user_id, movie_id, rating) rows.

    Args:
        path: Location of the headerless, tab-separated ratings file.
            Defaults to the copy under ``datasets/ml-100k``.

    Returns:
        A frame with the columns ``user_id``, ``movie_id`` and ``rating``;
        the file's fourth column (``timestamp``) is not loaded.
    """
    # Reading only the needed columns (instead of slicing them out of a wider
    # frame afterwards) hands callers a frame they can freely modify without
    # pandas flagging it as a copy of another frame.
    return pd.read_csv(
        path,
        sep='\t',
        encoding="ISO-8859-1",
        header=None,
        names=['user_id', 'movie_id', 'rating', 'timestamp'],
        usecols=['user_id', 'movie_id', 'rating'],
    )

def load_ratings_with_name(random_state: Optional[int] = None) -> pd.DataFrame:
    """Return the ratings with readable user labels and movie titles, shuffled.

    Args:
        random_state: Seed forwarded to ``DataFrame.sample`` so the shuffle
            can be reproduced. ``None`` (the default) keeps the previous
            behaviour of a different order on every call.

    Returns:
        A frame with the columns ``user_id`` (formatted as ``"User <id>"``),
        ``movie_title`` and ``rating``, with rows in shuffled order.
    """
    ratings_data = load_ratings()
    movies_data = load_movies_dataset()

    # assign() builds a new frame instead of writing into the one returned
    # by load_ratings().
    labelled_ratings = ratings_data.assign(
        user_id=ratings_data['user_id'].map(lambda k: f"User {k}")
    )

    ratings_and_movies = (
        labelled_ratings
        .set_index('movie_id')
        .join(movies_data['title'])
        .reset_index()
        .rename(columns={'title': 'movie_title'})
    )

    # frac=1 samples every row, i.e. a full shuffle.
    return ratings_and_movies[['user_id', 'movie_title', 'rating']].sample(
        frac=1, random_state=random_state
    )
    

In [7]:
# Load the (user_id, movie_id, rating) rows once; a later cell reuses `ratings`.
ratings = load_ratings()
ratings

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [4]:
# Pull the rating column out as a plain NumPy array.
# NOTE(review): this cell's execution count (4) is lower than that of the cell
# defining `ratings` (7) — re-run the notebook top to bottom to confirm the order.
ratings['rating'].values

array([3, 3, 1, ..., 1, 2, 3])

In [9]:
# Same ratings, but with "User <id>" labels and movie titles in place of the
# numeric ids; the rows come back in shuffled order.
ratings_with_name = load_ratings_with_name()
ratings_with_name

Unnamed: 0,user_id,movie_title,rating
13799,User 848,Dances with Wolves (1990),5
59989,User 862,Forbidden Planet (1956),5
67280,User 85,Annie Hall (1977),5
85636,User 497,Manhattan Murder Mystery (1993),3
79002,User 624,McHale's Navy (1997),2
...,...,...,...
54488,User 712,Houseguest (1994),3
80014,User 269,Singin' in the Rain (1952),2
13299,User 312,Terminator 2: Judgment Day (1991),5
40855,User 345,Chasing Amy (1997),4
