In [61]:
# Imports
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [62]:
def format_title(title: str) -> str:
    """Normalise a movie title: trim surrounding whitespace, then title-case it."""
    trimmed = title.strip()
    return trimmed.title()

In [63]:
# Load the raw inputs; the two frames are joined on 'item_id' in a later cell.
user_df, movie_title_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/Dataset.csv', 'data/Movie_Id_Titles.csv')
)

In [64]:
# Normalise every title with format_title (trim + title-case); predict() applies the
# same normalisation to user input so both sides compare equal.
movie_title_df['title'] = movie_title_df['title'].map(format_title)

In [65]:
# Join the two frames on their shared 'item_id' column
movies_df = user_df.merge(movie_title_df, on='item_id')

In [66]:
# Keep only users with more than 100 ratings
ratings_per_user = movies_df['user_id'].value_counts()
user_above_100_ratings = ratings_per_user[ratings_per_user > 100].index
movies_df = movies_df[movies_df['user_id'].isin(user_above_100_ratings)]

In [67]:
# Count the ratings each title received -> columns: 'title', 'number_of_ratings'
number_of_ratings = (
    movies_df.groupby('title')['rating']
    .count()
    .rename('number_of_ratings')
    .reset_index()
)

In [68]:
# Attach each title's rating count to every row of that title
movies_df = pd.merge(movies_df, number_of_ratings, on='title')

In [69]:
# Keep only movies with at least 50 ratings
popular_mask = movies_df['number_of_ratings'].ge(50)
movies_df = movies_df[popular_mask]

In [70]:
# Drop duplicate (user_id, title) pairs (pandas keeps the first occurrence by default).
# Rebind instead of using inplace=True: movies_df was produced by boolean filtering, and
# mutating such a frame in place can trigger pandas' SettingWithCopyWarning.
# NOTE(review): 'number_of_ratings' was computed before this de-duplication, so the
# counts used by the >= 50 filter may include duplicate rows - confirm that is intended.
movies_df = movies_df.drop_duplicates(subset=['user_id', 'title'])

In [71]:
# Reshape into a title x user_id table of ratings: one row per movie, one column per user
movies_pivot = pd.pivot_table(
    movies_df,
    values='rating',
    index='title',
    columns='user_id'
)

In [72]:
# (movie, user) pairs with no rating come out of the pivot as NaN; store them as 0
movies_pivot = movies_pivot.fillna(0)
movies_pivot.head(10)

user_id,1,5,6,7,10,11,13,15,16,18,...,921,922,927,932,933,934,936,938,940,943
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101 Dalmatians (1996),2.0,2.0,0.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,2.0,0.0,4.0,0.0,0.0
12 Angry Men (1957),5.0,0.0,4.0,4.0,5.0,0.0,4.0,0.0,5.0,3.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
2 Days In The Valley (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0
"20,000 Leagues Under The Sea (1954)",3.0,0.0,0.0,5.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),4.0,4.0,5.0,5.0,5.0,4.0,5.0,0.0,4.0,3.0,...,0.0,2.0,0.0,5.0,4.0,4.0,0.0,0.0,0.0,0.0
Absolute Power (1997),0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
"Abyss, The (1989)",3.0,0.0,0.0,5.0,4.0,0.0,3.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0
Ace Ventura: Pet Detective (1994),3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,3.0,4.0,2.0,1.0,4.0,0.0,0.0,0.0,4.0
Addams Family Values (1993),0.0,2.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
"Adventures Of Priscilla, Queen Of The Desert, The (1994)",0.0,5.0,0.0,4.0,0.0,3.0,1.0,0.0,0.0,3.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [73]:
# Convert the pivot table's values to a SciPy CSR sparse matrix
movie_sparce = csr_matrix(movies_pivot.to_numpy())

In [74]:
# Create the brute-force nearest-neighbours model and fit it on the movie rows
model = NearestNeighbors(algorithm='brute').fit(movie_sparce)
model  # display the fitted estimator, as the bare `model.fit(...)` expression did

In [75]:
def get_movie_index(movie_name: str) -> int:
    """Return the positional row index of ``movie_name`` in ``movies_pivot``.

    The lookup is an exact match against the pivot table's ``title`` index, so
    the caller must pass the title already normalised (see ``format_title``).

    Args:
        movie_name: Title exactly as it appears in ``movies_pivot.index``.

    Returns:
        The zero-based row position, usable with ``movies_pivot.iloc``.

    Raises:
        KeyError: If no row of ``movies_pivot`` carries that title.
    """
    # Index.get_loc yields a plain position for a unique index (pivot_table produces
    # one row per title), matching the declared ``int`` return type and avoiding
    # two full reset_index() copies per lookup.
    return movies_pivot.index.get_loc(movie_name)

In [76]:
def get_suggestions_movies_index(movie_name: str = None, movie_index: int = None) -> 'list[int] | None':
    """Find the nearest neighbours of one movie row in ``movies_pivot``.

    The movie is selected by ``movie_name`` when it is non-empty, otherwise by
    ``movie_index``. ``movie_name`` takes precedence if both are given.

    Args:
        movie_name: Title to look up via ``get_movie_index`` (exact match).
        movie_index: Positional row index into ``movies_pivot``; 0 is valid.

    Returns:
        The index array produced by ``model.kneighbors`` for the single query
        row (one row of neighbour positions), or ``None`` when neither
        argument identifies a movie.
    """
    if movie_name:
        movie_index = get_movie_index(movie_name=movie_name)
    elif movie_index is None:
        # Compare against None explicitly: a plain truthiness test would
        # wrongly reject row 0.
        return None

    # kneighbors expects a 2-D array of queries; reshape the row to one query.
    query_row = movies_pivot.iloc[movie_index, :].values.reshape(1, -1)
    _, suggestions_index = model.kneighbors(query_row)

    return suggestions_index

In [77]:
def predict(movie_name: str = None, movie_index: int = None) -> list[str]:
    """Suggest movie titles similar to the given movie.

    Args:
        movie_name: Title of the query movie; it is normalised with
            ``format_title`` before lookup, so case and surrounding
            whitespace do not matter.
        movie_index: Positional row index of the query movie in
            ``movies_pivot``; used when ``movie_name`` is not given.

    Returns:
        Titles of the neighbours returned by the model, without the first
        (closest) one; an empty list when no query movie was specified.
    """
    if movie_name:
        # Same normalisation that was applied to the titles in the data.
        movie_name = format_title(movie_name)

    suggestions_index = get_suggestions_movies_index(
        movie_name=movie_name,
        movie_index=movie_index
    )

    # The helper signals "no movie specified" with None; there is nothing to suggest.
    if suggestions_index is None:
        return []

    # Single query -> the result holds one row of neighbour positions.
    suggestions_names = list(movies_pivot.index[suggestions_index[0]])

    # Skip the first neighbour, presumably the query movie itself at distance 0.
    return suggestions_names[1:]

In [78]:
if __name__ == '__main__':
    # Demo: suggestions for the movie at row position 238 of movies_pivot
    suggestions = predict(movie_index=238)
    report = [f'→ {movie}' for movie in suggestions]
    # A '\n' entry prints as a blank gap before and after the list
    for line in ['\n', *report, '\n']:
        print(line)



→ 101 Dalmatians (1996)
→ Muppet Treasure Island (1996)
→ Island Of Dr. Moreau, The (1996)
→ Star Trek V: The Final Frontier (1989)




In [79]:
if __name__ == '__main__':
    # Demo: suggestions looked up by title (predict normalises the casing)
    suggestions = predict(movie_name='absolute power (1997)')
    report = [f'→ {movie}' for movie in suggestions]
    # A '\n' entry prints as a blank gap before and after the list
    for line in ['\n', *report, '\n']:
        print(line)



→ City Hall (1996)
→ Escape From L.A. (1996)
→ Juror, The (1996)
→ Chain Reaction (1996)


