In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors

In [2]:
# Auxiliary tables read from the ml-latest directory, loaded in the same order
# as before (genome scores, genome tags, links, tags).
genome_scores, genome_tags, links, tags = map(
    pd.read_csv,
    (
        'ml-latest/genome-scores.csv',
        'ml-latest/genome-tags.csv',
        'ml-latest/links.csv',
        'ml-latest/tags.csv',
    ),
)

In [3]:
# Main inputs: the ratings table and a separately stored movies table.
ratings = pd.read_csv('ml-latest/ratings.csv')
movies = pd.read_csv("movies_FINAL.csv")

# Headline sizes: total rating rows, and the number of distinct movie / user
# IDs that actually occur in the ratings table.
n_ratings = ratings.shape[0]
n_movies = ratings['movieId'].unique().size
n_users = ratings['userId'].unique().size

In [4]:
# Dataset overview: raw sizes, then the mean number of ratings per user and
# per movie, each rounded to two decimals.
summary_lines = (
    f"Number of ratings: {n_ratings}",
    f"Number of unique movieId's: {n_movies}",
    f"Number of unique users: {n_users}",
    f"Average ratings per user: {round(n_ratings/n_users, 2)}",
    f"Average ratings per movie: {round(n_ratings/n_movies, 2)}",
)
print(*summary_lines, sep="\n")

Number of ratings: 27753444
Number of unique movieId's: 53889
Number of unique users: 283228
Average ratings per user: 97.99
Average ratings per movie: 515.01


In [5]:
# Number of ratings submitted by each user, as a two-column frame
# (userId, n_ratings).
user_freq = (
    ratings
    .groupby('userId')['movieId']
    .count()
    .rename('n_ratings')
    .reset_index()
)
user_freq.head()

Unnamed: 0,userId,n_ratings
0,1,16
1,2,15
2,3,11
3,4,736
4,5,72


In [6]:
# Mean rating of every movie (one row per movieId).
mean_rating = ratings.groupby('movieId')[['rating']].mean()

# movieId with the lowest / highest mean rating (idxmin/idxmax return the
# first such movieId when several movies tie).
lowest_rated = mean_rating['rating'].idxmin()
highest_rated = mean_rating['rating'].idxmax()

# Keep the lookups in named variables: a bare expression in the middle of a
# notebook cell is evaluated and then silently discarded.
lowest_rated_movie = movies.loc[movies['movieId'] == lowest_rated]
highest_rated_movie = movies.loc[movies['movieId'] == highest_rated]
lowest_rated_votes = ratings[ratings['movieId'] == lowest_rated]
highest_rated_votes = ratings[ratings['movieId'] == highest_rated]

# Show how many ratings each extreme is actually based on.
print(f"Lowest rated movieId {lowest_rated}: {len(lowest_rated_votes)} rating(s)")
print(f"Highest rated movieId {highest_rated}: {len(highest_rated_votes)} rating(s)")

## Author's note: the extremes above are backed by very little data, so a
## Bayesian average is planned. This cell only prepares its inputs (rating
## count and mean per movie); the average itself is not computed here.
movie_stats = ratings.groupby('movieId')[['rating']].agg(['count', 'mean'])
movie_stats.columns = movie_stats.columns.droplevel()

In [7]:
from scipy.sparse import csr_matrix

def create_matrix(df):
    """Build a sparse user-by-movie rating matrix and the ID/index lookups.

    Args:
        df: DataFrame with ``userId``, ``movieId`` and ``rating`` columns,
            one row per rating.

    Returns:
        A tuple ``(X, user_mapper, movie_mapper, user_inv_mapper,
        movie_inv_mapper)`` where

        * ``X`` is a ``csr_matrix`` of shape ``(N, M)`` (N distinct users,
          M distinct movies) holding the ratings; row/column order follows
          the sorted distinct IDs,
        * ``user_mapper`` / ``movie_mapper`` map an ID to its row / column
          index,
        * ``user_inv_mapper`` / ``movie_inv_mapper`` map a row / column index
          back to the ID.
    """
    # A single np.unique call per ID column gives both the sorted distinct IDs
    # and, through return_inverse, every row's position in that sorted list.
    # This replaces repeated unique() calls and a per-row dictionary lookup.
    unique_users, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    unique_movies, movie_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)

    N = len(unique_users)   # number of users
    M = len(unique_movies)  # number of movies

    # Map IDs to indices.
    user_mapper = dict(zip(unique_users, range(N)))
    movie_mapper = dict(zip(unique_movies, range(M)))

    # Map indices back to IDs (the same pairs, reversed).
    user_inv_mapper = dict(zip(range(N), unique_users))
    movie_inv_mapper = dict(zip(range(M), unique_movies))

    # Rows are users, columns are movies. If the same (user, movie) pair
    # appears more than once, scipy sums those entries when building the matrix.
    X = csr_matrix((df["rating"].to_numpy(), (user_index, movie_index)), shape=(N, M))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [8]:
# Build the user-by-movie rating matrix and its ID <-> index lookup tables
# from the full ratings table.
(
    X,
    user_mapper,
    movie_mapper,
    user_inv_mapper,
    movie_inv_mapper,
) = create_matrix(ratings)

In [9]:
def find_similar_users(user_id, X, k, metric='cosine', show_distance=False):
    """Return the ``k`` users whose rating vectors are closest to ``user_id``'s.

    Relies on the module-level ``user_mapper`` / ``user_inv_mapper`` lookups
    that translate between user IDs and row indices of ``X``.

    Args:
        user_id: ID of the query user; must be a key of ``user_mapper``.
        X: Rating matrix with one row per user.
        k: Number of neighbours wanted. Fewer are returned when ``X`` does
            not contain that many other users.
        metric: Distance metric handed to ``NearestNeighbors``.
        show_distance: When true, return ``(user_id, distance)`` pairs
            instead of bare user IDs.

    Returns:
        A list of user IDs ordered from nearest to farthest, or a list of
        ``(user_id, distance)`` tuples when ``show_distance`` is true. The
        query user is never part of the result.

    Raises:
        KeyError: If ``user_id`` is not present in ``user_mapper``.
    """
    user_ind = user_mapper[user_id]
    user_vec = X[user_ind].reshape(1, -1)

    # The query user is part of the fitted data, so ask for one extra
    # neighbour, but never for more rows than X has.
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)

    # Always fetch distances so both return shapes come from one query.
    distances, indices = kNN.kneighbors(user_vec, return_distance=True)

    # Drop the query user by index rather than by position: when another user
    # has an identical rating vector, the query user is not guaranteed to be
    # ranked first.
    neighbours = [
        (user_inv_mapper[ind], dist)
        for ind, dist in zip(indices[0].tolist(), distances[0].tolist())
        if ind != user_ind
    ][:k]

    if show_distance:
        return neighbours
    return [neighbour_id for neighbour_id, _ in neighbours]

In [10]:
# movie_titles = dict(zip(movies['movieId'], movies['title']))

In [11]:
# movie_id = 112
# similar_ids = find_similar_movies(movie_id, X, k=10)
# movie_title = movie_titles[movie_id]
# print(f"Since you watched {movie_title}\n")
# for i in similar_ids:
#     print(movie_titles[i])

In [12]:
# Demo: list the 10 users most similar to user 1.
user_id = 1
similar_ids = find_similar_users(user_id, X, k=10)
print("Your movie preferences are similar to users:\n")
# One neighbour ID per line, in the order returned by find_similar_users.
for i in similar_ids:
    print(i)

Your movie preferences are similar to users:

69380
24743
279985
47864
245536
89732
269173
49501
264168
70866


Szybka weryfikacja:

In [13]:
# Ratings given by user 1, indexed by movieId, with only the rating column kept.
rates_1 = (
    ratings[ratings["userId"] == 1]
    .drop(['timestamp', 'userId'], axis=1)
    .set_index('movieId')
)
rates_1

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
307,3.5
481,3.5
1091,1.5
1257,4.5
1449,4.5
1590,2.5
1591,1.5
2134,4.5
2478,4.0
2840,3.0


In [14]:
# Ratings given by user 69380 (the first neighbour listed for user 1 above),
# indexed by movieId, with only the rating column kept.
rates_69380 = (
    ratings[ratings["userId"] == 69380]
    .drop(['timestamp', 'userId'], axis=1)
    .set_index('movieId')
)
rates_69380

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
88,3.0
481,3.5
637,0.5
1100,4.5
1347,2.0
1350,4.0
1911,2.0
2109,4.0
2126,2.0
2471,3.5


In [15]:
# Movies rated by both users, with the two ratings side by side.
rates_1.join(
    rates_69380,
    on='movieId',
    how='inner',
    lsuffix='_1',
    rsuffix='_69380',
)

Unnamed: 0_level_0,rating_1,rating_69380
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
481,3.5,3.5
2478,4.0,4.0
3020,4.0,3.5
3424,4.5,3.0
3698,3.5,5.0


Wygląda ok :)

#### Funkcja zwracająca rekomendacje filmów

In [24]:
# Lookup table movieId -> title, built from the movies table.
movie_titles = {
    movie_id: title
    for movie_id, title in zip(movies['movieId'], movies['title'])
}

def recommend_movies(user_id, k=10, min_rating=5):
    """Recommend movies that the users most similar to ``user_id`` rated highly.

    Uses the module-level ``ratings`` table, rating matrix ``X``,
    ``movie_titles`` lookup and ``find_similar_users``.

    Args:
        user_id: ID of the user to recommend for.
        k: Number of similar users whose ratings are pooled.
        min_rating: Minimum mean rating (across the similar users who rated
            the movie) for a movie to be recommended.

    Returns:
        A list of ``(movieId, title)`` tuples ordered by descending mean
        rating. Movies ``user_id`` has already rated are excluded, as are
        movies that have no entry in ``movie_titles``.
    """
    similar_ids = find_similar_users(user_id, X, k=k)

    # Mean rating per movie among the similar users. A single membership mask
    # replaces one full scan of the ratings table per neighbour.
    neighbour_rates = ratings.loc[ratings['userId'].isin(similar_ids), ['movieId', 'rating']]
    rates = (
        neighbour_rates
        .groupby(['movieId'], as_index=False)
        .mean()
        .sort_values(by=['rating'], ascending=False)
    )

    # Movies the user has already rated are not worth recommending.
    already_rated = ratings.loc[ratings['userId'] == user_id, 'movieId']

    result = rates[rates['rating'] >= min_rating]
    result = result[~result['movieId'].isin(already_rated)]

    # The ratings table may reference movies missing from the movies table;
    # skip those instead of failing with a KeyError.
    return [
        (mid, movie_titles[mid])
        for mid in result['movieId']
        if mid in movie_titles
    ]


In [25]:
# Demo: recommendations for user 1.
user_id = 1
recommendations = recommend_movies(user_id)
print(f"User {user_id}, you might like movies:")
# Each entry is a (movieId, title) tuple.
for movie in recommendations:
    print(movie)

User 1, you might like movies:
(60, 'Indian in the Cupboard, The')
(1077, 'Sleeper')
(99114, 'Django Unchained')
(37240, 'Why We Fight')
(318, 'Shawshank Redemption, The')
(2329, 'American History X')
(65, 'Bio-Dome')
(1945, 'On the Waterfront')
(1476, 'Private Parts')


In [26]:
# Demo: recommendations for user 10.
user_id = 10
recommendations = recommend_movies(user_id)
print(f"User {user_id}, you might like movies:")
# Each entry is a (movieId, title) tuple.
for movie in recommendations:
    print(movie)

User 10, you might like movies:
(2528, "Logan's Run")
(1949, 'Man for All Seasons, A')
(308, 'Three Colors: White (Trzy kolory: Bialy)')
(1615, 'Edge, The')
(3801, 'Anatomy of a Murder')
(1185, 'My Left Foot')
(3730, 'Conversation, The')
(1914, 'Smoke Signals')
(1178, 'Paths of Glory')
(123, 'Chungking Express (Chung Hing sam lam)')
(1734, 'My Life in Pink (Ma vie en rose)')
(1147, 'When We Were Kings')
(3476, "Jacob's Ladder")
(154, 'Beauty of the Day (Belle de jour)')
(1719, 'Sweet Hereafter, The')
(3462, 'Modern Times')
(3435, 'Double Indemnity')
(3424, 'Do the Right Thing')
(3091, 'Kagemusha')
(3100, 'River Runs Through It, A')
(3364, 'Asphalt Jungle, The')
(1059, "William Shakespeare's Romeo + Juliet")
(1303, 'Man Who Would Be King, The')
(1633, "Ulee's Gold")
(1394, 'Raising Arizona')
(1411, 'Hamlet')
(1933, 'Life of Emile Zola, The')
(307, 'Three Colors: Blue (Trois couleurs: Bleu)')
(3030, 'Yojimbo')
(1211, 'Wings of Desire (Himmel über Berlin, Der)')
(2682, 'Limbo')
(2691, "Le