In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
movies = pd.read_csv('../ml-latest-small/movies.csv')

In [4]:
ratings = pd.read_csv('../ml-latest-small/ratings.csv')

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
selected_movies=pd.merge(movies, ratings, on='movieId', how='right')

In [8]:
selected_movies.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,964981247
2,6,Heat (1995),Action|Crime|Thriller,1,4.0,964982224
3,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1,5.0,964983815
4,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1,5.0,964982931


# Content-Based Filtering

In [9]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(selected_movies['genres'])

def content_based_prediction(movie_id):
    movie_tfidf = tfidf_matrix[movie_id]
    similarity_scores = cosine_similarity(movie_tfidf, tfidf_matrix)
    return similarity_scores

In [10]:
def get_content_based_recommendations(movie_id, top_n=5):
    similarity_scores = content_based_prediction(movie_id)
    similar_movies_idx = np.argsort(similarity_scores[0])[::-1][1:]

    # Проверяем, чтобы movie_id был в пределах размера selected_movies
    if 0 <= movie_id < len(selected_movies):
        movie_title = selected_movies.iloc[movie_id]['title']

        # Фильтруем по условию, что название фильма не равно movie_title
        rec = (selected_movies.iloc[similar_movies_idx]
               .query("title != @movie_title") # Query Language в pandas 
               .drop_duplicates(subset=['movieId'])
               .head(top_n)
               .loc[:, ['title', 'genres']])
               
        return rec
    else:
        print(f"Error: movie_id {movie_id} is out of range.")
        return None

In [11]:
movie_id = 15
recommendations = get_content_based_recommendations(movie_id)

In [12]:
print(f"Рекомендации для фильма '{selected_movies.iloc[movie_id]['title']}':")
recommendations[['title', 'genres']]

Рекомендации для фильма 'Star Wars: Episode IV - A New Hope (1977)':


Unnamed: 0,title,genres
2779,Superman IV: The Quest for Peace (1987),Action|Adventure|Sci-Fi
8393,Superman (1978),Action|Adventure|Sci-Fi
98711,Waterworld (1995),Action|Adventure|Sci-Fi
8392,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Sci-Fi
19691,Stargate (1994),Action|Adventure|Sci-Fi


# Collaborative Filtering

In [13]:
from surprise import Dataset, Reader, KNNBasic

reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(selected_movies[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

In [14]:
# Обучение коллаборативной модели
sim_options = {'name': 'cosine', 'user_based': False}
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f9fb8978a30>

In [15]:
# Функция для предсказания рекомендаций на основе коллаборативной фильтрации
def collaborative_recommendation(userId, top_n=5):
    anti_testset = trainset.build_anti_testset()
    anti_testset = filter(lambda x: x[0] == userId, anti_testset)
    predictions = model.test(anti_testset)
    top_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)[:top_n]
    recommended_movie_ids = [int(prediction.iid) for prediction in top_predictions]
    recommended_movie_names = [movies[movies['movieId'] == movie_id]['title'].values[0] for movie_id in recommended_movie_ids]
    return recommended_movie_names

In [16]:
# Пример использования
userId = 1
collab_rec = collaborative_recommendation(userId)
print(f"Collaborative Recommendation for user {userId}:")
pd.DataFrame(collab_rec)

Collaborative Recommendation for user 1:


Unnamed: 0,0
0,The Jinx: The Life and Deaths of Robert Durst ...
1,Entertaining Angels: The Dorothy Day Story (1996)
2,Broken English (1996)
3,Fullmetal Alchemist: The Sacred Star of Milos ...
4,The Stanford Prison Experiment (2015)


# Hybrid Model

In [17]:
def hybrid_recommendation(userId, movieId):
    # Получаем рекомендации от контентно-коллаборативной модели
    content_rec =  get_content_based_recommendations(movieId)

    # Получаем рекомендации от коллаборативной модели для пользователя
    collab_rec = collaborative_recommendation(userId)

    # Объединяем рекомендации обоих методов
    hybrid_rec = list(set(content_rec).union(set(collab_rec)))

    return hybrid_rec

# Пример использования
userId = 1
movieId = 1
hybrid_rec = hybrid_recommendation(userId, movieId)
print(f"Hybrid Recommendation for user {userId} on movie {movieId}:")
pd.DataFrame(hybrid_rec)


Hybrid Recommendation for user 1 on movie 1:


Unnamed: 0,0
0,Broken English (1996)
1,genres
2,The Jinx: The Life and Deaths of Robert Durst ...
3,The Stanford Prison Experiment (2015)
4,Entertaining Angels: The Dorothy Day Story (1996)
5,title
6,Fullmetal Alchemist: The Sacred Star of Milos ...


In [51]:
def hybrid_recommendation(userId, movie_id, weight_num1=0.3, weight_num2=0.7, top_n=5):
    # Получаем рекомендации от контентно-коллаборативной модели
    content_ = get_content_based_recommendations(movie_id, 1000)
    content_ = content_.drop(['genres'], axis=1)
    content_['num1'] = range(1, len(content_) + 1)
    # print(content_)
    collab_rec = pd.DataFrame(collaborative_recommendation(userId, 1000))
    collab_rec.columns = ['title']
    collab_rec['num2'] = range(1, len(collab_rec) + 1)
    
    merged_df = pd.merge(content_, collab_rec, on='title', how='outer')
    merged_df['sum_num_weighted'] = (merged_df['num1'].fillna(0) * weight_num1) + (merged_df['num2'].fillna(0) * weight_num2)
    
    # Сортируем DataFrame по взвешенному столбцу 'sum_num_weighted'
    merged_df = merged_df.sort_values(by='sum_num_weighted')
    # print(merged_df)
    # merged_df['sum_num'] = merged_df['num1'].fillna(0) + merged_df['num2'].fillna(0)
    merged_df = merged_df.drop(['num1', 'num2'], axis=1)
    # merged_df = merged_df.sort_values(by='sum_num')

    return merged_df[:top_n]

In [52]:
userId = 1
movie_id = 15
hybrid_rec = hybrid_recommendation(userId, movie_id, top_n=10)
print(f"Hybrid Recommendation for user {userId} on movie {movieId} - {selected_movies.iloc[movie_id]['title']} :")
pd.DataFrame(hybrid_rec, columns=['title'])

Hybrid Recommendation for user 1 on movie 1 - Star Wars: Episode IV - A New Hope (1977) :


Unnamed: 0,title
0,Superman IV: The Quest for Peace (1987)
1,Superman (1978)
1000,The Jinx: The Life and Deaths of Robert Durst ...
2,Waterworld (1995)
3,Star Wars: Episode I - The Phantom Menace (1999)
1001,Entertaining Angels: The Dorothy Day Story (1996)
4,Stargate (1994)
5,Lost in Space (1998)
1002,Broken English (1996)
6,Star Wars: Episode VI - Return of the Jedi (1983)
