In [None]:
# Install the `surprise` package into the kernel's environment; the imports
# below pull Reader, Dataset, SVD and cross_validate from it.
%pip install surprise

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate


# 1. Recommandation simple

In [None]:
# Load the movie metadata table from the local data directory and preview it.
metadata = pd.read_csv('data/movies_metadata.csv')
metadata.head()

In [None]:
# The raw 'genres' column holds the string form of a list of dicts.
# Step 1: parse each string into Python objects (missing values -> empty list).
parsed_genres = metadata['genres'].fillna('[]').apply(literal_eval)
# Step 2: keep only each genre's name; anything that is not a list becomes [].
metadata['genres'] = parsed_genres.apply(
    lambda entries: [entry['name'] for entry in entries] if isinstance(entries, list) else []
)


In [None]:
# Vote counts of all movies that have one, as integers.
vote_counts = metadata['vote_count'].dropna().astype('int')
# Vote averages are kept as floats: casting them to int would truncate
# (e.g. 7.9 -> 7) and pull the global mean C downwards.
vote_averages = metadata['vote_average'].dropna().astype('float')
# C: mean vote average over all rated movies, used by weighted_rating.
C = vote_averages.mean()
C

In [None]:
# m: vote-count threshold, set at the 95th percentile of vote_counts. It is the
# qualification cutoff below and the weight given to C in weighted_rating.
m = vote_counts.quantile(0.95)
m

In [None]:
# Parse release dates; missing or unparseable values become NaT.
release_dates = pd.to_datetime(metadata['release_date'], errors='coerce')
# Year as a string for valid dates, NaN otherwise. Missing values must be
# detected with pd.notna: a comparison such as `x != np.nan` is always True,
# which would turn NaT rows into the literal string 'NaT'.
metadata['year'] = release_dates.apply(lambda d: str(d.year) if pd.notna(d) else np.nan)

In [None]:
# Movies eligible for the chart: at least m votes and a known vote average.
# A missing vote_count compares False against m, so it is excluded as well.
has_enough_votes = metadata['vote_count'] >= m
has_rating = metadata['vote_average'].notnull()
chart_columns = ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']
# .copy() gives an independent frame, so the column assignments below do not
# act on a slice of metadata (which raises SettingWithCopyWarning).
qualified = metadata.loc[has_enough_votes & has_rating, chart_columns].copy()
qualified['vote_count'] = qualified['vote_count'].astype('int')
# Keep the average as a float: an int cast would truncate 7.9 to 7 and make
# movies with different ratings indistinguishable in the ranking.
qualified['vote_average'] = qualified['vote_average'].astype('float')
qualified.shape

In [None]:
def weighted_rating(x, min_votes=None, mean_rating=None):
    """Blend a movie's own vote average with a global mean, weighted by votes.

    Computes ``v/(v+m) * R + m/(m+v) * C`` where ``v`` is the movie's vote
    count and ``R`` its vote average.

    Parameters
    ----------
    x : mapping or pandas.Series
        Must provide ``'vote_count'`` and ``'vote_average'``.
    min_votes : number, optional
        Threshold ``m``. Defaults to the notebook-level ``m`` when omitted.
    mean_rating : number, optional
        Prior mean ``C``. Defaults to the notebook-level ``C`` when omitted.

    Returns
    -------
    The weighted rating for ``x``.
    """
    # Fall back to the globals at call time so existing calls
    # (e.g. DataFrame.apply(weighted_rating, axis=1)) keep working unchanged,
    # while a per-genre chart can pass its own threshold and mean.
    threshold = m if min_votes is None else min_votes
    prior = C if mean_rating is None else mean_rating
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+threshold) * R) + (threshold/(threshold+v) * prior)

In [None]:
# Score every qualified movie, row by row, and store the result in column 'wr'.
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [None]:
# Rank by weighted rating, best first, then keep the first 250 rows.
ranked = qualified.sort_values(by='wr', ascending=False)
qualified = ranked.iloc[:250]

In [None]:
# Show the 15 best-ranked movies of the chart.
qualified.head(15)

Exercice : écrire une fonction qui permet de construire le tableau pour un genre particulier. Afficher le top 15 des films de romance.

# Content-based filtering

Préparation des données : on ne garde qu'une partie des colonnes (identifiant, titre et overview).

In [None]:
# Work on an independent copy of the three columns used for content-based
# filtering. Without .copy(), the later assignment to df['overview'] acts on a
# slice of metadata and triggers pandas' SettingWithCopyWarning.
df = metadata[['id','title','overview']].copy()
df.head(1)

Création de la matrice TF-IDF

In [None]:
# Replace missing overviews with empty strings before vectorizing.
df['overview'] = df['overview'].fillna('')
# TF-IDF vectorizer that drops English stop words.
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# Fit the vectorizer on the overviews and build the TF-IDF matrix
# (one row per row of df, one column per vocabulary term).
tfidf_matrix = tfidf.fit_transform(df['overview'])
tfidf_matrix.shape

Similarité cosinus

In [None]:
# Pairwise cosine similarity between all overview vectors; when the second
# argument is omitted, scikit-learn compares the matrix with itself.
# NOTE(review): the result is a dense (n_rows x n_rows) array by default —
# confirm it fits in memory for the full dataset.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Similarity of the movie in row 1 to every movie in the matrix.
cosine_sim[1]

Recommandation en utilisant les similarités

In [None]:
# Keep only rows that have a title. The original row labels are preserved,
# so they still correspond to row positions in cosine_sim.
df = df[df['title'].notna()]

In [None]:
# Reverse lookup: movie title -> row label in df.
indices = pd.Series(df.index, index=df['title'])
# When several movies share a title, keep only the last occurrence.
is_last_occurrence = ~indices.index.duplicated(keep='last')
indices = indices[is_last_occurrence]
indices

In [None]:
# Row of the similarity matrix for the chosen movie, as a one-column frame
# whose index is the row position of every compared movie.
target_movie_index = indices['Toy Story']
similarity_scores = pd.DataFrame({"score": cosine_sim[target_movie_index]})
similarity_scores

In [None]:
# Index values of the 11 highest similarity scores, most similar first.
ranked_scores = similarity_scores.sort_values(by="score", ascending=False)
movie_indices = ranked_scores.iloc[:11].index

In [None]:
# movie_indices are row positions in cosine_sim, i.e. row labels of the frame
# before untitled rows were dropped. df has lost rows since then, so a
# positional .iloc lookup would return shifted (wrong) titles. Look the titles
# up by label instead; labels of dropped (untitled) rows yield NaN and are removed.
df['title'].reindex(movie_indices).dropna()

# Collaborative Filtering

In [None]:
# Surprise Reader with default settings; it tells Dataset how to interpret ratings.
# NOTE(review): the default rating scale is (1, 5) — confirm the ratings file
# has no values outside that range (e.g. 0.5), otherwise pass rating_scale.
reader = Reader()

In [None]:
# Load the user ratings table from the local data directory and preview it.
ratings = pd.read_csv('data/ratings_small.csv')
ratings.head()

In [None]:
# Surprise expects exactly three columns, in the order user, item, rating.
rating_columns = ['userId', 'movieId', 'rating']
data = Dataset.load_from_df(ratings[rating_columns], reader)

In [None]:
# SVD algorithm from Surprise, with its default hyperparameters.
# NOTE(review): no random_state is passed, so results may differ between runs.
svd = SVD()

In [None]:
# Evaluate the algorithm with 5-fold cross-validation, reporting RMSE and MAE
# for every fold (verbose output).
cross_validate(
    algo=svd,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

In [None]:
# Train the model on every available rating (no held-out fold).
trainset = data.build_full_trainset()
svd.fit(trainset)

In [None]:
# All ratings given by user 1.
ratings.loc[ratings['userId'].eq(1)]

In [None]:
# Predicted rating of user 1 for movie 302; r_ui=3 is passed as the true
# rating so it appears alongside the estimate in the returned prediction.
svd.predict(uid=1, iid=302, r_ui=3)
