# Recommender Systems

## What are recommender systems

# Simple Recommenders

## Dataset

**Full MovieLens Dataset**. Dataset vsebuje metapodatke za 45'000 filmov, izdanih pred julijem 2017. Za filme imamo podatke o igralcih, režiserju, plot keywords, revenue, jeziku, production companies, TMDB vote counts, vote averages itd.

* `movies_metadata.csv`: datoteka vsebuje informacije za okoli 45'000 filmov, kot so *budget, žanr, revenue, production countries, companies, ...*
* `keywords.csv`: datoteka vsebuje *plot keywords* 
* `credits.csv`: vsebuje informacije o *Cast and Crew*
* `links.csv`: vsebuje TMDB in IMDB IDs za vse filme
* `links_small.csv`: vsebuje TMDB in IMDB IDs za manjši subset 9'000 filmov
* `ratings_small.csv`: vsebuje informacije glede 100'000 ratingsov od 700 userjev za 9'000 filmov

Podatki, ki jih bomo uporabljali, so dostopni na [linku](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset/).



In [None]:
import pandas as pd

metadata = pd.read_csv('./data/movies_metadata.csv', low_memory=False)
metadata.head(3)

In [None]:
# C: mean of the 'vote_average' column over every row of `metadata`
C = metadata['vote_average'].mean()
print("C:", C)

# m: 90th percentile of the 'vote_count' column
m = metadata['vote_count'].quantile(0.90)
print("m:", m)

In [None]:
# Filter out all qualified movies (vote count of at least `m`) into a new DataFrame.
# Filter first and copy afterwards: only the selected rows are duplicated, and
# `q_movies` is an explicit, independent frame that can safely take new columns.
q_movies = metadata.loc[metadata['vote_count'] >= m].copy()
q_movies.shape

**Izračun metrik**

In [None]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    """Return the IMDB-style weighted rating for ``x``.

    Args:
        x: Object indexable by 'vote_count' and 'vote_average'.
        m: Vote-count weight given to the prior ``C``; defaults to the
            module-level ``m`` at definition time.
        C: Prior rating mixed in; defaults to the module-level ``C`` at
            definition time.

    Returns:
        ``v / (v + m) * R + m / (m + v) * C`` where ``v`` is the vote count
        and ``R`` the vote average.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    own_share = votes / (votes + m) * rating
    prior_share = m / (m + votes) * C
    return own_share + prior_share

In [None]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`.
# The function only does element-wise arithmetic on the 'vote_count' and
# 'vote_average' entries, so it can take the whole frame in one vectorised call
# instead of being invoked once per row through `apply(axis=1)`.
q_movies['score'] = weighted_rating(q_movies)
q_movies[["original_title", "score"]].head()

**Sortiranje glede na metriko**

In [None]:
# Sort movies by the score calculated above, highest score first
q_movies = q_movies.sort_values('score', ascending=False)

# Show the top 20 movies
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

# Content-Based Recommender

## Plot Description Based Recommender

In [None]:
#Print plot overviews of the first 5 movies.
metadata['overview'].head()

In [None]:
# Import TfidfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a TF-IDF vectorizer object. Remove all English stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with an empty string so every overview is a string
metadata['overview'] = metadata['overview'].fillna('')

# Build the TF-IDF matrix: fit the vectorizer on the overviews and transform them in one step
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

# Output the shape of tfidf_matrix
tfidf_matrix.shape

In [None]:
tfidf.vocabulary_

In [None]:
tfidf.get_feature_names_out()[38693:38700]

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix.
# linear_kernel is a plain dot product; it equals cosine similarity only when
# the rows are unit length. The `tfidf` vectorizer is created without a `norm`
# argument, so this relies on TfidfVectorizer's default normalisation
# (documented as 'l2').
# NOTE(review): the result is a dense (rows x rows) array; with tens of
# thousands of movies this is presumably several GB — confirm it fits in memory.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim.shape

In [None]:
cosine_sim.shape

In [None]:
# Construct a reverse map from movie title to row index of `metadata`.
# `Series.drop_duplicates()` compares the *values* (the row labels, which are
# already unique), so it never removed repeated titles. Deduplicate on the
# title index instead, keeping the first occurrence, so that `indices[title]`
# always yields a single row index rather than a Series.
indices = pd.Series(metadata.index, index=metadata['title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices[:10]

In [None]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Args:
        title: Movie title, looked up in the module-level ``indices`` map.
        cosine_sim: Square similarity matrix; row ``i`` holds the similarity
            of movie ``i`` to every movie, in the row order of the
            module-level ``metadata`` frame.
        top_n: Number of recommendations to return (default 10).

    Returns:
        The entries of ``metadata['title']`` for up to ``top_n`` movies,
        most similar first. The queried movie itself is never included.

    Raises:
        KeyError: If ``title`` is not a key of ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # Several movies can share one title, in which case the lookup yields a
    # Series of positions instead of a scalar; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every *other* movie with its similarity to the queried one. The
    # queried movie is excluded by position rather than by assuming it sorts
    # first: that assumption fails when its own vector is all zeros (self
    # similarity 0) or when another movie ties with it.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Sort the movies based on the similarity scores, highest first
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the `top_n` most similar movies
    movie_indices = [i for i, _ in sim_scores[:top_n]]

    # Return the titles of the most similar movies
    return metadata['title'].iloc[movie_indices]

In [None]:
get_recommendations('The Dark Knight Rises')

In [None]:
get_recommendations('The Godfather')

## Credits, Genres and Keywords Based Recommender

In [None]:
metadata = pd.read_csv('./data/movies_metadata.csv', low_memory=False)

# Load keywords and credits
credits = pd.read_csv('./data/credits.csv')
keywords = pd.read_csv('./data/keywords.csv')

# Remove rows with bad IDs. They are detected by content (an 'id' that cannot
# be parsed as a number) instead of by hard-coded row labels, which would
# silently point at the wrong rows if the CSV were re-exported or re-ordered.
metadata['id'] = pd.to_numeric(metadata['id'], errors='coerce')
metadata = metadata.dropna(subset=['id'])

# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

In [None]:
metadata[["original_title", "cast", "crew", "keywords"]].head()

In [None]:
from ast import literal_eval
import numpy as np


# Parse the stringified values of these columns back into Python objects.
# `literal_eval` only evaluates literals, so no arbitrary code is executed.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

def get_director(x):
    """Return the name of the first entry of ``x`` whose job is 'Director'.

    ``x`` is an iterable of dicts with 'job' and 'name' keys. ``np.nan`` is
    returned when no entry matches.
    """
    director_names = (
        member['name'] for member in x if member['job'] == 'Director'
    )
    return next(director_names, np.nan)

def get_list(x):
    """Return the 'name' of at most the first three entries of ``x``.

    Anything that is not a list (missing or malformed data) yields an empty
    list.
    """
    # Guard clause: missing/malformed data
    if not isinstance(x, list):
        return []

    # Collect every name first, then keep the leading three (or fewer)
    all_names = [entry['name'] for entry in x]
    return all_names[:3]

# Define new director, cast, genres and keywords features that are in a suitable form.
metadata['director'] = metadata['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)

# Print the new features of the first 3 films
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

In [None]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    """Normalise ``x`` by removing spaces and lower-casing.

    A list is normalised element by element, a string is normalised directly,
    and any other value (e.g. a missing director) becomes an empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither a list nor a string: treat as missing
    return ''

# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

In [None]:
def create_soup(x):
    """Join keywords, cast, director and genres of ``x`` into one space-separated string."""
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)

# Create a new soup feature
metadata['soup'] = metadata.apply(create_soup, axis=1)
print(metadata.shape)
metadata[['title', 'soup']].head(2)

In [None]:
metadata = metadata[metadata["vote_count"] > 20]
metadata.shape

In [None]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])

count_matrix.shape

In [None]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Reset index of your main DataFrame so that row labels match the row
# positions of `cosine_sim`, then construct the reverse title -> position map.
metadata = metadata.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])
# Titles are not guaranteed to be unique; keep the first occurrence of each so
# that `indices[title]` always yields a single position rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
get_recommendations('The Dark Knight Rises', cosine_sim)

# Collaborative Filtering

![data](./images/rating-matrix.png)

## Dataset

In [None]:
import pandas as pd

In [None]:
ratings = pd.read_csv("./data/u.data", sep="\t", names=["user_id", "item_id", "rating", "timestamp"])
ratings.head()

In [None]:
movies = pd.read_csv("./data/u.item", sep="|", header=None)
movies.head()

---

In [None]:
import matplotlib.pyplot as plt

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Simple data
a = [1.0, 2.0]
b = [2.0, 4.0]
c = [2.5, 4.0]
d = [4.5, 5.0]

data = pd.DataFrame([a, b, c, d], columns=["Movie_1", "Movie_2"], index=list("ABCD"))

# Scatter plot
plt.scatter(data["Movie_1"], data["Movie_2"])

# Add labels and index names
for idx, row in data.iterrows():
    plt.annotate(idx, (row["Movie_1"] - 0.2, row["Movie_2"] + 0.2))

# Add labels to axes
plt.xlabel("Movie 1")
plt.ylabel("Movie 2")

plt.xlim(0,6)
plt.ylim(0,6)

# Display gridlines
plt.grid(True)

# Show the plot
plt.show()

**Calculating similarity**

In [None]:
from scipy import spatial

print(spatial.distance.euclidean(c, a))
print(spatial.distance.euclidean(c, b))
print(spatial.distance.euclidean(c, d))

In [None]:
# Create the figure and axes
fig, ax = plt.subplots()

# Draw every row of `data` as a line from the origin to its
# (Movie_1, Movie_2) point and annotate the end point with the row's index name
for idx, row in data.iterrows():
    ax.plot([0, row["Movie_1"]], [0, row["Movie_2"]], c="blue", alpha=0.3)
    ax.annotate(idx, (row["Movie_1"] - 0.2, row["Movie_2"] + 0.2))

# Add labels to axes
plt.xlabel("Movie 1")
plt.ylabel("Movie 2")

# Fix both axes to the same 0-6 range
plt.xlim(0,6)
plt.ylim(0,6)

# Display gridlines
plt.grid(True)

# Show the plot
plt.show()

In [None]:
print(spatial.distance.cosine(a,b))

print(spatial.distance.cosine(c,b))
print(spatial.distance.cosine(c,d))

---

**Calculating new rating**

---

## User-Based vs Item-Based Collaborative Filtering

# Realen Primer

In [None]:
movies = pd.read_csv("./data/u.item", sep="|", header=None)
movies.head()

In [None]:
# Load the MovieLens 100k dataset
columns = ['user_id', 'item_id', 'rating', 'timestamp']
raw_data = pd.read_csv('./data/u.data', sep='\t', names=columns)
print("Raw data shape:", raw_data.shape)

In [None]:
data = raw_data.copy()
# Translate item ids (column 0 of `movies`) into titles (column 1) with one
# dictionary lookup per row, instead of handing `replace` two long parallel
# lists. Ids without an entry in `movies` are left unchanged, as before.
id_to_title = dict(zip(movies[0], movies[1]))
data["item_id"] = data["item_id"].map(lambda item: id_to_title.get(item, item))
# Keep a single row per (user, title) pair; presumably needed because several
# item ids can share one title — verify against the data.
data.drop_duplicates(subset=["user_id", "item_id"], inplace=True)
print("Data shape after duplicated drop: ", data.shape)
data.head()

In [None]:
data.reset_index(drop=True, inplace=True)
data.info()

In [None]:
# Create a user-item rating matrix
user_item_matrix = data.pivot(index='user_id', columns='item_id', values='rating')
# each row is user. Each column is movie. Row shows what rating did user give to what movies

user_item_matrix

In [None]:
# Remove each user's rating bias: subtract the row mean (missing ratings are
# skipped by `mean`) from every rating in that row. Done as one vectorised
# operation instead of a Python-level lambda per row; unrated cells stay NaN.
unbiased = user_item_matrix.sub(user_item_matrix.mean(axis=1), axis=0)
unbiased.head()

In [None]:
from scipy import spatial

def calc_cosine_distance(user, other_user):
    """Return the cosine distance between two rating vectors.

    Missing values are treated as 0. Neither argument is modified, because
    `fillna` returns a new object.
    """
    first_vector = user.fillna(0).values
    second_vector = other_user.fillna(0).values
    return spatial.distance.cosine(first_vector, second_vector)

In [None]:
def get_rating(user_id, movie, data, k=5):
    """Predict the rating ``user_id`` would give to ``movie``.

    The prediction is the mean rating that the ``k`` most similar users (by
    cosine distance over their rating rows) gave to ``movie``, shifted by the
    user's own mean rating. Progress is printed along the way.

    Args:
        user_id: Row label of the user in ``data``.
        movie: Column label of the movie in ``data``.
        data: User-by-movie rating frame with NaN for unrated movies; expected
            to be mean-centred per user (the call site passes ``unbiased``).
        k: Number of most similar users to average over (default 5).

    Returns:
        The predicted rating, or ``None`` when the user has already rated the
        movie or nobody has rated it.
    """
    print(f"We have user ID: {user_id}.")
    print(f"We want to see how he would rate movie: {movie}")

    user = data.loc[user_id, :]
    # The user's mean rating comes from the raw module-level rating matrix,
    # not from `data`.
    user_bias = user_item_matrix.loc[user_id, :].mean()

    if not pd.isna(user[movie]):
        print("User already saw the movie.")
        return None

    print("User hasn't seen the movie yet.")

    # Get all people who rated the movie
    watched_movie = data[data[movie].notnull()]
    print(f"{watched_movie.shape[0]} people have rated this movie.")
    if watched_movie.empty:
        # No neighbours to learn from, so there is nothing to average.
        print("Nobody has rated this movie, so no prediction can be made.")
        return None

    print("The user rated following movies:\n")
    print(user[user.notnull()] + user_bias)

    # Cosine *distance* to every user who rated the movie (smaller means more
    # similar). It is kept in its own Series instead of being written into
    # `watched_movie`, which is a slice of `data`: this avoids pandas'
    # SettingWithCopyWarning and in-place sorting of a slice.
    distances = watched_movie.apply(
        lambda row: calc_cosine_distance(user, row), axis=1
    )
    nearest_users = distances.sort_values(ascending=True).head(k).index.values
    print(f"\nTop {k} similar users by ID: ", nearest_users)

    # Average the neighbours' ratings and shift by the user's own mean rating
    average_score = data.loc[nearest_users, movie].mean()
    predicted = average_score + user_bias

    print(f"Naš user bi {movie} ocenil z {predicted : .2f}")
    return predicted



user_id = 3
movie = "101 Dalmatians (1996)"
get_rating(user_id, movie, unbiased)