In [48]:
import pandas as pd 

# Load the movie data from a Parquet file (path is relative to the notebook).
df = pd.read_parquet('../data/movie_data.parquet')

In [49]:
# Summarise the columns: non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99806 entries, 0 to 99805
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   userId             99806 non-null  int64  
 1   rating             99806 non-null  float64
 2   imdb_id            99806 non-null  object 
 3   adult              99806 non-null  object 
 4   genres             99806 non-null  object 
 5   original_language  99806 non-null  object 
 6   overview           99792 non-null  object 
 7   popularity         99806 non-null  float64
 8   poster_path        99800 non-null  object 
 9   release_date       99800 non-null  object 
 10  runtime            99806 non-null  float64
 11  title              99806 non-null  object 
 12  vote_average       99806 non-null  float64
 13  vote_count         99806 non-null  float64
dtypes: float64(5), int64(1), object(8)
memory usage: 10.7+ MB


In [50]:
# Keep only the three columns the recommenders below work with.
ratings = df.loc[:, ["userId", "rating", "title"]]
ratings.head()

Unnamed: 0,userId,rating,title
0,7,3.0,Toy Story
1,9,4.0,Toy Story
2,13,5.0,Toy Story
3,15,2.0,Toy Story
4,19,3.0,Toy Story


### User-based Recommendation

In [51]:
# User x title matrix of ratings; a cell is NaN when the user has not rated the title.
movieRatings = pd.pivot_table(
    ratings,
    values="rating",
    index="userId",
    columns="title",
)
movieRatings.head()

title,$9.99,'Neath the Arizona Skies,"'night, Mother",(500) Days of Summer,...And God Created Woman,...And Justice for All,1-900,10,10 Attitudes,10 Cloverfield Lane,...,eXistenZ,loudQUIETloud: A Film About the Pixies,xXx,xXx: State of the Union,¡Three Amigos!,À Nous la Liberté,Æon Flux,İtirazım Var,Želary,’Round Midnight
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [52]:
import numpy as np

def corrwith(series, df, min_overlap=5):
    """Correlate ``series`` with every column of ``df`` over shared observations.

    For each column, only the index labels (users) that have a non-null value
    in both ``series`` and that column are used. Columns are skipped when the
    overlap is smaller than ``min_overlap``, when either side is constant over
    the overlap (the correlation is undefined), or when the correlation comes
    out as NaN.

    Parameters
    ----------
    series : pandas.Series
        Ratings of the target movie, indexed like the rows of ``df``.
    df : pandas.DataFrame
        One column per candidate movie.
    min_overlap : int, default 5
        Minimum number of shared non-null observations required.

    Returns
    -------
    pandas.Series
        float64 Series mapping column label -> correlation. It is empty (but
        still float64) when no column qualifies.
    """
    correlations = {}
    target = series.dropna()
    target_users = target.index

    for col in df.columns:
        other = df[col].dropna()

        # Users who rated both movies.
        common_users = target_users.intersection(other.index)
        if len(common_users) < min_overlap:
            # Too few shared ratings for a meaningful correlation.
            continue

        target_vals = target.loc[common_users]
        other_vals = other.loc[common_users]

        # A constant side has zero variance, so the correlation is undefined.
        # Counting distinct values avoids relying on an exact floating-point
        # zero from std(), which can be missed for values such as 0.1.
        if target_vals.nunique() < 2 or other_vals.nunique() < 2:
            continue

        corr = target_vals.corr(other_vals)
        if not np.isnan(corr):
            correlations[col] = corr

    # An explicit dtype keeps an empty result numeric instead of object-typed.
    return pd.Series(correlations, dtype=float)

In [53]:
# TEST: correlate every movie with "Toy Story", using only pairs of movies
# that share at least 10 raters.
target_movie = movieRatings['Toy Story']
similarMovies = corrwith(series=target_movie, df=movieRatings, min_overlap=10)

# Rating count and mean rating per title; the mask flags titles with 100+ ratings.
movieStats = ratings.groupby('title').agg({'rating': ['size', 'mean']})
popularMovies = movieStats['rating']['size'] >= 100

# Ten popular titles with the highest mean rating.
top_movies = (
    movieStats.loc[popularMovies]
    .sort_values(by=('rating', 'mean'), ascending=False)
    .head(10)
)

# Flatten the two-level column labels into "rating|size" / "rating|mean",
# then attach each title's similarity to the target movie.
mappedColumnsMoviestat = movieStats.loc[popularMovies]
mappedColumnsMoviestat.columns = [
    f'{top}' if sub == '' else f'{top}|{sub}'
    for top, sub in mappedColumnsMoviestat.columns
]
df1 = mappedColumnsMoviestat.join(pd.DataFrame(similarMovies, columns=['similarity']))

# The target movie should not appear in its own ranking.
df1 = df1.drop(index='Toy Story')

In [54]:
# Ten popular titles most correlated with the target movie.
df1.sort_values(by='similarity', ascending=False).head(10)

Unnamed: 0_level_0,rating|size,rating|mean,similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Toy Story 2,125,3.844,0.743352
A Bug's Life,105,3.609524,0.677299
"Monsters, Inc.",130,3.884615,0.549582
The Dark Knight,121,4.235537,0.540978
Finding Nemo,122,3.803279,0.537958
Austin Powers: The Spy Who Shagged Me,112,3.272321,0.519847
The Lion King,200,3.7775,0.517524
Spider-Man,134,3.522388,0.512995
The Incredibles,126,3.861111,0.508661
Stand by Me,112,4.09375,0.497638


### User-based recommendation function

In [55]:
import warnings
# Globally suppress RuntimeWarning messages.
# NOTE(review): presumably this hides NumPy warnings emitted while correlating
# sparse or constant rating columns below — confirm, and consider a narrower filter.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [56]:
def get_similar_movie_user_based(title, n=10, min_ratings=100):
    """Return the ``n`` popular movies whose ratings correlate best with ``title``.

    Reads the notebook-level ``movieRatings`` (user x title rating matrix) and
    ``ratings`` (long table with ``title`` and ``rating`` columns).

    Parameters
    ----------
    title : str
        Movie to find neighbours for; must be a column of ``movieRatings``.
    n : int, default 10
        Number of rows to return.
    min_ratings : int, default 100
        Only movies with at least this many ratings are candidates.

    Returns
    -------
    pandas.DataFrame
        Indexed by title with columns ``rating|size``, ``rating|mean`` and
        ``similarity``, sorted by ``similarity`` in descending order.

    Raises
    ------
    KeyError
        If ``title`` is not a column of ``movieRatings``.
    """
    if title not in movieRatings.columns:
        raise KeyError(f"Unknown movie title: {title!r}")
    target_ratings = movieRatings[title]

    # Correlate every movie column with the target. Movies whose correlation
    # cannot be computed come back as NaN and are dropped.
    similarity = (
        movieRatings.corrwith(target_ratings)
        .dropna()
        .rename("similarity")
    )

    # Rating count and mean per title, with flat "rating|size" / "rating|mean" labels.
    stats = ratings.groupby('title').agg({'rating': ['size', 'mean']})
    stats.columns = [f'{top}|{sub}' for top, sub in stats.columns]
    popular = stats[stats['rating|size'] >= min_ratings]

    # The target itself is only present when it passes the popularity filter,
    # so dropping it must not fail for less-rated titles.
    ranked = popular.join(similarity).drop(index=title, errors="ignore")
    return ranked.sort_values('similarity', ascending=False).head(n)

In [57]:
# Example: popular movies most similar to "Star Wars".
get_similar_movie_user_based('Star Wars', 10)

Unnamed: 0_level_0,rating|size,rating|mean,similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Return of the Jedi,217,4.059908,0.747774
The Empire Strikes Back,234,4.232906,0.70079
The Dark Knight,121,4.235537,0.549486
The Lord of the Rings: The Fellowship of the Ring,200,4.1825,0.477582
Raiders of the Lost Ark,220,4.193182,0.476442
The Incredibles,126,3.861111,0.450914
The Lord of the Rings: The Two Towers,188,4.06117,0.448153
E.T. the Extra-Terrestrial,160,3.76875,0.428289
Star Trek: Generations,114,3.350877,0.413682
"Monsters, Inc.",130,3.884615,0.40318


### Item-based Recommendation

In [58]:
# Pairwise correlation between all movie columns; pairs with fewer than
# 10 shared observations are left as NaN (min_periods=10).
corr = movieRatings.corr(min_periods=10)
corr.head()

title,$9.99,'Neath the Arizona Skies,"'night, Mother",(500) Days of Summer,...And God Created Woman,...And Justice for All,1-900,10,10 Attitudes,10 Cloverfield Lane,...,eXistenZ,loudQUIETloud: A Film About the Pixies,xXx,xXx: State of the Union,¡Three Amigos!,À Nous la Liberté,Æon Flux,İtirazım Var,Želary,’Round Midnight
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
$9.99,,,,,,,,,,,...,,,,,,,,,,
'Neath the Arizona Skies,,,,,,,,,,,...,,,,,,,,,,
"'night, Mother",,,,,,,,,,,...,,,,,,,,,,
(500) Days of Summer,,,,1.0,,,,,,,...,,,,,,,,,,
...And God Created Woman,,,,,,,,,,,...,,,,,,,,,,


In [75]:
# Candidate pool: the 50 titles most correlated with "Star Wars" (the target
# itself is excluded). The popularity filter below trims this pool, and only
# the first ten rows are displayed.
top10 = (
    corr['Star Wars']
    .drop('Star Wars')
    .sort_values(ascending=False)
    .head(50)
)

# One row per candidate, similarity rounded to two decimals.
top10_df = pd.DataFrame({
    'title': top10.index,
    'similarity': top10.round(2).to_numpy(),
})

# One metadata row per title (first occurrence wins).
movie_info = df[['title', 'vote_count', 'vote_average']].drop_duplicates(subset='title')

# Attach the metadata, keep titles with more than 150 votes, and rank by
# similarity from highest to lowest.
top10_df = (
    pd.merge(top10_df, movie_info, how='left', on='title')
    .loc[lambda frame: frame['vote_count'] > 150]
    .sort_values('similarity', ascending=False)
    .reset_index(drop=True)
)
top10_df.head(10)

Unnamed: 0,title,similarity,vote_count,vote_average
0,Predator 2,0.84,743.0,6.0
1,The Adventures of Robin Hood,0.82,170.0,7.6
2,The Passion of the Christ,0.76,888.0,6.9
3,Return of the Jedi,0.75,4763.0,7.9
4,National Lampoon’s Van Wilder,0.74,867.0,5.9
5,Moneyball,0.73,1409.0,7.0
6,A Little Princess,0.73,207.0,7.4
7,The Empire Strikes Back,0.7,5998.0,8.2
8,We Were Soldiers,0.7,531.0,6.7
9,Band of Brothers,0.68,725.0,8.2


### Recommendations based on multiple movie ratings

In [77]:
# Example input: one user's ratings, keyed by movie title.
user_ratings = {
    "Star Wars": 2,
    "Blade Runner": 3,
    "Men in Black": 5,
    "The Lion King": 2,
    "The Lord of the Rings: The Return of the King":4,
    "The Amazing Spider-Man": 2
}

In [78]:
# Accumulate one score per title: every rated movie contributes its column of
# correlations scaled by (rating - 2), so a rating of 2 adds nothing and
# higher ratings pull correlated titles up.
# NOTE(review): confirm that 2 is the intended neutral rating.
scores = pd.Series(dtype=float)
for rated_title, stars in user_ratings.items():
    if rated_title not in corr:
        continue  # no correlation column for this title
    scores = scores.add(corr[rated_title] * (stars - 2), fill_value=0)

# Titles the user has already rated are not recommendations.
scores = scores.drop(labels=list(user_ratings), errors="ignore")

# Tabular form: one row per candidate title.
recommended_df = pd.DataFrame({
    'title': scores.index,
    'score': scores.to_numpy(),
})

# Attach vote metadata (one row per title, first occurrence wins).
movie_info = df[['title', 'vote_count', 'vote_average']].drop_duplicates(subset='title')
recommended_df = pd.merge(recommended_df, movie_info, how='left', on='title')

# Keep only titles with more than 150 votes.
recommended_df = recommended_df.loc[recommended_df['vote_count'] > 150]

top10_recommendations = (
    recommended_df
    .sort_values('score', ascending=False)
    .head(10)
    .reset_index(drop=True)
)
top10_recommendations

Unnamed: 0,title,score,vote_count,vote_average
0,Shanghai Knights,3.831527,707.0,6.0
1,Ice Age: The Meltdown,3.277348,3034.0,6.5
2,The Passion of the Christ,3.221604,888.0,6.9
3,Lethal Weapon 3,3.115912,824.0,6.4
4,Bedazzled,3.043121,526.0,5.6
5,High Noon,3.015239,343.0,7.6
6,The Client,2.995664,287.0,6.4
7,The Family Man,2.963854,530.0,6.5
8,Mr. Deeds,2.956386,660.0,5.6
9,October Sky,2.954934,259.0,7.4
