In [10]:
import pandas as pd 

# Load the combined movie/rating table; the path is relative to the notebook's working directory.
df = pd.read_parquet('../data/movie_data.parquet')

In [11]:
# Print column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99806 entries, 0 to 99805
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   userId             99806 non-null  int64  
 1   rating             99806 non-null  float64
 2   imdb_id            99806 non-null  object 
 3   adult              99806 non-null  object 
 4   genres             99806 non-null  object 
 5   original_language  99806 non-null  object 
 6   overview           99792 non-null  object 
 7   popularity         99806 non-null  float64
 8   poster_path        99800 non-null  object 
 9   release_date       99800 non-null  object 
 10  runtime            99806 non-null  float64
 11  title              99806 non-null  object 
 12  vote_average       99806 non-null  float64
 13  vote_count         99806 non-null  float64
dtypes: float64(5), int64(1), object(8)
memory usage: 10.7+ MB


In [12]:
# Narrow the table to the three columns the recommenders need:
# who rated (userId), what score they gave (rating), and which movie (title).
ratings = df.loc[:, ['userId', 'rating', 'title']]
ratings.head()

Unnamed: 0,userId,rating,title
0,7,3.0,Toy Story
1,9,4.0,Toy Story
2,13,5.0,Toy Story
3,15,2.0,Toy Story
4,19,3.0,Toy Story


### User-based Recommendation

In [13]:
# Reshape the long ratings table into a wide userId x title matrix.
# Cells are NaN where a user has no rating for a title; repeated (user, title)
# pairs are combined by pivot_table's default aggregation (mean).
movieRatings = ratings.pivot_table(
    values='rating',
    index=['userId'],
    columns=['title'],
)
movieRatings.head()

title,$9.99,'Neath the Arizona Skies,"'night, Mother",(500) Days of Summer,...And God Created Woman,...And Justice for All,1-900,10,10 Attitudes,10 Cloverfield Lane,...,eXistenZ,loudQUIETloud: A Film About the Pixies,xXx,xXx: State of the Union,¡Three Amigos!,À Nous la Liberté,Æon Flux,İtirazım Var,Želary,’Round Midnight
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# Ratings column for Toy Story, indexed by userId (NaN where the user gave no rating).
target_movie = movieRatings['Toy Story']

### For each candidate movie M, only users who rated both Toy Story and M are considered in the correlation.

In [15]:
import numpy as np

def safe_corrwith(series, df, min_overlap=5):
    """Correlate ``series`` with each column of ``df`` over their shared non-null rows.

    A column is skipped (it gets no entry in the result) when:

    * fewer than ``min_overlap`` index labels have a non-null value in both
      ``series`` and the column,
    * either side has zero standard deviation on the shared labels, or
    * the resulting correlation is NaN.

    Parameters
    ----------
    series : pandas.Series
        Reference values (e.g. one movie's ratings indexed by user).
    df : pandas.DataFrame
        Candidate columns sharing the index space of ``series``.
    min_overlap : int, default 5
        Minimum number of shared non-null labels required to keep a column.

    Returns
    -------
    pandas.Series
        Float Series mapping column label -> correlation with ``series``.
        Empty (but still float-typed) when no column qualifies.
    """
    correlations = {}
    target = series.dropna()
    target_users = target.index

    # ``items()`` always yields one Series per column, even when labels repeat.
    for col, column_values in df.items():
        other = column_values.dropna()

        # Labels that carry a value on both sides.
        common_users = target_users.intersection(other.index)
        if len(common_users) < min_overlap:
            # Too few shared observations for a meaningful correlation.
            continue

        target_vals = target.loc[common_users]
        other_vals = other.loc[common_users]

        # Correlation is undefined when either side is constant.
        if target_vals.std() == 0 or other_vals.std() == 0:
            continue

        corr = target_vals.corr(other_vals)
        if not np.isnan(corr):
            correlations[col] = corr

    # Explicit dtype: an empty dict would otherwise produce an object-dtype Series.
    return pd.Series(correlations, dtype=float)

# Usage: correlate every title with Toy Story, keeping only titles that share
# at least 10 raters with it.
target_movie = movieRatings.loc[:, 'Toy Story']
similarMovies = safe_corrwith(series=target_movie, df=movieRatings, min_overlap=10)

# Wrap the result in a one-column frame for display.
similarity_table = pd.DataFrame(similarMovies)
similarity_table.head()


Unnamed: 0,0
(500) Days of Summer,0.407521
10 Things I Hate About You,-0.045788
101 Dalmatians,0.453423
12 Angry Men,0.364079
127 Hours,0.370739


In [27]:
# Per-title rating count ('size') and average score ('mean').
movieStats = ratings.groupby('title').agg({'rating': ['size', 'mean']})

# Boolean mask selecting titles with at least 100 ratings.
popularMovies = movieStats['rating']['size'].ge(100)

# The ten best-rated titles among the popular ones.
top_movies = (
    movieStats.loc[popularMovies]
    .sort_values(by=('rating', 'mean'), ascending=False)
    .head(10)
)

In [17]:
# Stats for popular titles only, with the two-level column header flattened
# to 'rating|size' / 'rating|mean'.
mappedColumnsMoviestat = movieStats.loc[popularMovies]
mappedColumnsMoviestat.columns = [
    f'{i}' if j == '' else f'{i}|{j}'
    for i, j in mappedColumnsMoviestat.columns
]

# Attach each title's similarity to Toy Story, then remove Toy Story itself.
# NOTE(review): this rebinds `df`, replacing the raw table loaded at the top of
# the notebook; cells that expect the raw table will not re-run after this one.
df = (
    mappedColumnsMoviestat
    .join(pd.DataFrame(similarMovies, columns=['similarity']))
    .drop('Toy Story')
)


In [18]:
# Ten popular titles most similar to Toy Story.
df.sort_values(by='similarity', ascending=False).head(10)

Unnamed: 0_level_0,rating|size,rating|mean,similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Toy Story 2,125,3.844,0.743352
A Bug's Life,105,3.609524,0.677299
"Monsters, Inc.",130,3.884615,0.549582
The Dark Knight,121,4.235537,0.540978
Finding Nemo,122,3.803279,0.537958
Austin Powers: The Spy Who Shagged Me,112,3.272321,0.519847
The Lion King,200,3.7775,0.517524
Spider-Man,134,3.522388,0.512995
The Incredibles,126,3.861111,0.508661
Stand by Me,112,4.09375,0.497638


### Function

In [31]:
import warnings
# Install a process-wide filter that hides every RuntimeWarning from here on
# (it is not scoped to a single call).
# NOTE(review): presumably meant to quiet numeric warnings raised while
# correlating sparse rating columns — confirm before relying on it.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [32]:
def get_similar_movie_user_based(title, n=10, min_ratings=100):
    """Return the ``n`` popular movies whose ratings correlate best with ``title``.

    Reads the notebook-level ``movieRatings`` (userId x title matrix) and
    ``ratings`` (long table with 'title' and 'rating' columns).

    Parameters
    ----------
    title : str
        Column of ``movieRatings`` to compare every other movie against.
    n : int, default 10
        Number of rows to return.
    min_ratings : int, default 100
        Only movies with at least this many ratings are candidates.

    Returns
    -------
    pandas.DataFrame
        Indexed by title with columns 'rating|size', 'rating|mean' and
        'similarity', sorted by descending similarity. ``title`` itself is
        never included.

    Raises
    ------
    KeyError
        If ``title`` is not a column of ``movieRatings``.
    """
    target_ratings = movieRatings[title]

    # Correlation of every movie column with the target; movies with no usable
    # overlap come back as NaN and are dropped.
    similarity = movieRatings.corrwith(target_ratings).dropna().rename('similarity')

    # Per-title rating count and mean, with flat column names.
    stats = ratings.groupby('title')['rating'].agg(['size', 'mean'])
    stats.columns = [f'rating|{name}' for name in stats.columns]
    popular = stats[stats['rating|size'] >= min_ratings]

    # errors='ignore': the target is absent from `popular` when it has fewer
    # than `min_ratings` ratings, and that must not raise.
    candidates = popular.join(similarity).drop(index=title, errors='ignore')
    return candidates.sort_values('similarity', ascending=False).head(n)

In [33]:
# Ten popular titles whose ratings track Star Wars most closely.
get_similar_movie_user_based(title='Star Wars', n=10)

Unnamed: 0_level_0,rating|size,rating|mean,similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Return of the Jedi,217,4.059908,0.747774
The Empire Strikes Back,234,4.232906,0.70079
The Dark Knight,121,4.235537,0.549486
The Lord of the Rings: The Fellowship of the Ring,200,4.1825,0.477582
Raiders of the Lost Ark,220,4.193182,0.476442
The Incredibles,126,3.861111,0.450914
The Lord of the Rings: The Two Towers,188,4.06117,0.448153
E.T. the Extra-Terrestrial,160,3.76875,0.428289
Star Trek: Generations,114,3.350877,0.413682
"Monsters, Inc.",130,3.884615,0.40318


### Item-based Recommendation

In [21]:
# Title-by-title Pearson correlation matrix. Pairs of titles with fewer than
# 100 raters in common are left as NaN.
corr = movieRatings.corr(method='pearson', min_periods=100)
corr.head()

title,$9.99,'Neath the Arizona Skies,"'night, Mother",(500) Days of Summer,...And God Created Woman,...And Justice for All,1-900,10,10 Attitudes,10 Cloverfield Lane,...,eXistenZ,loudQUIETloud: A Film About the Pixies,xXx,xXx: State of the Union,¡Three Amigos!,À Nous la Liberté,Æon Flux,İtirazım Var,Želary,’Round Midnight
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
$9.99,,,,,,,,,,,...,,,,,,,,,,
'Neath the Arizona Skies,,,,,,,,,,,...,,,,,,,,,,
"'night, Mother",,,,,,,,,,,...,,,,,,,,,,
(500) Days of Summer,,,,,,,,,,,...,,,,,,,,,,
...And God Created Woman,,,,,,,,,,,...,,,,,,,,,,


In [22]:
# Ten titles most correlated with Star Wars, excluding Star Wars itself.
(
    corr['Star Wars']
    .drop('Star Wars')
    .sort_values(ascending=False)
    .head(10)
)

title
Return of the Jedi                                   0.747774
The Empire Strikes Back                              0.700790
The Lord of the Rings: The Fellowship of the Ring    0.477582
Raiders of the Lost Ark                              0.476442
The Lord of the Rings: The Two Towers                0.448153
E.T. the Extra-Terrestrial                           0.428289
The Lord of the Rings: The Return of the King        0.383217
Men in Black                                         0.341665
The Lion King                                        0.337286
Blade Runner                                         0.333179
Name: Star Wars, dtype: float64

### Recommend based on multi movie ratings

In [25]:
# Example profile: movie title -> the rating this user gave it.
user_ratings = dict([
    ("Star Wars", 2),
    ("Blade Runner", 3),
    ("Men in Black", 5),
    ("The Lion King", 2),
    ("The Lord of the Rings: The Return of the King", 4),
])

In [37]:
def recommend_from_ratings(user_ratings, corr, baseline=2):
    """Score unseen movies from a handful of rated ones.

    Each rated movie adds ``corr[movie] * (rating - baseline)`` to the score of
    every other movie, so ratings above ``baseline`` pull similar movies up and
    ratings below it push them down.

    Parameters
    ----------
    user_ratings : dict
        Maps movie title -> rating given by the user.
    corr : pandas.DataFrame
        Movie-by-movie similarity matrix; titles missing from its columns are
        skipped.
    baseline : number, default 2
        Rating treated as "no preference". The default matches the value this
        notebook has always used; pass e.g. 2.5 to centre on a neutral rating.

    Returns
    -------
    pandas.Series
        Unsorted score per movie title, with the already-rated movies removed.
        Movies with no usable similarity to any rated movie stay NaN.
    """
    scores = pd.Series(dtype=float)

    for movie, rating in user_ratings.items():
        if movie not in corr:
            continue
        # Weight this movie's similarity column by (rating - baseline).
        weighted_scores = corr[movie] * (rating - baseline)
        # fill_value=0 lets a movie keep accumulating when only one side is NaN.
        scores = scores.add(weighted_scores, fill_value=0)

    # Remove movies already rated, without mutating the Series in place.
    return scores.drop(list(user_ratings), errors="ignore")


scores = recommend_from_ratings(user_ratings, corr)

# Sort descending
recommended = scores.sort_values(ascending=False)
recommended.head()

title
The Lord of the Rings: The Two Towers                2.252402
The Lord of the Rings: The Fellowship of the Ring    2.197611
The Empire Strikes Back                              1.827331
The Matrix                                           1.710042
Back to the Future                                   1.613415
dtype: float64