# Because you like this movie...

This very simple recommendation system uses a factorization matrix with Pearson correlation to match similar movies.  
Before running it, apply a filter to rating.csv and movie.csv with our other custom-made tool, "DataFrame Filter".  
It will speed up the factorization and the overall runtime of the recommendation system.
Enjoy!

In [30]:
# Import libraries
import pandas as pd
import numpy as np

In [31]:
# Location of the filtered ratings export
path_rating = "output_data\\rating_confidential_movies.csv"

# Read the ratings file into a DataFrame
df = pd.read_csv(filepath_or_buffer=path_rating)

# Report how many distinct movies have been rated
print(df['movieId'].nunique(dropna=True))

df.head(n=2)

24060


Unnamed: 0,userId,movieId,rating
0,1,1525,3.0
1,1,1750,3.5


#### Pivoting the DataFrame into a Factorization Matrix

In [32]:
# Partition the distinct user ids into 10 near-equal groups so that each pivot stays small
user_splits = np.array_split(df['userId'].unique(), 10)

# Accumulator for the per-group pivots
df_pivot_list = []

# Pivot every group of users on its own
# (rows: userId, columns: movieId, cells: rating, averaged by pivot_table's default aggregation)
for split in user_splits:
    belongs_to_split = df['userId'].isin(split)
    df_pivot_list.append(
        df.loc[belongs_to_split].pivot_table(values='rating', index='userId', columns='movieId')
    )
    print("New split on duty !")

# Stack the per-group pivots back into a single matrix
df_pivot = pd.concat(df_pivot_list)

print('Job Done: Factorization Matrix Ready')


New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
Job Done: Factorization Matrix Ready


In [33]:
# Display the matrix shape as (rows, columns): one row per userId, one column per movieId
df_pivot.shape

(102443, 24060)

In [34]:
# Preview the first two user rows of the matrix
df_pivot.head(n=2)

movieId,30,33,37,38,40,49,51,53,54,55,...,130586,130604,130614,130622,130656,130828,131110,131172,131237,131262
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,


#### Introducing the movie DataFrame with one-hot encoded genres

In [37]:
# Set path to csv.
# The backslash is escaped: a bare "\m" is an invalid escape sequence that Python
# only tolerates with a warning. The resulting runtime string is unchanged.
path_movie = "output_data\\movie_IQR_users.csv"

# Load csv to a DataFrame and use 'movieId' as its index
movie_df = pd.read_csv(path_movie).set_index('movieId')

# Set year column as str
# NOTE(review): presumably done so that 'year' is not treated as a numeric feature
# later on; the float values are rendered as e.g. "1995.0" - confirm this is intended.
movie_df['year'] = movie_df['year'].astype(str)

print(movie_df.shape)
movie_df.head(2)

(19658, 22)


Unnamed: 0_level_0,title,genres,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Jumanji,Adventure|Children|Fantasy,1995.0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Creating a movie/genres DataFrame with numeric columns only

In [38]:
# Keep only the float/int columns of movie_df (the one-hot genre flags shown below)
movie_df_num = movie_df.select_dtypes(include=[float, int], exclude=None)
movie_df_num.head(n=2)

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


#### Create Pearson Correlation Function

This function computes two Pearson correlations for the target movie: one between its genres and the genres of every other movie, and one between its ratings and the ratings users gave to every other movie. Both scores are then sorted in descending order, and only the first 10 movies of the list are displayed.

In [39]:
import warnings

# Ignore RuntimeWarnings for the rest of the session
# (presumably the ones emitted while correlating sparse or constant columns - TODO confirm)
warnings.filterwarnings("ignore", category=RuntimeWarning)


def matching_genres(movie_title, top_n=10):
    """Print the movies most similar to ``movie_title``.

    Two Pearson scores are computed between the target movie and every other movie:

    * ``PearsonG``: correlation between the rows of ``movie_df_num``.
    * ``PearsonR``: correlation between the rating columns of ``df_pivot``.

    Movies missing either score are discarded, the target movie itself is removed,
    and the remainder is ranked by ``PearsonR`` then ``PearsonG``, both descending.
    Relies on the notebook-level ``movie_df``, ``movie_df_num`` and ``df_pivot``.

    Parameters
    ----------
    movie_title : str
        Value compared for exact equality with ``movie_df['title']``. When several
        movies share the title, the first match is used.
    top_n : int, default 10
        Number of recommendations to print.

    Returns
    -------
    None
        The ranking is printed, not returned.

    Raises
    ------
    IndexError
        If no movie in ``movie_df`` carries this title.
    KeyError
        If the movie has no column in ``df_pivot`` (nobody rated it).
    """
    # Find index of target movie, failing with an explicit message when it is unknown
    matching_ids = movie_df.index[movie_df['title'] == movie_title]
    if len(matching_ids) == 0:
        raise IndexError(f"No movie titled {movie_title!r} found in movie_df")
    target_movie_index = int(matching_ids[0])

    # Check the ratings matrix up front, before the slow row-wise genre correlation
    if target_movie_index not in df_pivot.columns:
        raise KeyError(
            f"Movie {movie_title!r} (movieId={target_movie_index}) has no ratings in df_pivot"
        )

    # Select target movie genres
    target_genres = movie_df_num.loc[target_movie_index]

    # Calculate correlation score of target movie genres against every movie
    correlations = movie_df_num.apply(lambda row: row.corr(target_genres), axis=1)

    # Genre scores with titles, genres and year, best first.
    # dropna removes movies whose score is undefined (NaN) or whose title/genres are missing.
    df_genre = (
        correlations.to_frame('PearsonG')
        .join(movie_df[['title', 'genres', 'year']])
        .sort_values('PearsonG', ascending=False)
        .dropna()
    )

    # Calculate correlation score with movie ratings.
    # NOTE(review): no minimum number of common raters is enforced here, so a
    # handful of shared ratings may be enough to score 1.0 - confirm this is acceptable.
    target = df_pivot[target_movie_index]
    corr_target = (
        df_pivot.corrwith(target)
        .to_frame('PearsonR')
        .dropna()
        .sort_values('PearsonR', ascending=False)
    )

    # Cast index in int type so that it lines up with the movieId index of df_genre
    corr_target.index = corr_target.index.map(int)

    # Join genres and ratings correlation DataFrames (only movies having both scores)
    df_combined = corr_target.join(df_genre, how='inner')

    # Exclude the target movie from recommendations
    df_combined = df_combined[df_combined.index != target_movie_index]

    # Sort by descending PearsonR, then descending PearsonG
    df_combined = df_combined.sort_values(['PearsonR', 'PearsonG'], ascending=False)

    # Display results
    print("Because you watch this movie :", movie_title)
    print(f"Our Top{top_n} Movies Selection for you !")
    print(df_combined.head(top_n))

# Call function
#matching_genres(matching_title)


#### Call it randomly with a function to generate a recommendation
To avoid mistyping a title, and because we can recommend based on any movie in the catalog, we have coded this random chooser. Just run it and it chooses a movie for you.

In [40]:
import random

def random_movie_recommendation():
    """Pick a random movie title and print the recommendations for it.

    The title is drawn only from movies that ``matching_genres`` can process:
    titles that are not missing and whose movie has a ratings column in ``df_pivot``.

    Raises
    ------
    IndexError
        If no movie satisfies these conditions.
    """
    # Missing titles can never be matched by an equality test, so leave them out
    titles = movie_df['title'].dropna()

    # A title is looked up by its first occurrence, so judge each title by that row only
    titles = titles[~titles.duplicated(keep='first')]

    # Keep the titles whose movieId is a column of the ratings matrix
    candidate_titles = titles[titles.index.isin(df_pivot.columns)].tolist()
    if not candidate_titles:
        raise IndexError("No movie in movie_df has ratings in df_pivot")

    # Randomly select a movie title among the candidates
    random_title = random.choice(candidate_titles)

    # Print the selected title
    print(f"Randomly selected movie: '{random_title}'")

    # Call the matching_genres function with the randomly selected title
    matching_genres(random_title)

# Call the function to get a random movie recommendation
random_movie_recommendation()


Randomly selected movie: 'Octane'
Because you watch this movie : Octane
Our Top10 Movies Selection for you !
         PearsonR  PearsonG                        title           genres  \
movieId                                                                     
68923         1.0       1.0                    Catacombs  Horror|Thriller   
67898         1.0       1.0            Gravedancers, The  Horror|Thriller   
67356         1.0       1.0                 Insanitarium  Horror|Thriller   
70227         1.0       1.0                    Dark Ride  Horror|Thriller   
69974         1.0       1.0          Shiver (Eskalofrío)  Horror|Thriller   
66140         1.0       1.0                     Blackout  Horror|Thriller   
75351         1.0       1.0           Night Stalker, The  Horror|Thriller   
74752         1.0       1.0  Cabin Fever 2: Spring Fever  Horror|Thriller   
76860         1.0       1.0                    Razorback  Horror|Thriller   
4358          1.0       1.0                 

#### Call it manually to generate recommendation

In [41]:
# Recommend from a hand-picked title; it is compared for exact equality with movie_df['title']
matching_genres('Jersey Girl')

Because you watch this movie : Jersey Girl
Our Top10 Movies Selection for you !
         PearsonR  PearsonG                                   title  \
movieId                                                               
39941         1.0  1.000000    Love on the Run (Amour en fuite, L')   
70661         1.0  1.000000           Tyler Perry's Meet the Browns   
61048         1.0  1.000000                                 Expired   
8671          1.0  1.000000                                Zus & Zo   
74649         1.0  1.000000                           Shades of Ray   
5271          1.0  1.000000                        30 Years to Life   
69466         1.0  1.000000            Alibi, The (Lies and Alibis)   
26868         1.0  0.792118  Lucky Break (a.k.a. Paperback Romance)   
87103         1.0  0.792118                        Vieraalla maalla   
92867         1.0  0.792118                   Truth About Love, The   

                       genres    year  
movieId                    