In [3]:
# Import libraries
import pandas as pd
import numpy as np

In [9]:
# Load csv to a Dataframe
df = pd.read_csv('output_data\df_2000.csv')
df.head(5)

In [70]:
# Display number of rated movies
df['movieId'].nunique()

12530

#### Pivoting the DataFrame into a Factorization Matrix

In [72]:
# Splitting the DataFrame into 10 equal chunks to lighten Pandas work load
user_splits = np.array_split(df['userId'].unique(), 10)

# Create empty list
df_pivot_list = []

# For loop to pivot table
for split in user_splits:
    df_subset = df[df['userId'].isin(split)]
    df_pivot_subset = pd.pivot_table(df_subset, values='rating', index='userId', columns='movieId')
    df_pivot_list.append(df_pivot_subset)
    print("New split on duty !")

# Concatenate the 10 chunks
df_pivot = pd.concat(df_pivot_list)

print('Job Done: Factorization Matrix Ready')


New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
New split on duty !
Job Done: Factorization Matrix Ready


In [12]:
# Display matrix shape
df_pivot.shape

(87850, 12530)

In [13]:
# Show the first rows of the matrix
df_pivot.head(2)

movieId,2769,3177,3190,3225,3228,3239,3273,3275,3276,3279,...,129313,129340,129841,129905,130524,130622,131110,131172,131237,131262
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,


#### Recommendation with genres

In [14]:
# Load csv to a Dataframe
movie_df = pd.read_csv('output_data\\movie_2000.csv')

# Set 'movieId' as the index of the DataFrame
movie_df.set_index('movieId', inplace = True)
movie_df.head(2)

Unnamed: 0_level_0,title,genres,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2769,yards the,Crime|Drama,2000,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3177,next friday,Comedy,2000,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
#movie_df = movie_df.drop(columns=['movie_youth_rate'])
movie_df['year'] = movie_df['year'].astype(str)
movie_df.shape

(12729, 22)

#### Creating Dataframe with numeric types only columns

In [17]:
movie_df_num = movie_df.select_dtypes(include=[float, int])
movie_df_num.head(2)

#### Create Pearson Correlation Function

In [61]:
import warnings

# Ignore warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)


def matching_genres(movie_title):
    # Find index of target movie
    target_movie_index = int(movie_df.index[movie_df['title'] == movie_title][0])
    
    # Select target movie genres
    target_genres = movie_df_num.loc[target_movie_index]

    # Calculate correlation score of target movie genres
    correlations = movie_df_num.apply(lambda row: row.corr(target_genres), axis=1)
    
    # Create DataFrame with genres correlation scores
    df_genre = pd.DataFrame(correlations, columns=['PearsonG'])
    
    # Add titles, genres and year
    df_genre = df_genre.join(movie_df[['title', 'genres', 'year']])
    
    # Sort score in descending order
    df_genre = df_genre.sort_values('PearsonG', ascending=False)
    df_genre.dropna(inplace=True)

    # Calculate correlation score with movie ratings
    target = df_pivot[target_movie_index]
    similar_to_target = df_pivot.corrwith(target)
    
    # Create DataFrame with movie correlation scores
    corr_target = pd.DataFrame(similar_to_target, columns=['PearsonR'])
    
    # Drop NaN
    corr_target.dropna(inplace=True)

    # Sort score in descending order
    corr_target = corr_target.sort_values('PearsonR', ascending=False)
    
    # Cast index in int type
    corr_target.index = corr_target.index.map(int)
    
    # Join genres and ratings correlation DataFrames
    df_combined = corr_target.join(df_genre.set_index(df_genre.index), how='inner')
    
    # Sort descending PearsonR et PearsonG columns
    df_combined = df_combined.sort_values(['PearsonR', 'PearsonG'], ascending=False)
    
    # Display results
    print("Because you watch this movie :", movie_title)
    print("Our Top10 Selection of 2000's movies for you !")
    print(df_combined.head(10))#.to_string(index=False))

# Call function
#matching_genres(matching_title)


#### Call it randomly with a function to generate recommandation

In [73]:
import random

def random_movie_recommendation():
    # Randomly select a movie title from the movie_df['title'] column
    random_title = random.choice(movie_df['title'].tolist())
    
    # Print the selected title (optional)
    print(f"Randomly selected movie: '{random_title}'")
    
    # Call the matching_genres function with the randomly selected title
    matching_genres(random_title)

# Call the function to get a random movie recommendation
random_movie_recommendation()


Randomly selected movie: 'southland tales'
Because you watch this movie : southland tales
Our Top10 Selection of 2000's movies for you !
         PearsonR  PearsonG                                       title  \
movieId                                                                   
56003         1.0  1.000000                             southland tales   
95506         1.0  0.838525             extraterrestrial extraterrestre   
101498        1.0  0.683333                                in the house   
83244         1.0  0.664211  its not me i swear cest pas moi je le jure   
27468         1.0  0.664211                                    stranded   
57236         1.0  0.664211                                         lol   
87992         1.0  0.664211                                     mammuth   
87792         1.0  0.664211                                 route irish   
76082         1.0  0.664211                                  blackwoods   
119216        1.0  0.664211           

#### Call it manually to generate recommandation

In [75]:
matching_genres('mammuth')

Because you watch this movie : mammuth
Our Top10 Selection of 2000's movies for you !
         PearsonR  PearsonG                              title        genres  \
movieId                                                                        
65230         1.0       1.0                         marley  me  Comedy|Drama   
66509         1.0       1.0                       funny people  Comedy|Drama   
8366          1.0       1.0                              saved  Comedy|Drama   
7460          1.0       1.0              coffee and cigarettes  Comedy|Drama   
71466         1.0       1.0                        city island  Comedy|Drama   
5890          1.0       1.0                             elling  Comedy|Drama   
78116         1.0       1.0                        please give  Comedy|Drama   
6620          1.0       1.0                  american splendor  Comedy|Drama   
6339          1.0       1.0  man on the train homme du train l  Comedy|Drama   
55280         1.0       1.0       