In [None]:
# Created on Sep 2021
# author: 임일
# modifier : KatieMinjoo

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Read the pipe-delimited u.user file into the `users` dataframe,
# supplying the column names explicitly
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    './u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

In [5]:
# Read the pipe-delimited u.item file into the `movies` dataframe,
# supplying the column names explicitly
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s',
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv(
    './u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

In [6]:
# Keep only the movie ID and title columns; every other column is dropped
movies = movies.loc[:, ['movie_id', 'title']]

In [7]:
# Read the tab-delimited u.data file into the `ratings` dataframe,
# supplying the column names explicitly
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    './u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

In [8]:
# Delete timestamp.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and the frame is returned unchanged instead of raising KeyError.
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [9]:
# import the train_test_split func
from sklearn.model_selection import train_test_split

# x holds the full rating rows; y (the user id) is only used as the
# stratification label for the split below.
x = ratings.copy()
y = ratings['user_id']

# Split the data into train/test datasets.
# stratify=y keeps each user's share of rows roughly equal in both sets.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split, and therefore the same scores, on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=42
)

In [10]:
# Root-mean-square error between two sequences of ratings
import numpy as np
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between y_true and y_pred.

    Both arguments are converted with np.array before subtraction, so lists,
    Series and arrays are all accepted.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared = np.mean(residuals ** 2)
    return np.sqrt(mean_squared)

In [11]:
# Baseline model: predict the movie's mean training rating, or a default
# rating when the movie has no entry in train_mean
def baseline(user_id, movie_id, default_rating=3.0):
    """Predict a rating from the per-movie mean stored in `train_mean`.

    Parameters
    ----------
    user_id : unused; accepted so the function can be called as
        model(user_id, movie_id) like the other models.
    movie_id : key looked up in the module-level `train_mean`.
    default_rating : value returned when `movie_id` is not in `train_mean`.

    Returns
    -------
    train_mean[movie_id], or default_rating if that key is missing.
    """
    try:
        return train_mean[movie_id]
    except KeyError:
        # Only a missing movie falls back to the default. Any other failure
        # (e.g. train_mean not defined yet) must surface instead of silently
        # scoring every prediction as the default.
        return default_rating

In [12]:
# Evaluate a recommendation model by its RMSE on the held-out test set
def score(model):
    """Return the RMSE of `model` over every (user, movie) pair in x_test.

    `model` is called as model(user_id, movie_id) once per test row and must
    return the predicted rating for that pair.
    """
    # One prediction per test row, in row order
    predicted = []
    for user, movie in zip(x_test['user_id'], x_test['movie_id']):
        predicted.append(model(user, movie))

    # Ratings the users actually gave for those same rows
    actual = np.array(x_test['rating'])

    return RMSE(actual, np.array(predicted))

In [13]:
# Mean training rating per movie, indexed by movie_id (read by baseline)
train_mean = x_train.groupby('movie_id')['rating'].mean()

In [15]:
# baseline model score
score(baseline)

1.0211189634238362

In [16]:
# Pivot the training ratings into a user x movie matrix
# (rows: user_id, columns: movie_id, cells: rating, NaN where unrated)
rating_matrix = x_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating',
)
rating_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1675,1676,1677,1679,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,,5.0,4.0,1.0,,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,


In [19]:
# Merge user table: attach each rater's attributes to the training ratings.
# The join key is named explicitly instead of being inferred from the shared
# column names. `on` also resolves index level names, so this cell keeps
# working when it is re-run after `users` has been re-indexed by user_id.
merged_data = pd.merge(x_train, users, on='user_id')
merged_data.head()

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,152,88,5,33,F,educator,68767
1,152,69,5,33,F,educator,68767
2,152,120,2,33,F,educator,68767
3,152,80,5,33,F,educator,68767
4,152,402,5,33,F,educator,68767


### Gender

In [25]:
# Mean training rating of every movie, split by the rater's gender
# (result is indexed by movie_id, then sex)
gender_mean = merged_data.groupby(['movie_id', 'sex'])['rating'].mean()

In [27]:
gender_mean

movie_id  sex
1         F      3.717647
          M      3.928000
2         F      3.411765
          M      3.224490
3         F      2.500000
                   ...   
1676      M      2.000000
1677      F      3.000000
1679      M      3.000000
1681      M      3.000000
1682      M      3.000000
Name: rating, Length: 3039, dtype: float64

In [26]:
# Set user_id as the index so a user's attributes can be read with
# users.loc[user_id].
# Guarded so that re-running this cell is a no-op: once user_id has moved into
# the index, an unguarded set_index('user_id') raises KeyError.
if 'user_id' in users.columns:
    users = users.set_index('user_id')

In [31]:
gender_mean[1]['F']

3.7176470588235295

In [32]:
# Recommendation model for specific gender
def cf_gender(user_id, movie_id, default_rating=3.0):
    """Predict a rating as the movie's mean training rating among the user's gender.

    Parameters
    ----------
    user_id : label looked up in the `users` index (users must already be
        indexed by user_id).
    movie_id : movie to predict for.
    default_rating : value returned when no group mean is available.

    Returns
    -------
    The entry of `gender_mean` for (movie_id, user's sex), or default_rating
    when the movie is not a column of `rating_matrix` (cold start) or when
    `gender_mean` has no entry for that movie and gender.
    """
    # Cold start: the movie does not appear in the training matrix
    if movie_id not in rating_matrix:
        return default_rating

    # Single-step .loc reads one cell instead of materialising the whole
    # user row and indexing it a second time
    gender = users.loc[user_id, 'sex']

    # One MultiIndex lookup replaces the membership test plus second slice;
    # .get returns the default when the (movie, gender) pair is absent
    return gender_mean.get((movie_id, gender), default_rating)

In [33]:
score(cf_gender)

1.0321793144752165

### Occupation

In [34]:
# Mean training rating of every movie, split by the rater's occupation
# (result is indexed by movie_id, then occupation)
occupation_mean = merged_data.groupby(['movie_id', 'occupation'])['rating'].mean()

In [35]:
def cf_occupation(user_id, movie_id, default_rating=3.0):
    """Predict a rating as the movie's mean training rating among the user's occupation.

    Parameters
    ----------
    user_id : label looked up in the `users` index (users must already be
        indexed by user_id).
    movie_id : movie to predict for.
    default_rating : value returned when no group mean is available.

    Returns
    -------
    The entry of `occupation_mean` for (movie_id, user's occupation), or
    default_rating when the movie is not a column of `rating_matrix`
    (cold start) or when `occupation_mean` has no entry for that movie and
    occupation.
    """
    # Cold start: the movie does not appear in the training matrix
    if movie_id not in rating_matrix:
        return default_rating

    # Single-step .loc reads one cell instead of materialising the whole
    # user row and indexing it a second time
    occupation = users.loc[user_id, 'occupation']

    # One MultiIndex lookup replaces the membership test plus second slice;
    # .get returns the default when the (movie, occupation) pair is absent
    return occupation_mean.get((movie_id, occupation), default_rating)

In [36]:
score(cf_occupation)

1.119225091770368