Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from google.colab import drive

Matrix factorization (MF): all-in-one pipeline function

In [None]:
def mf_giantfunction(userid,test_data):
    """Run the whole matrix-factorization pipeline and return the top five movies for one user.

    The function builds a user x movie rating matrix from the notebook-level
    ``all_movie_df``, factorizes it with SGD, and ranks the movies the user has
    not rated by predicted score. A second, independent factorization is fitted
    on ``test_data`` and its RMSE on the observed test ratings is printed.

    Relies on two notebook-level globals that must exist before the call:
    ``all_movie_df`` (needs 'userId', 'title', 'imdb_id', 'rating' columns) and
    ``movies_metadata_title`` (merged on 'imdb_id' to attach the movie title).

    Parameters
    ----------
    userid : int or float
        Label of the user in the 'userId' index of the training matrix.
    test_data : pandas.DataFrame
        Ratings with at least 'userId', 'imdb_id' and 'rating' columns.

    Returns
    -------
    pandas.DataFrame
        Up to five rows with the predicted score ('pred_score') plus the
        columns merged in from ``movies_metadata_title``.

    Raises
    ------
    KeyError
        If ``userid`` is not present in the training rating matrix.
    """
    def data_preprocessing_matrix(data_name):
        """Turn a long ratings table into a userId x imdb_id matrix (unrated cells = 0)."""
        data = data_name.drop_duplicates() #Drop exact duplicate rows
        data = data.reset_index(drop=True) #Reset the index after dropping rows
        data = data.dropna() #Drop rows that contain any NaN
        #pivot raises if the same (userId, imdb_id) pair still appears more than once
        return data.pivot(index='userId', columns='imdb_id', values='rating').fillna(0)

    def get_rmse(A, U, V, non_zeros):
        """Return the RMSE between A and U @ V.T, evaluated only at the (row, col) pairs in non_zeros."""
        full_pred_matrix = np.dot(U, V.T)
        rows = [entry[0] for entry in non_zeros] #User positions of the observed ratings
        cols = [entry[1] for entry in non_zeros] #Movie positions of the observed ratings
        mse = mean_squared_error(A[rows, cols], full_pred_matrix[rows, cols])
        return np.sqrt(mse)

    #Ref: https://big-dream-world.tistory.com/69
    def matrix_factorization(A):
        """Factorize the rating matrix A (DataFrame) with SGD and return the factor arrays (u, v)."""
        R = A.values
        num_users, num_movies = R.shape

        #Hyperparameters of the factorization and of SGD
        K = 100 #Number of latent factors
        steps = 400 #Number of passes over the observed ratings
        learning_rate = 0.001
        r_lambda = 0.01 #L2 regularization strength

        np.random.seed(42)
        u = np.random.normal(scale=1./K, size=(num_users, K)) #User factors
        v = np.random.normal(scale=1./K, size=(num_movies, K)) #Movie factors

        #Observed (rated) cells as (row, col, rating); np.nonzero scans in row-major
        #order, i.e. the same order as a nested "for i ... for j ..." loop
        rows, cols = np.nonzero(R > 0)
        non_zeros = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

        for step in range(steps):
            for i, j, r in non_zeros:
                #Prediction error for this rating
                err = r - np.dot(u[i, :], v[j, :].T)
                #SGD update; v is updated with the freshly updated u row
                u[i,:] = u[i,:] + learning_rate*(err * v[j, :] - r_lambda*u[i,:])
                v[j,:] = v[j,:] + learning_rate*(err * u[i, :] - r_lambda*v[j,:])

        #Report the RMSE once, after the last step. The previous check
        #"step == steps" could never be true inside range(steps), so nothing was
        #ever printed even though the RMSE was recomputed on every step.
        if non_zeros:
            rmse = get_rmse(R, u, v, non_zeros)
            print("### Final step is finish, The rmse Score : ", rmse)
        return u, v

    def get_unseen_list(ratings_matrix, user_label):
        """Return the movie columns that the given user has no positive rating for, in column order."""
        user_rating = ratings_matrix.loc[user_label, :] #Ratings row of this user
        seen_movies = set(user_rating[user_rating > 0].index) #Set gives O(1) membership tests
        return [movie for movie in ratings_matrix.columns.tolist() if movie not in seen_movies]

    #Train model
    data = all_movie_df[['userId','title','imdb_id','rating']]
    traindf = data_preprocessing_matrix(data) #Make matrix
    u, v = matrix_factorization(traindf) #Matrix factorization
    ratings_pred_matrix = pd.DataFrame(data=np.dot(u, v.T), index=traindf.index, columns=traindf.columns)

    #Test model: a separate factorization is fitted on the test ratings
    testdf = data_preprocessing_matrix(test_data) #Make matrix
    u_tt, v_tt = matrix_factorization(testdf) #Matrix factorization
    pred_matrix_tt = np.dot(u_tt, v_tt.T)

    #Score only the cells that hold a real rating. Unrated cells are 0 placeholders
    #from fillna(0); counting them as true ratings (as the full-matrix comparison
    #did) makes the score meaningless.
    observed = testdf.values > 0
    mse = mean_squared_error(testdf.values[observed], pred_matrix_tt[observed])
    rmse = np.sqrt(mse)
    print("The RMSE Score evaluated using test dataset is : " + str(rmse))

    #Recommendation
    unseen_list = get_unseen_list(traindf, userid)
    recomm_movies = ratings_pred_matrix.loc[userid, unseen_list].sort_values(ascending=False)[:5] #Top five predicted scores
    recomm_movies = pd.DataFrame(data=recomm_movies.values,index=recomm_movies.index,columns=['pred_score'])
    recomm_movies = pd.merge(recomm_movies, movies_metadata_title, on = 'imdb_id', how = 'left') #Attach the movie title
    return recomm_movies

Load training dataset
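- Only `ratings` and `movies_metadata` are strictly required; `credits` is also loaded because it is merged into the training table later.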

In [None]:
# Single place to change when the training data lives somewhere else
TRAIN_DIR = '/content/drive/MyDrive/Colab Notebooks/ML Data/training_dt'

ratings = pd.read_csv(f'{TRAIN_DIR}/ratings_small.csv')
movies_metadata = pd.read_csv(f'{TRAIN_DIR}/movies_metadata.csv')
credits = pd.read_csv(f'{TRAIN_DIR}/credits.csv')

Load test dataset

In [None]:
# Number of test rows kept for evaluation
N_TEST_ROWS = 1000

# Each record is one comma-separated string: userId,imdb_id,rating,date
raw_test = np.load('/content/drive/MyDrive/Colab Notebooks/ML Data/test_dt/Dataset.npy')
# Truncate before parsing so only the rows that are kept get split and converted
test = pd.DataFrame(raw_test[:N_TEST_ROWS]).rename(columns={0:'userId'})
# `n` must be passed by keyword: positional use is rejected by pandas 2.x.
# expand=True returns the four parts as columns directly.
test[['userId', 'imdb_id', 'rating', 'date']] = test['userId'].str.split(',', n=3, expand=True)
test['rating'] = pd.to_numeric(test['rating'])

Rating data preprocessing

In [None]:
# Rename movieId to id so that ratings can be merged with the other tables on 'id'
ratings = ratings.rename(columns={'movieId':'id'})
# Convert the 'id' COLUMN to numeric. The previous `ratings.loc['id'] = ...`
# addressed a row label, so it appended a stray row labelled 'id' instead of
# touching the column.
ratings['id'] = pd.to_numeric(ratings['id'])

Credit data preprocessing

In [None]:
# Remove rows whose cast or crew is the empty-list literal "[]" (treated as null)
has_cast = credits['cast'] != "[]"
has_crew = credits['crew'] != "[]"
# .copy() makes the filtered frame independent, so the assignment below does not
# write into a slice of the original (SettingWithCopyWarning)
credits = credits.loc[has_cast & has_crew].copy()
# Convert the id key to a numeric type for the later merge
credits['id'] = pd.to_numeric(credits['id'])

Metadata preprocessing

In [None]:
# Drop features that are not needed for the computation.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
movies_metadata = movies_metadata.drop(
    columns=['homepage', 'overview', 'tagline', 'poster_path',
             'vote_average', 'vote_count', 'production_companies'],
    errors='ignore',
)
# Replace null belongs_to_collection values with the label 'no collection'
# (assigned back instead of an in-place replace on a column selection)
movies_metadata['belongs_to_collection'] = movies_metadata['belongs_to_collection'].replace(np.nan, 'no collection')
# Remove every row that still contains a null value
movies_metadata = movies_metadata.dropna()
# Drop rows whose 'genres' or 'production_countries' is the empty-list literal "[]".
# .copy() makes the result independent so the column assignments below are safe.
has_genres = movies_metadata['genres'] != "[]"
has_countries = movies_metadata['production_countries'] != "[]"
movies_metadata = movies_metadata.loc[has_genres & has_countries].copy()
# Number of movies whose budget is the string '0' (kept in a variable; the bare
# expression in the middle of the cell was computed and thrown away)
zero_budget_count = int((movies_metadata['budget'] == '0').sum())
movies_metadata['budget'] = pd.to_numeric(movies_metadata['budget'])
# Budgets equal to 0 are replaced by the mean of the non-zero budgets
budget_mean = movies_metadata.loc[movies_metadata['budget'] != 0, 'budget'].mean()
movies_metadata['budget'] = movies_metadata['budget'].replace(0, budget_mean)
# An 'id' that contains '-' is a date-like value, i.e. a malformed row: drop it
id_as_text = movies_metadata['id'].astype(str)
movies_metadata = movies_metadata.loc[~id_as_text.str.contains('-', regex=False)].copy()
# Convert the 'id' COLUMN to numeric for the later merge. The previous
# `movies_metadata.loc['id'] = ...` addressed a row label and appended a stray
# row labelled 'id' instead of converting the column.
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'])
# Lookup table used to attach a title to each recommended imdb_id
movies_metadata_title = movies_metadata[["imdb_id", "title"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Merge training dataset

In [None]:
# Join metadata with credits, then with ratings. No join key is given, so each
# merge uses the columns the two frames have in common.
# NOTE(review): ratings 'id' was renamed from movieId - confirm it uses the same
# id space as movies_metadata 'id'.
all_movie_df = movies_metadata.merge(credits).merge(ratings)

Main

In [None]:
# Ask for the user to recommend for; the value is converted to float before it
# is used as a label in the rating matrix
raw_user_choice = input('order of user : ')
user_id = float(raw_user_choice)
result = mf_giantfunction(user_id, test)
result

order of user : 1
The RMSE Score evaluated using test dataset is : 0.4341623716571369


Unnamed: 0,imdb_id,pred_score,title
0,tt1642665,3.484363,Urban Explorer
1,tt1865335,3.395263,Confession of a Child of the Century
2,tt0018737,3.227056,Pandora's Box
3,tt0101393,3.222429,Backdraft
4,tt0045810,3.213671,Gentlemen Prefer Blondes
