Import Library

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from google.colab import drive

Giant Function of MF

In [30]:
def mf_giantfunction(userid,test_data):
    # create a user movie rating matrix
    def data_preprocessing_matrix(data_name):
        data = data_name.drop_duplicates()
        data = data.reset_index(drop=True)
        data = data.dropna()
        if(len(data)>1000):
            data = data.iloc[:1000]
        user_df = data.pivot(index='userId', columns='imdb_id', values='rating').fillna(0)
        user_data = pd.DataFrame(user_df)
        return user_data
    def get_rmse(A, U, V, non_zeros):
        #Make full predict matrix use P and Q.T
        full_pred_matrix = np.dot(U, V.T)#Convert v matrix to transposition matrix for full matrix
        #Save users who evaluated movies stored in y_non_zero
        x_non_zero = [non_zero[0] for non_zero in non_zeros]
        #Store movies that have already been rated by users stored in x_non_zero
        y_non_zero = [non_zero[1] for non_zero in non_zeros]
        #Save Real Rating score
        A_non_zeros = A[x_non_zero, y_non_zero]
        #Make predictive metrics using information from x_non_zero and y_non_zero
        full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero, y_non_zero]

        #Calculate mse score and rmse score
        mse = mean_squared_error(A_non_zeros, full_pred_matrix_non_zeros)
        rmse = np.sqrt(mse)
        return rmse
    def matrix_factorization(A): 
        R = A.values
        #Get the actual size of the training dataset
        num_users, num_movies = R.shape

        #Hyperparameter of Matrix factorization and SGD
        K=50
        steps=200
        learning_rate=0.01 
        r_lambda = 0.01

        np.random.seed(42)
        u = np.random.normal(scale=1./K, size=(num_users, K)) #Create virtual random matrix with the size of user and the value of k 
        v = np.random.normal(scale=1./K, size=(num_movies, K)) #Create virtual random matrix with the size of movie and the value of k 
        
        #Save the part of the training dataset that has already been evaluated to a list
        non_zeros = [ (i, j, R[i,j]) for i in range(num_users) for j in range(num_movies) if R[i,j] > 0 ]
  
        #Update P & Q matrix used SGD
        for step in range(steps):
            for i, j, r in non_zeros:
                # Calculate error
                err = r - np.dot(u[i, :], v[j, :].T)
                # Update by applying SGD
                u[i,:] = u[i,:] + learning_rate*(err * v[j, :] - r_lambda*u[i,:])
                v[j,:] = v[j,:] + learning_rate*(err * u[i, :] - r_lambda*v[j,:])
            #Calculate RMSE
            rmse = get_rmse(R, u, v, non_zeros)
            if(step==steps):
                print("### iteration step : ", step," rmse : ", rmse)
        return u, v
    
    def get_unseen_list(ratings_matrix, userId):
        user_rating = ratings_matrix.loc[userId,:]
        seen_movie = user_rating[ user_rating > 0].index.tolist()
        movies_list = ratings_matrix.columns.tolist()
        unseen_movie = [ movie for movie in movies_list if movie not in seen_movie]
        return unseen_movie
    

    #train model
    data = all_movie_df[['userId','title','imdb_id','rating']]
    traindf = data_preprocessing_matrix(data)
    u,v = matrix_factorization(traindf)
    pred_matrix = np.dot(u,v.T)#Convert v matrix to transposition matrix for full matrix
    ratings_pred_matrix = pd.DataFrame(data=pred_matrix, index= traindf.index,columns = traindf.columns)
    #test model
    testdf = data_preprocessing_matrix(test_data)
    u_tt,v_tt = matrix_factorization(testdf)
    pred_matrix_tt = np.dot(u_tt,v_tt.T)#Convert v matrix to transposition matrix for full matrix
    ratings_pred_matrix_tt = pd.DataFrame(data=pred_matrix_tt, index= testdf.index,columns = testdf.columns)
    mse = mean_squared_error(testdf, ratings_pred_matrix_tt)
    rmse = np.sqrt(mse)
    print("The RMSE Score is : " + str(rmse))
    #Recommendation 
    unseen_list = get_unseen_list(traindf, userid)
    recomm_movies = ratings_pred_matrix.loc[userid, unseen_list].sort_values(ascending=False)[:5]
    recomm_movies = pd.DataFrame(data=recomm_movies.values,index=recomm_movies.index,columns=['pred_score'])
    return recomm_movies

    #unseen_list = get_unseen_list(testdf, userid)
    #recomm_movies_tt = ratings_pred_matrix_tt.loc[userid, unseen_list].sort_values(ascending=False)[:5]
    #recomm_movies_tt = pd.DataFrame(data=recomm_movies.values_tt,index=recomm_movies_tt.index,columns=['pred_score'])
    #return recomm_movies_tt

Load training dataset

In [7]:
ratings = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML Data/training_dt/ratings_small.csv')
movies_metadata = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML Data/training_dt/movies_metadata.csv')
credits = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML Data/training_dt/credits.csv')
keywords = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML Data/training_dt/keywords.csv')
links = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML Data/training_dt/links.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


Load train dataset

In [23]:
test = np.load('/content/drive/MyDrive/Colab Notebooks/ML Data/test_dt/Dataset.npy')
test = pd.DataFrame(test)
test = test.rename(columns={0:'userId'})
test[['userId', 'imdb_id', 'rating', 'date']] = pd.DataFrame(test.userId.str.split(',', 3).tolist())
test['rating'] = pd.to_numeric(test['rating'])

Rating data preprocessing

In [9]:
ratings.rename(columns={'movieId':'id'}, inplace = True)
ratings.loc['id'] = ratings['id'].astype('str')
ratings['id'] = pd.to_numeric(ratings['id'])

Metadata preprocessing

In [10]:
#나중에 최종결과 보여줄 때 movies_metadata_origin을 사용할 예정
movies_metadata_origin = movies_metadata
#계산에 불필요한 ft drop
movies_metadata.drop(['homepage','overview', 'tagline', 'poster_path'], axis = 1, inplace = True)
movies_metadata.drop(['vote_average','vote_count'], axis = 1, inplace = True)
movies_metadata.drop(['production_companies'], axis = 1, inplace = True)
#belongs_to_collection null 값을 no collection 값으로 변경
movies_metadata['belongs_to_collection'].replace(np.nan,'no collection', inplace = True)
#null값 없애기
movies_metadata.dropna(inplace = True)
#'genres', 'proudction_countries'가 null인 경우 drop
movies_metadata = movies_metadata.loc[movies_metadata_origin['genres'] != "[]"]
movies_metadata = movies_metadata.loc[movies_metadata_origin['production_countries'] != "[]"]
#movies_metadata에서 budget이 0 인 것 개수
len(movies_metadata.loc[movies_metadata['budget'] == '0']['budget'])
movies_metadata['budget'] = pd.to_numeric(movies_metadata['budget'])
#movies_metadata['budget'] 0인 것은 나머지의 평균으로 넣는다.
budget_mean = movies_metadata.loc[movies_metadata['budget'] != 0]['budget'].mean()
movies_metadata['budget'].replace(0, budget_mean, inplace = True)
#'id'형식이 날짜 형식인 경우 -> 잘못된 경우
movies_metadata['isIdRight'] = movies_metadata['id'].str.contains('|'.join('-'))
movies_metadata = movies_metadata[movies_metadata['isIdRight'] == False]
movies_metadata.drop(['isIdRight'], axis = 1, inplace = True)
#추후 merge를 위해 index id의 값들을 숫자형태로 변경
movies_metadata.loc['id'] = movies_metadata['id'].astype('str')
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'])

Credits preprocessing

In [11]:
#credits의 [] - null값 제거
credits = credits.loc[credits['cast'] != "[]"]
credits = credits.loc[credits['crew'] != "[]"]
#추후 merge를 위해 index id의 값들을 숫자형태로 변경
#credits['id'] = pd.to_numeric(credits['id'])

Link preprocessing

In [12]:
links = links.dropna()

Keyword preprocessing

In [13]:
keywords = keywords.loc[keywords['keywords'] != "[]"]

Merge training dataset

In [14]:
all_movie_df = pd.merge(movies_metadata, credits)
#print(all_movie_df.head())
all_movie_df = pd.merge(all_movie_df, ratings)
print(all_movie_df.columns)
#rating 별로 movie df가 합쳐진 것을 알 수 있음
#print(len(ratings[ratings['id'] == 949]))
#print(len(all_movie_df[all_movie_df['id'] == 949]))
#ratings['movield'] = pd.to_numeric(ratings['movield'])
#movies_metadata['id'] = pd.to_numeric(movies_metadata['id'])

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'id', 'imdb_id',
       'original_language', 'original_title', 'popularity',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'title', 'video', 'cast', 'crew',
       'userId', 'rating', 'timestamp'],
      dtype='object')


Main

In [31]:
user_id = int(input('order of user : '))
result = mf_giantfunction(user_id,test)
result

order of user : 18
The RMSE Score is : 0.9659131710526279


Unnamed: 0_level_0,pred_score
imdb_id,Unnamed: 1_level_1
tt0076759,3.531069
tt0110729,3.505566
tt0111495,3.4208
tt0114781,3.17383
tt0110638,3.143518
