Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from google.colab import drive

Load training dataset

In [None]:
#Mount Google Drive so the CSV files stored there can be read from Colab
drive.mount('/content/drive')

#Training data: user ratings, movie metadata, credits (cast/crew) and keywords
ratings = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/ML Data/training_dt/ratings_small.csv'
)
#low_memory=False makes pandas read the metadata file in one pass
movies_metadata = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/ML Data/training_dt/movies_metadata.csv',
    low_memory=False,
)
credits = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/ML Data/training_dt/credits.csv'
)
keywords = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/ML Data/training_dt/keywords.csv'
)

Load test dataset

In [None]:
#load test dataset
#The first column of the loaded array is split on ',' into
#userId / imdb_id / rating / rating_date columns
test = (
    pd.DataFrame(np.load('/content/drive/MyDrive/Colab Notebooks/ML Data/test_dt/Dataset.npy'))[0]
    .str.split(",", expand=True)
    .rename(columns={0:'userId', 1:"imdb_id", 2:"rating", 3:'rating_date'})
)

## Preprocessing

### Training dataset preprocessing

Rating preprocessing

In [None]:
#rename 'movieId' to 'id' so that ratings share their key column name with the
#other tables and can be merged with them later
ratings.rename(columns={'movieId':'id'}, inplace = True)
#make sure the merge key is numeric.
#NOTE: assigning to ratings.loc['id'] would append a new ROW labelled 'id'
#(filled with NaN) instead of converting the column, so the column is
#assigned directly.
ratings['id'] = pd.to_numeric(ratings['id'])

Movies_metadata preprocessing

In [None]:
#drop unnecessary features
movies_metadata = movies_metadata.drop(
    columns=['homepage', 'tagline', 'poster_path',
             'vote_average', 'vote_count',
             'production_companies'])

#change belongs_to_collection null value to 'no collection' so that the
#dropna() below does not discard movies that are simply not part of a collection
#(assigned back to the column instead of an inplace call on a selection)
movies_metadata['belongs_to_collection'] = movies_metadata['belongs_to_collection'].fillna('no collection')

#remove rows that still contain a null value
movies_metadata = movies_metadata.dropna()

#drop rows whose 'genres' or 'production_countries' list is empty
movies_metadata = movies_metadata.loc[movies_metadata['genres'] != "[]"]
movies_metadata = movies_metadata.loc[movies_metadata['production_countries'] != "[]"]

#if 'id' is in date format (contains '-') -> invalid row
#.copy() gives an independent frame, so the column assignments below do not
#write into a slice of the unfiltered data
movies_metadata = movies_metadata.loc[
    ~movies_metadata['id'].astype(str).str.contains('-', regex=False)
].copy()

#replace missing overviews with an empty string
#(safeguard only: dropna() above has already removed rows with a null overview)
movies_metadata['overview'] = movies_metadata['overview'].fillna('')

#change the values of column 'id' to numeric form for future merge.
#NOTE: assigning to movies_metadata.loc['id'] would append an all-NaN ROW
#labelled 'id' instead of converting the column, so the column is assigned directly.
movies_metadata['id'] = pd.to_numeric(movies_metadata['id'])
#imdb_id -> title lookup table
movies_metadata_title = movies_metadata[["imdb_id", "title"]]

Credit preprocessing

In [None]:
#keep only the movies that have both a non-empty cast list and a non-empty crew list
credits = credits.loc[(credits['cast'] != "[]") & (credits['crew'] != "[]")]
#convert the 'id' column to numeric form for the later merges
credits['id'] = pd.to_numeric(credits['id'])

Keyword preprocessing

In [None]:
#cast the 'id' column to integer so it can be used as a merge key
keywords['id'] = keywords['id'].astype(int)

### Test dataset preprocessing

In [None]:
#ratings were produced by a string split, so convert them to numbers
test['rating'] = pd.to_numeric(test['rating'])

#keep only the test ratings whose movie exists in the cleaned metadata:
#left-merge on imdb_id, drop every row that ended up with a missing value,
#then reduce to the three columns used by the recommenders.
#we will use just the first 1000 rows for a fast test
test = (
    test.merge(movies_metadata, on="imdb_id", how="left")
    .dropna()
    .loc[:, ['userId', 'imdb_id', 'rating']]
    .iloc[:1000]
)

## Recommendation system

In [None]:
#Entry point of the movie recommendation system
#Runs the matrix-factorization recommender for a user id read from the console,
#then the content-based recommender (evaluation on the test data + interactive query)
#Input : ratings, movies_metadata, credits, keywords, test (DataFrames)
#Output : nothing is returned; the recommendations and scores are printed
def recommendataion_system(ratings, movies_metadata, credits, keywords, test):
    #Collaborative filtering (matrix factorization):
    #metadata + credits + ratings merged on their common columns
    merged_movies = pd.merge(pd.merge(movies_metadata, credits), ratings)

    user_id = float(input('order of user : '))
    mf_result = mf_giantfunction(user_id, test, merged_movies)
    print('Collaborative matrix factorization recommendation result--------------------')
    print(mf_result[['title', 'imdb_id']])

    #Content-based recommendation: evaluation on the test data first ...
    RMSE, accuracy = content_based_recommendation_measure(keywords, credits, movies_metadata, test)
    print('content-based recommendation result-------------------')
    print('RMSE : ',round(RMSE, 2),'accuracy',round(accuracy, 2))
    #... then recommendations for the terms typed by the user
    print(content_based_recommendation_user(keywords, credits, movies_metadata))
    

Functions for Collaborative Filtering recommendation

In [None]:
#Function that runs the whole matrix-factorization pipeline: builds the
#user x movie rating matrix, factorizes it with SGD, reports the RMSE of a
#factorization of the test data and returns the best unseen movies for one user
#Input : userid (label of the user in the 'userId' column of all_movie_df)
#        test_data (DataFrame with 'userId', 'imdb_id', 'rating' columns)
#        all_movie_df (DataFrame with 'userId', 'title', 'imdb_id', 'rating' columns)
#Output : recomm_movies (DataFrame with 'imdb_id', 'pred_score', 'title' columns,
#         at most five rows, highest predicted score first)
#Raises : KeyError when userid has no row in the training rating matrix
def mf_giantfunction(userid,test_data,all_movie_df):
    #Function that creates the user x movie rating matrix from a dataset
    #Input : data_name(Dataframe with 'userId', 'imdb_id', 'rating')
    #Output : Dataframe indexed by userId, one column per imdb_id, 0 = not rated
    def data_preprocessing_matrix(data_name):
        #Drop duplicated rows, renumber the index, then drop rows with NaN
        data = data_name.drop_duplicates().reset_index(drop=True).dropna()
        return data.pivot(index='userId', columns='imdb_id', values='rating').fillna(0)

    #Function that calculates the RMSE between the real ratings and U.V^T,
    #restricted to the cells listed in non_zeros
    #Input : A(full rating matrix) / U & V(factor matrices) / non_zeros(list of (row, col, rating))
    #Output : rmse(float)
    def get_rmse(A, U, V, non_zeros):
        rows = [entry[0] for entry in non_zeros]
        cols = [entry[1] for entry in non_zeros]
        predicted = np.dot(U, V.T)[rows, cols]
        return np.sqrt(np.mean((A[rows, cols] - predicted) ** 2))

    #Function that performs matrix factorization using SGD
    #Input : A(Dataframe)
    #Output : U & V(Numpy Array)
    #Ref: https://big-dream-world.tistory.com/69
    def matrix_factorization(A):
        R = A.values
        #Get the actual size of the dataset
        num_users, num_movies = R.shape

        #Hyperparameters of matrix factorization and SGD
        K = 100               #number of latent factors
        steps = 400           #number of passes over the rated cells
        learning_rate = 0.001
        r_lambda = 0.01       #regularization strength

        #Fixed seed so that the factor initialization is reproducible
        np.random.seed(42)
        u = np.random.normal(scale=1./K, size=(num_users, K))
        v = np.random.normal(scale=1./K, size=(num_movies, K))

        #Cells that have actually been rated, in row-major order
        rated_rows, rated_cols = np.nonzero(R > 0)
        non_zeros = [(i, j, R[i, j]) for i, j in zip(rated_rows, rated_cols)]

        #Update the factor matrices with SGD
        for step in range(steps):
            for i, j, r in non_zeros:
                #Prediction error for this rating
                err = r - np.dot(u[i, :], v[j, :].T)
                #Gradient step with L2 regularization
                u[i,:] = u[i,:] + learning_rate*(err * v[j, :] - r_lambda*u[i,:])
                v[j,:] = v[j,:] + learning_rate*(err * u[i, :] - r_lambda*v[j,:])

        #The RMSE is only reported once training is over, so it is computed
        #once here instead of after every step
        if non_zeros:
            rmse = get_rmse(R, u, v, non_zeros)
            print("### Final step is finish, The rmse Score : ", round(rmse,2))
        return u, v

    #Function that lists the movies a user has not rated yet
    #Input: ratings_matrix(Dataframe) / user_key(row label of the user)
    #Output: unseen_movie(List of column labels)
    def get_unseen_list(ratings_matrix, user_key):
        user_rating = ratings_matrix.loc[user_key, :]
        #A value greater than 0 means the movie has already been rated
        seen_movie = set(user_rating[user_rating > 0].index)
        return [movie for movie in ratings_matrix.columns if movie not in seen_movie]

    #train model
    data = all_movie_df[['userId','title','imdb_id','rating']]
    traindf = data_preprocessing_matrix(data) #Make matrix
    #Fail before the (slow) training when the user cannot be served anyway
    if userid not in traindf.index:
        raise KeyError('user {!r} has no ratings in the training data'.format(userid))
    u,v = matrix_factorization(traindf) #Matrix factorization
    ratings_pred_matrix = pd.DataFrame(data=np.dot(u,v.T), index=traindf.index, columns=traindf.columns)

    #test model
    #NOTE(review): a separate model is fitted on the test data itself and scored
    #on that same data -- confirm this is the intended evaluation protocol
    testdf = data_preprocessing_matrix(test_data) #Make matrix
    u_tt,v_tt = matrix_factorization(testdf) #Matrix factorization
    pred_matrix_tt = np.dot(u_tt,v_tt.T)
    #Score only the cells that hold a real rating: the zeros of the matrix
    #mean "not rated" and must not be treated as ratings of 0
    observed = testdf.values > 0
    if observed.any():
        rmse = np.sqrt(np.mean((testdf.values[observed] - pred_matrix_tt[observed]) ** 2))
    else:
        rmse = float('nan')
    print("The RMSE Score evaluated using test dataset is : " + str(rmse))

    #Recommendation: the five unseen movies with the highest predicted score
    unseen_list = get_unseen_list(traindf, userid)
    top_scores = ratings_pred_matrix.loc[userid, unseen_list].sort_values(ascending=False)[:5]
    recomm_movies = pd.DataFrame({'imdb_id': top_scores.index, 'pred_score': top_scores.values})
    #Titles come from the frame passed in (one title per imdb_id), so the
    #function does not depend on a global lookup table
    titles = all_movie_df[['imdb_id', 'title']].drop_duplicates(subset='imdb_id')
    recomm_movies = pd.merge(recomm_movies, titles, on = 'imdb_id', how = 'left')

    return recomm_movies

Functions for content-based recommendation system

In [None]:
#Function that evaluates the content-based recommender on the test data:
#for every user with at least 6 ratings, the best rated movie is used as the
#query and the next five best rated movies are the expected answers
#Input : keywords, credits, movies_metadata, test_dt (DataFrames)
#Output : mean RMSE and mean accuracy over the evaluated users
#         (nan, nan) when no user has at least 6 ratings
def content_based_recommendation_measure(keywords, credits, movies_metadata, test_dt):
    from ast import literal_eval

    #keep only the users that rated at least 6 movies (1 query + 5 answers)
    rating_counts = test_dt['userId'].value_counts()
    eligible_users = rating_counts[rating_counts >= 6].index
    eligible = test_dt.loc[test_dt['userId'].isin(eligible_users), ['userId','imdb_id','rating']]

    result_top5 = {} #userId -> that user's 6 best rated movies
    for user, user_ratings in eligible.groupby('userId'):
        #assign() returns a new frame, so the grouped data is not modified
        ranked = user_ratings.assign(rating = user_ratings['rating'].astype(int))
        result_top5[user] = ranked.sort_values(by = 'rating', ascending = False).head(6)#5+1

    #content based dataset: metadata + credits + keywords joined on 'id'
    cb_trained_dt = movies_metadata.merge(credits, on='id').merge(keywords, on='id')

    #Parse the stringified features into their corresponding python objects
    for feature in ['cast', 'crew', 'keywords', 'genres']:
        cb_trained_dt[feature] = cb_trained_dt[feature].apply(literal_eval)

    cb_trained_dt['director'] = cb_trained_dt['crew'].apply(get_director)

    #Reduce the lists of dicts to lists of names
    for feature in ['cast', 'keywords', 'genres']:
        cb_trained_dt[feature] = cb_trained_dt[feature].apply(get_list)

    #Lower-case the names and remove the blanks inside them
    for feature in ['cast', 'keywords', 'director', 'genres']:
        cb_trained_dt[feature] = cb_trained_dt[feature].apply(clean_data)

    #One text per movie, used to measure the cosine similarity
    cb_trained_dt['soup'] = cb_trained_dt.apply(create_soup, axis=1)

    #Measure the RMSE and accuracy using the top rated movies of each user
    accuracy_list = []
    RMSE_list = []
    for value in result_top5.values():
        input_imdb_id = value.iloc[0]['imdb_id']#imdb_id of first(most liked)
        searchTerms = cb_trained_dt[cb_trained_dt['imdb_id'] == input_imdb_id]['soup']
        real_result = list(value['imdb_id'][1:6])
        real_result_rating = list(value['rating'][1:6])
        temp_RMSE, temp_accuracy = make_recommendation_for_accuracy(cb_trained_dt, searchTerms, real_result, real_result_rating)
        accuracy_list.append(temp_accuracy)
        RMSE_list.append(temp_RMSE)

    #Nothing to average when no user qualified for the evaluation
    if not RMSE_list:
        return float('nan'), float('nan')
    return sum(RMSE_list) / len(RMSE_list), sum(accuracy_list) / len(accuracy_list)
#Function that measures the RMSE and accuracy of the content-based
#recommendation for one test case
#Input : cb_trained_dt (DataFrame with 'soup' and 'imdb_id' columns; not modified)
#        searchTerms (iterable of soup strings describing the query movie)
#        real_result (imdb ids the user actually rated best)
#        rating (ratings the user gave to the movies of real_result)
#Output : RMSE, accuracy
def make_recommendation_for_accuracy(cb_trained_dt, searchTerms, real_result, rating):
    query_parts = list(searchTerms)
    query = " ".join(query_parts)

    #The query is vectorized together with the catalogue (so its words are part
    #of the vocabulary) but it is never added to the caller's DataFrame
    soups = cb_trained_dt['soup']
    count = CountVectorizer(stop_words='english')
    count_matrix = count.fit_transform(list(soups) + [query])

    #Similarity of the query (last row) to every catalogue movie; only this
    #one row is needed, not the full pairwise matrix
    similarity = cosine_similarity(count_matrix[-1], count_matrix[:-1]).ravel()

    #The movie(s) the query text was copied from are not valid recommendations
    is_source = soups.isin(query_parts).to_numpy()
    candidates = np.flatnonzero(~is_source)
    #TOP 5 candidates, highest similarity first (stable: ties keep row order)
    top5 = candidates[np.argsort(-similarity[candidates], kind='stable')][:5]

    predict_val = cb_trained_dt['imdb_id'].iloc[top5].tolist()
    predict_sim = similarity[top5].tolist()

    #Rescale the ratings from [1, 10] to [-1, 1]:
    #new_value = ( (old_value - old_min) / (old_max - old_min) ) * (new_max - new_min) + new_min
    #NOTE(review): the 1..10 rating scale is assumed by this formula -- confirm
    #it matches the test data
    new_rating = [( (val - 1) / (10 - 1) ) * (1 - (-1)) - 1 for val in rating]

    #Compare position by position; guard against a catalogue that yields
    #fewer than five candidates
    paired = min(len(new_rating), len(predict_sim))
    if paired:
        RMSE = mean_squared_error(new_rating[:paired], predict_sim[:paired])**0.5
    else:
        RMSE = float('nan')

    #Share of the five expected movies that were actually recommended
    accuracy = len(set(predict_val) & set(real_result)) / 5
    return RMSE, accuracy

In [None]:
#Function that receives information about genres, actors, directors, and keywords from users and returns 5 recommended movies
#Input : keywords, credits, movies_metadata (DataFrames)
#Output : DataFrame with the columns 'movie_name' and 'link' (imdb url of the movie)
def content_based_recommendation_user(keywords, credits, movies_metadata):
    from ast import literal_eval

    #content based dataset: metadata + credits + keywords joined on 'id'
    cb_trained_dt = movies_metadata.merge(credits, on='id').merge(keywords, on='id')

    #Parse the stringified features into their corresponding python objects
    for feature in ['cast', 'crew', 'keywords', 'genres']:
        cb_trained_dt[feature] = cb_trained_dt[feature].apply(literal_eval)

    #apply func : applies the function in parentheses to every value of the column
    #related link : https://m.blog.naver.com/PostView.naver?isHttpsRedirect=true&blogId=wideeyed&logNo=221559041280
    cb_trained_dt['director'] = cb_trained_dt['crew'].apply(get_director)

    #Reduce the lists of dicts to lists of names
    for feature in ['cast', 'keywords', 'genres']:
        cb_trained_dt[feature] = cb_trained_dt[feature].apply(get_list)

    #Lower-case the names and remove the blanks inside them
    for feature in ['cast', 'keywords', 'director', 'genres']:
        cb_trained_dt[feature] = cb_trained_dt[feature].apply(clean_data)

    #The 'soup' column ties keywords, cast, director and genres together into
    #one text per movie that can be used to measure cosine similarity
    cb_trained_dt['soup'] = cb_trained_dt.apply(create_soup, axis=1)

    recommend_movie_list = make_recommendation_user_input(cb_trained_dt)

    #The links are returned as plain urls. (The former loop that wrapped each
    #link in an <a> tag only filled a temporary variable and never changed the
    #result, so it was removed.)
    return pd.DataFrame(recommend_movie_list, columns = ['movie_name', 'link'])
#function : get_director
#Looks for the first crew member whose job is 'Director'
#input : list of crew dicts (each with a 'job' and a 'name' key)
#output : the director's name, or np.nan when the crew has no director
def get_director(x):
    return next((member['name'] for member in x if member['job'] == 'Director'), np.nan)
#function : get_list
#Extracts the 'name' of every dict in a parsed list (the ids etc. are dropped)
#input : list of dicts with a 'name' key
#output : list of names; an empty list for missing/malformed (non-list) data
def get_list(x):
    return [entry['name'] for entry in x] if isinstance(x, list) else []
#function : clean_data
#Removes the blanks and lower-cases the text so that e.g. 'Tom Hanks' becomes 'tomhanks'
#input : list of strings [cast, keywords, genres] or a single string [director]
#output : cleaned list / cleaned string; '' for any other kind of value
def clean_data(x):
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''
#function : create_soup
#Creates the text ("soup") of one movie that is vectorized for the cosine similarity
#input : row with 'keywords', 'cast', 'genres' (lists of strings) and
#        'director', 'overview', 'original_title' (strings)
#output : one space-separated string
def create_soup(x):
    #overview and original_title are plain strings: they are appended as they
    #are. Joining them with ' ' would put a blank between every character and
    #turn the text into single-letter tokens.
    overview = x['overview'] if isinstance(x['overview'], str) else ''
    original_title = x['original_title'] if isinstance(x['original_title'], str) else ''
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres']) + ' ' + overview + ' ' + original_title
#function : get_searchTerms
#Asks the user for search terms on the console
#output : list holding one normalized search string, or an empty list when the user typed 'null'
def get_searchTerms():
    print('Write down about movie(Genre, actors, directors, keywords)')
    raw_text = input("(if multiple, please separate them with a comma)[Type 'null' if you don't want to type]")
    #lower-case the input, remove every whitespace inside each comma-separated
    #term, then join the terms with single blanks
    terms = ["".join(term.split()) for term in raw_text.lower().split(',')]
    normalized = " ".join(terms)
    return [] if normalized == 'null' else [normalized]
#function : make_recommendation_user_input
#receives information about genres, actors, directors, and keywords from users and returns 5 recommended movies
#input : cb_trained_dt (DataFrame with 'soup', 'title' and 'imdb_id' columns; not modified)
#output : list of [title, imdb url] pairs, most similar movie first (at most 5)
def make_recommendation_user_input(cb_trained_dt):
    #grabbing the new wordsoup from the user
    query = " ".join(get_searchTerms())

    #The query is vectorized together with the catalogue (so its words are part
    #of the vocabulary) but it is never added to the caller's DataFrame
    count = CountVectorizer(stop_words='english')
    count_matrix = count.fit_transform(list(cb_trained_dt['soup']) + [query])

    #Similarity of the query (last row) to every catalogue movie; only this
    #one row is needed, not the full pairwise matrix. The query itself is not
    #part of the compared rows, so it can never be returned as a movie.
    similarity = cosine_similarity(count_matrix[-1], count_matrix[:-1]).ravel()

    #TOP 5, highest similarity first (stable: ties keep row order)
    top5 = np.argsort(-similarity, kind='stable')[:5]

    #matching the similarities to the movie titles and imdb urls
    ranked_titles = []
    for indx in top5:
        ranked_titles.append([cb_trained_dt['title'].iloc[indx], 'https://imdb.com/title/' + cb_trained_dt['imdb_id'].iloc[indx]])

    return ranked_titles

Main

In [None]:
#Run both recommenders on the preprocessed frames; the function reads a user id
#from the console and prints its results
recommendataion_system(ratings, movies_metadata, credits, keywords, test)