In [52]:
import warnings
import pandas as pd
import numpy as np
import math
import ast as ast
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

warnings.filterwarnings(action='ignore')

In [45]:
# Read two datasets
movies_ori = pd.read_csv("movies_metadata.csv", encoding='UTF-8')
ratings_ori = pd.read_csv("ratings_small.csv")

## Preprocessing (Clean the datasets)
# Missing value
# remove records of movies without title.
title_mask = movies_ori['title'].isna()
df_movies = movies_ori.loc[title_mask == False]
# remove records of movies with wrong 'id'
movieId_mask = df_movies.index[[df_movies['id'].str.contains('-')]].tolist()
df_movies.drop(movieId_mask, inplace=True)
# fill 'NaN' value of all records of columns 'overview' and 'tagline' with ''
df_movies['overview'] = df_movies['overview'].fillna('')
df_movies['tagline'] = df_movies['tagline'].fillna('')
# convert values of 'id' datatype str to int
df_movies['id'] = pd.to_numeric(df_movies['id'])
# convert values of 'genres' datatype dict to str, and abstract values
df_movies['genres'] = df_movies['genres'].apply(ast.literal_eval)
df_movies['genres'] = df_movies['genres'].apply(lambda x:[d['name'] for d in x]).apply(lambda x:' '.join(x))
# change column name 'id' to 'movieId' & 'tagline' to 'keywords'
df_movies.rename(columns={'id':'movieId'}, inplace=True) 
df_movies.rename(columns={'tagline':'keywords'}, inplace=True)
# Features selection
df_movies = df_movies[['movieId', 'title', 'overview', 'genres', 'keywords', 'vote_average', 'vote_count']]
df_ratings = ratings_ori.iloc[:, :-1]
# Merge two datasets 'df_movies' & 'df_ratings'
df = pd.merge(df_ratings, df_movies[['movieId', 'title']], on='movieId')


## Datasets for each model and preprocessing
# dataset for euclidean distance
df_euclidean = df.pivot_table(index='movieId', columns='userId', values='rating').fillna(-1)
# dataset for cosine similarity (CBF)
df_cos = df_movies[['movieId', 'overview', 'title']].head(30000)
df_cos = df_cos.reset_index(drop=True)
# dataset for cosine similarity (CF item-based)
df_cos_item = df.pivot_table(index='title', columns='userId', values='rating').fillna(0)
# dataset for matrix factorization
df_mf = df.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)
# dataset for apriori
def encode_ratings(x):  
    if x <= 0: 
        return 0
    return 1
df_apriori = df.drop_duplicates(['userId', 'title']) # drop duplicated values in 'userId' & 'title'
df_apriori = df_apriori.pivot_table(index='userId', columns='title', values='rating').fillna(0).astype('int64')
df_apriori = df_apriori.applymap(encode_ratings)  # encoding values to 0 or 1
# dataset for content based filtering
df_cbf = df_movies


df_cos


Unnamed: 0,movieId,overview,title
0,862,"Led by Woody, Andy's toys live happily in his ...",Toy Story
1,8844,When siblings Judy and Peter discover an encha...,Jumanji
2,15602,A family wedding reignites the ancient feud be...,Grumpier Old Men
3,31357,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale
4,11862,Just when George Banks has recovered from his ...,Father of the Bride Part II
...,...,...,...
29995,170277,Bumbling navy officer Lieutenant Humphrey Fair...,Up the Creek
29996,25973,It's Valentine's Day and Tomhas big plans. He'...,What Love Is
29997,132641,"Ten years into a marriage, the wife is disappo...",Wife
29998,59142,A reporter stumbles on a runaway heiress whose...,You Can't Run Away from It


In [75]:
### Collaborative Filtering
## Euclidean distance (item-based)
def euclidean(data, title):
    # movie title로 movieId 가져오기
    index = df_cos[df_cos['title'] == title].index
    movie_id= list(df_cos.loc[index, 'movieId'])[0]
    # simiality function
    def sim_distance(data, n1, n2):  
        sum = 0
        # i 값은 df_euclidean 데이터셋의 선택한 row(movieId)중 평점값이 >=0 값들의 index(userId)
        for i in data.loc[n1, data.loc[n1, :]>=0].index:
            # n2 = input movieId와 값이 다른 movieId
            if data.loc[n2, i]>=0:
                sum += math.pow(data.loc[n1, i] - data.loc[n2, i], 2)
        return math.sqrt(1/(sum+1)) # return similarity value
    
    def top_match(data, movieId, rank):
        simList = []
        # i값은 df_euclidean 데이터셋 절반 중에서의 index(movieId)
        # data값이 많아 시간을 줄이기 위해 전체 중 절반만 사용
        for i in data.index[-len(data):]:
            # input movieId 와 값이 다른 movieId의 similarity값을 simList에 append
            if movieId != i:
                simList.append((sim_distance(data, movieId, i), i))
        simList.sort(reverse=True)
        return simList[:rank] # (similarity, movieId) 리스트를 return
    
    def recommendation(data, movie_id):
        res = top_match(data, movie_id, len(data))
        score_dic = {}
        sim_dic = {}
        myList = []
        for sim, mv in res:
            # similarity >= 0을 때만 실행
            if sim < 0:
                continue
            for movie in data.loc[movie_id, data.loc[movie_id, :] < 0].index:
                simSum = 0
                if data.loc[mv, movie] >= 0:
                    simSum += sim * data.loc[mv, movie]
                    score_dic.setdefault(movie, 0)
                    score_dic[movie] += simSum
                    sim_dic.setdefault(movie, 0)
                    sim_dic[movie] += sim
        for key in score_dic:
            myList.append((score_dic[key] / sim_dic[key], key))
        myList.sort(reverse=True)
        return myList
    # 추천 점수가 가장 높은 순으로 예상평점과 영화제목을 추천 (10개까지)
    movieList = []
    for rate, m_id in recommendation(data, movie_id):
        if list(df_movies.loc[df_movies['movieId']==m_id, 'title']) == []:
            continue
        movieList.append((rate, df_movies.loc[df_movies['movieId']==m_id, 'title'].values[0]))
    print("- Collaborative Filtering")
    print("- Euclidean distance similarity\n")
    print(pd.DataFrame(movieList[:10], columns=['Rating', 'Title']))


## Cosine similarity (CF item-based)
def cos_item(data, title):
    # cosine similarity
    sim_rate = cosine_similarity(data, data)
    sim_rate_df = pd.DataFrame(data=sim_rate, index=data.index, columns=data.index)
    sim_rate_df = pd.DataFrame(sim_rate_df[title].sort_values(ascending=False)[1:11]).reset_index()
    sim_rate_df.columns = ['Title', 'Cosine Similarity']
    sim_rate_df = sim_rate_df[['Cosine Similarity', 'Title']]
    print("- Collaborative Filtering")
    print("- Cosine simiarity (item-based)\n")
    print(sim_rate_df)
    

## Matrix Factorization
# *parameters mf(data, userId)
def mf(df_mat, user_id):
    # convert pivot_table dataset to numpy matrix
    matrix = df_mat.to_numpy()
    rating_mean = np.mean(matrix, axis=1)  # user's mean rating
    matrix_mean = matrix - rating_mean.reshape(-1, 1)  # 사용자-영화에 대해 사용자평균 뺀 값
    # get U matrix, sigma matrix, Vt transposed matrix from 'svds' meaning 'Truncated SVD'
    U, sigma, Vt = svds(matrix_mean, k = 12)
    sigma = np.diag(sigma)
    # recover original matrix
    # dot(U, sigma, Vt) + user's mean rating
    svd_ratings = np.dot(np.dot(U, sigma), Vt) + rating_mean.reshape(-1, 1)
    df_svd = pd.DataFrame(svd_ratings, columns = df_mat.columns)

    def recommendation(data, userId, ori_movie, ori_rating):
        # 현재는 index로 적용되어 있어 userId-1
        user_row_num = userId - 1
        sorted_pre = data.iloc[user_row_num].sort_values(ascending=False)
        # abstract datas with same 'userId's from original ratings dataset
        user_data = ori_rating[ori_rating.userId == userId]
        user_history = user_data.merge(ori_movie, on='movieId').sort_values(['rating'], ascending=False)
        user_history = user_history[['userId', 'movieId', 'rating']]
        # abstract datas without movie datas users have seen already from original movies dataset
        recommendations = ori_movie[~ori_movie['movieId'].isin(user_history['movieId'])]
        recommendations = recommendations.merge(pd.DataFrame(sorted_pre).reset_index(), on='movieId')
        recommendations = recommendations.rename(columns = {user_row_num: 'Predictions'}).sort_values('Predictions', ascending=False)
        recommendations = recommendations[['movieId', 'title', 'Predictions']]

        return user_history, recommendations

    already_rated, predictions = recommendation(df_svd, user_id, df_movies, df_ratings)
#     print("User's history")
#     print(already_rated.head(10))
    print("- Collaborative Filtering")
    print("- Matrix Factorization (SVD)\n")
    predictions = predictions[['Predictions', 'title']]
    predictions.columns = ['Predictions rate', 'Title']
    print(predictions[:10].reset_index(drop=True))
    

In [65]:
### Contant Based Filtering
## Cosine similarity (CBF)
def cos(data, title):
    # movie title로 movieId 가져오기
    index = df_cos[df_cos['title'] == title].index
    movie = list(df_cos.loc[index, 'movieId'])[0]
    # tf-idf matrix
    tfidf = TfidfVectorizer(stop_words='english')  # 불용어 제거
    tfidf_mat = tfidf.fit_transform(data['overview']).toarray()
    # similarity function
    def cos_sim(X, Y):
        return np.dot(X,Y)/((norm(X)*norm(Y))+1e-7)  # 분모 0이 안되게 '1e-7'추가

    def top_match(data, mv_id, rank=1):
        sim = []
        for i in range(len(data)):
            # input movieId와 다른 movieId의 cosine similarity 구하기
            if mv_id != df_cos.loc[i, 'movieId']:
                tfidf_idx1 = df_cos[df_cos['movieId'] == mv_id].index
                tfidf_idx2 = df_cos[df_cos['movieId'] == df_cos.loc[i, 'movieId']].index
                sim.append((cos_sim(data[tfidf_idx2[0]], data[tfidf_idx1[0]]), df_cos.loc[i, 'movieId']))
        sim.sort(reverse=True)
        return sim[:rank]
    
    movieList = []
    for sim, movie_id in top_match(tfidf_mat, movie, 10):
        # cosine similarity가 높은 순서대로 영화 추천
        movieList.append((sim, list(data.loc[data['movieId']==movie_id, 'title'])[0]))
    print("- Contant Based Filtering")
    print("- Cosine simiarity\n")
    print(pd.DataFrame(movieList, columns=['Cosine Similarity', 'Title']).drop_duplicates(['Title'])[:10])

In [66]:
### Association Rule Mining
## Apriori
def apriori(data, name, min_sup=0.08):    
    from mlxtend.frequent_patterns import association_rules
    from mlxtend.frequent_patterns import apriori
    frequent_itemset = apriori(data, min_support=min_sup, use_colnames=True)

    rules = association_rules(frequent_itemset, metric='lift', min_threshold=1)
    rules.sort_values(by=['lift'], ascending=False, inplace=True)
    
    df_res = rules[rules['antecedents'].apply(lambda x: len(x) == 1 and next(iter(x)) == name)]
    df_res = df_res[df_res['lift'] > 2]
    movies = df_res['consequents'].values

    movieList = []
    for movie in movies:
        lift = df_res.loc[df_res['consequents']==movie, ['lift']].values
        for title in movie:
            if title not in movieList:
                movieList.append((round(lift[0][0], 4), title))
    print("- Association Rule Mining")
    print("- Apriori\n")
    print(pd.DataFrame(movieList, columns=['Lift rate', 'Title']).drop_duplicates(['Title']).reset_index(drop=True)[:10])


In [59]:
apriori(df_apriori, 'The Hours')

- Association Rule Mining
- Apriori

   Lift rate                               Title
0     3.2129                             48 Hrs.
1     3.2129                      Romeo + Juliet
2     3.2129                   Three Colors: Red
3     3.2092            The Million Dollar Hotel
4     3.2092        Dave Chappelle's Block Party
5     3.2052                     Monsoon Wedding
6     3.2052                               Sissi
7     3.2031                 Cockles and Muscles
8     3.2031                            Rain Man
9     3.2031  Terminator 3: Rise of the Machines


In [61]:
cos_item(df_cos_item, 'The Hours')

- Collaborative Filtering
- Cosine simiarity (item-based)

   Cosine Similarity                Title
0           0.664757             Rain Man
1           0.656278              48 Hrs.
2           0.655502                Sissi
3           0.626577     The Conversation
4           0.617037    Three Colors: Red
5           0.613450      Monsoon Wedding
6           0.601700  Cockles and Muscles
7           0.599899       Romeo + Juliet
8           0.587942          Silent Hill
9           0.574302       Batman Returns


In [64]:
cos(df_cos, 'The Hours')

590
- Contant Based Filtering
- Cosine simiarity

   Cosine Similarity                      Title
0           0.171953            Possible Worlds
1           0.156752            The Terrorizers
2           0.154630                     Broken
3           0.145828  If These Walls Could Talk
4           0.139007       A Girl in Every Port
5           0.138323                    Macabre
6           0.134468             New Year's Eve
7           0.133576    Invitation to the Dance
8           0.129369          The Lawless Heart
9           0.129123               Grand Canyon


In [74]:
euclidean(df_euclidean, 'The Hours')

590
- Collaborative Filtering
- Euclidean distance similarity

     Rating                                       Title
0  4.942989                              The Ninth Gate
1  4.912223                            Ocean's Thirteen
2  4.832931  Spring, Summer, Fall, Winter... and Spring
3  4.690985                                 Rear Window
4  4.625305                                Transamerica
5  4.601217                                       Ghost
6  4.579262                         Lost in Translation
7  4.571828                There's Something About Mary
8  4.563083                                  Underworld
9  4.558761                                 Bull Durham


In [26]:
mf(df_mf, 86)

- Collaborative Filtering
- Matrix Factorization (SVD)

   movieId                 title  Predictions
0      457                 Sissi     3.000377
1      745       The Sixth Sense     2.882037
2      380              Rain Man     2.460014
3      648  Beauty and the Beast     2.040655
4      562              Die Hard     1.966591
5      339        Night on Earth     1.857408
6     1249         Hollywoodland     1.833415
7      802                Lolita     1.828231
8        6        Judgment Night     1.790865
9       95            Armageddon     1.783874
