In [1]:
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

In [2]:
#get movies list by given similarity matrix, movie title and number of recommendation movie
#cos_sime(np.array): similarity matirx
#title(str): name of movie
#num(int): number of similar movie
#return(dataframe): top num of similar movie 
def recommendation(cos_sim, title, num):
    titles = df['title']
    indices = pd.Series(df.index, index = df['title'])
    idx = indices[title]
    sim_scores = list(enumerate(cos_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:1+num]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [3]:
#put result of tfidf, genres, and all models result into dataframe
#title(str): name of movie
#num(int): number of same movies
#return(dataframe): result of three models
def get_result(title,num):
    df_result = pd.DataFrame(columns=['tfidf','genres','all'])
    df_result['tfidf'] = list(recommendation(cos_sim_tfidf,title,num))
    #df_result['word_count'] = list(recommendation(cos_sim_cv,title,num))
    #df_result['rgb'] = list(recommendation(cos_sim_rgb,title,num))
    df_result['genres'] = list(recommendation(cos_sim_genres,title,num))
    df_result['all'] = list(recommendation_all(cos_sim_tfidf,cos_sim_cv,cos_sim_rgb,cos_sim_genres,title,num))
    return df_result

In [4]:
df = pd.read_csv('final.csv')
df.drop(columns = 'Unnamed: 0',inplace = True)
print(len(df))
df.dropna(inplace = True)
df = df.reset_index(drop = True)
print(len(df))
df.head(10)

5923
5923


Unnamed: 0,name,year,r_mm,g_mm,b_mm,r_mme,g_mme,b_mme,r_mv,g_mv,...,Thriller,War,Western,director,runtime,production_country,actor_1,actor_2,actor_3,plot
0,Love Is All There Is,1996,82.348043,71.744324,78.750917,79.487959,68.9822,82.880811,856.86837,593.221427,...,0.0,0.0,0.0,Joseph Bologna,120.0,United States,Lainie Kazan,Joseph Bologna,Barbara Carrera,Romeo & Juliet gets transplanted to the Bronx....
1,Beautiful Thing,1996,74.98215,74.480009,82.240782,80.094028,81.324324,92.77723,512.608277,683.348508,...,0.0,0.0,0.0,Hettie Macdonald,90.0,United Kingdom,Linda Henry,Glen Berry,Scott Neal,"In a suburb of London, young Jamie is escaping..."
2,Long Kiss Goodnight The,1996,49.474618,49.722835,55.817021,39.985541,44.709762,54.383961,1030.126492,530.759442,...,1.0,0.0,0.0,Renny Harlin,121.0,United States,Geena Davis,Samuel L. Jackson,Yvonne Zima,A woman suffering from amnesia begins to recov...
3,Looking for Richard,1996,62.453501,75.513627,86.450221,55.287343,67.21792,78.712305,594.451881,703.602668,...,0.0,0.0,0.0,Al Pacino,112.0,United States,Penelope Allen,Gordon MacDonald,Madison Arnold,Al Pacino's deeply-felt rumination on Shakespe...
4,Trees Lounge,1996,35.524115,51.231585,59.102579,30.204564,47.724824,60.109961,326.071906,384.166999,...,0.0,0.0,0.0,Steve Buscemi,95.0,United States,Carol Kane,Mark Boone Junior,Steve Buscemi,Tommy is an unemployed mechanic who spends mos...
5,Proprietor The,1996,78.430029,86.697745,94.749311,76.572486,85.461808,94.89614,487.622368,424.615741,...,0.0,0.0,0.0,Ismail Merchant,113.0,France,Jeanne Moreau,Sean Young,Sam Waterston,An expatriated French novelist (Jeanne Moreau)...
6,Normal Life,1996,60.637839,64.231354,76.935139,60.807979,63.814996,74.687979,749.574004,698.375673,...,0.0,0.0,0.0,John McNaughton,101.0,United States,Ashley Judd,Luke Perry,Bruce A. Young,Chris is young idealistic cop who falls in lov...
7,Get on the Bus,1996,77.673707,89.733661,95.051895,73.826202,82.128797,85.755462,667.844879,983.110712,...,0.0,0.0,0.0,Spike Lee,120.0,United States,Richard Belzer,De'aundre Bonds,Andre Braugher,A disparate group of African-American men trav...
8,Jude,1996,47.297679,54.241306,39.965586,46.786094,51.817176,39.135897,157.899063,264.497794,...,0.0,0.0,0.0,Michael Winterbottom,123.0,United Kingdom,Christopher Eccleston,Kate Winslet,Liam Cunningham,A stonemason steadfastly pursues a cousin he l...
9,Everyone Says I Love You,1996,77.311062,76.488409,92.485118,77.942155,76.914121,92.710909,97.50726,70.566051,...,0.0,0.0,0.0,Woody Allen,101.0,United States,Edward Norton,Drew Barrymore,Diva Gray,A New York girl sets her father up with a beau...


In [5]:
#word counter with movie synopsis
count = CountVectorizer()
count_matrix = count.fit_transform(df['plot'])

In [6]:
#tfidf with movie synopsis
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['plot'])
df_tfidf = pd.DataFrame(tfidf_matrix.toarray())

In [7]:
#get rgb dataframe
scaler = MinMaxScaler()
df_rgb = df[['r_mm','g_mm','b_mm','r_mme','g_mme','b_mme','r_mv','g_mv','b_mv',\
            'r_mem','g_mem','b_mem','r_meme','g_meme','b_meme','r_mev','g_mev','b_mev',\
            'r_vm','g_vm','b_vm','r_vme','g_vme','b_vme','r_vv','g_vv','b_vv']]
df_rgb = scaler.fit_transform(df_rgb)

In [8]:
#get genres dataframe
df_genres = df[['Action','Adventure','Animation','Children','Comedy','Crime','Documentary',\
                'Drama','Fantasy','Film_Noir','Horror','Musical','Mystery','Romance','Sci_Fi',\
                'Thriller','War','Western','director','runtime','production_country','actor_1',\
                'actor_2','actor_3']]
df_genres = pd.get_dummies(df_genres)

In [9]:
df_rgb = pd.DataFrame(df_rgb)

In [10]:
#compute cosine similarity for tfidf
cos_sim_tfidf = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [11]:
#compute cosine similarity for word count
cos_sim_cv = cosine_similarity(count_matrix, count_matrix)

In [12]:
#compute cosine similarity for rgb
cos_sim_rgb = cosine_similarity(df_rgb, df_rgb)

In [13]:
#compute cosine similarity for genres
cos_sim_genres = cosine_similarity(df_genres, df_genres)

In [14]:
#Hybrid model with multi similarity matrix
#cos_sim1(np.array): similarity matrix 1
#cos_sim2(np.array): similarity matrix 2
#cos_sim3(np.array): similarity matrix 3
#cos_sim4(np.array): similarity matrix 4
#title(str): name of movie
#num(int): number of similar movies
#return(dataframe): top num of similar movies
def recommendation_all(cos_sim1,cos_sim2,cos_sim3,cos_sim4,title, num):
    titles = df['title']
    indices = pd.Series(df.index, index = df['title'])
    idx = indices[title]
    
    #cos_sim1[idx] = (cos_sim1[idx]-min(cos_sim1[idx]))/(max(cos_sim1[idx]) - min(cos_sim1[idx]))
    #cos_sim2[idx] = (cos_sim2[idx]-min(cos_sim2[idx]))/(max(cos_sim2[idx]) - min(cos_sim2[idx]))
    #cos_sim3[idx] = (cos_sim3[idx]-min(cos_sim3[idx]))/(max(cos_sim3[idx]) - min(cos_sim3[idx]))
    #cos_sim4[idx] = (cos_sim4[idx]-min(cos_sim4[idx]))/(max(cos_sim4[idx]) - min(cos_sim4[idx]))
    #print(max(cos_sim4[idx]))
    cos_total = cos_sim1[idx]*0.001 + cos_sim2[idx]*0 + cos_sim3[idx]*0.0018 + cos_sim4[idx]*1
    
    sim_total = list(enumerate(cos_total))
    sim_total = sorted(sim_total, key = lambda x: x[1], reverse = True)
    sim_total = sim_total[1:1+num]
    print(sim_total)
    movie_indices = [i[0] for i in sim_total]
    return titles.iloc[movie_indices]

In [15]:
df_result = get_result('Toy Story (1995)', 10)
df_result

[(3999, 1.0020406679064828), (4000, 1.001829277812614), (4001, 1.0016836955508153), (4425, 1.0012898601192068), (4426, 1.0012380542076686), (3175, 1.0012292913690264), (3300, 1.0012265794644957), (1307, 1.0012193120162138), (4250, 1.0012154695578679), (1020, 1.0011918445400863)]


Unnamed: 0,tfidf,genres,all
0,Toy Story 2 (1999),Toy Story 2 (1999),Toy Story 2 (1999)
1,Toy Story 3 (2010),Toy Story 3 (2010),Toy Story 3 (2010)
2,Toy Story 4 (2019),Toy Story 4 (2019),Toy Story 4 (2019)
3,Malice (1993),Pete's Dragon (1977),Pete's Dragon (1977)
4,Bound for Glory (1976),Cars (2006),Bedknobs and Broomsticks (1971)
5,It's a Boy Girl Thing (2006),Bedknobs and Broomsticks (1971),Casper Meets Wendy (1998)
6,"Shawshank Redemption, The (1994)",Ratatouille (2007),Cars (2006)
7,Firestarter (1984),Kelly's Heroes (1970),Shrek (2001)
8,Finian's Rainbow (1968),Hook (1991),Space Jam (1996)
9,"First $20 Million Is Always the Hardest, The (...","Monsters, Inc. (2001)",Kelly's Heroes (1970)
