In [1]:
import pandas as pd
import numpy as np
import re



In [2]:
movies = pd.read_csv("movies.csv")

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title.lower())


In [5]:
movies["clean_title"] = movies["title"].apply(clean_title)



In [6]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,grumpier old men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,waiting to exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,father of the bride part ii 1995
...,...,...,...,...
62418,209157,We (2018),Drama,we 2018
62419,209159,Window of the Soul (2001),Documentary,window of the soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,bad poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),a girl thing 2001


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [8]:
tfidf.shape


(62423, 170075)

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

#def search(title):
    #title = clean_title(title)
    #query_vec = vectorizer.transform([title])
    #similarity = cosine_similarity(query_vec, tfidf).flatten()
    #indices = np.argpartition(similarity, -5)[-5:]
    #results = movies.iloc[indices][::-1]
    #return results
def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices]
    results = results.copy()  # 경고 방지용
    results["similarity"] = similarity[indices]
    return results.sort_values(by="similarity", ascending=False)




In [10]:
import ipywidgets as widgets
from IPython.display import display
movie_input = widgets.Text(
    description = "movie title",
    disabled = False

)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value') 
display(movie_input, movie_list)


Text(value='', description='movie title')

Output()

In [11]:
ratings = pd.read_csv("ratings.csv")


In [12]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [13]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [14]:
movie_id = 1

In [15]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()

In [16]:
similar_users

array([     3,      5,      8, ..., 162530, 162533, 162534],
      shape=(37709,))

In [17]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] >=4)]["movieId"]

In [18]:
similar_user_recs

254              1
255             29
256             32
257             50
258            111
             ...  
24999332    166643
24999342    171763
24999348    177593
24999351    177765
24999378    198609
Name: movieId, Length: 5101989, dtype: int64

In [19]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs > 0.1]

In [20]:
similar_user_recs

movieId
1       1.000000
318     0.549604
260     0.531518
356     0.517224
296     0.495744
          ...   
235     0.101249
1242    0.100931
1907    0.100772
3527    0.100613
2761    0.100135
Name: count, Length: 273, dtype: float64

In [21]:
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index))& (ratings["rating"] > 4)]

In [22]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [23]:
all_user_recs

movieId
318     0.331577
296     0.275820
2571    0.236444
356     0.227949
593     0.218883
          ...   
1907    0.019409
3175    0.019037
474     0.018338
2       0.017112
440     0.016509
Name: count, Length: 273, dtype: float64

In [24]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis = 1, join='outer')

In [25]:
rec_percentages.columns = ["similar", "all"]

In [26]:
rec_percentages


Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.120850
318,0.549604,0.331577
260,0.531518,0.215296
356,0.517224,0.227949
296,0.495744,0.275820
...,...,...
235,0.101249,0.022579
1242,0.100931,0.023740
1907,0.100772,0.019409
3527,0.100613,0.023984


In [27]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [28]:
rec_percentages = rec_percentages.sort_values("score", ascending = False)

In [29]:
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.120850,8.274754
2355,0.191095,0.024311,7.860413
648,0.187382,0.028527,6.568707
440,0.104537,0.016509,6.332170
3114,0.328914,0.052036,6.320939
...,...,...,...
858,0.355883,0.203523,1.748618
2959,0.351826,0.209977,1.675543
318,0.549604,0.331577,1.657542
79132,0.209870,0.127298,1.648656


In [30]:
rec_percentages.head(10).merge(movies, left_index = True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.12085,8.274754,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story 1995
2264,0.191095,0.024311,7.860413,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,bugs life a 1998
637,0.187382,0.028527,6.568707,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,mission impossible 1996
435,0.104537,0.016509,6.33217,440,Dave (1993),Comedy|Romance,dave 1993
3021,0.328914,0.052036,6.320939,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,toy story 2 1999
3650,0.128378,0.020756,6.184955,3751,Chicken Run (2000),Animation|Children|Comedy,chicken run 2000
584,0.200642,0.03244,6.184933,592,Batman (1989),Action|Crime|Thriller,batman 1989
1,0.105598,0.017112,6.170978,2,Jumanji (1995),Adventure|Children|Fantasy,jumanji 1995
2705,0.152139,0.024863,6.119119,2797,Big (1988),Comedy|Drama|Fantasy|Romance,big 1988
2895,0.15129,0.024882,6.08028,2987,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...,who framed roger rabbit 1988


In [31]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > 0.10]

    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index))& (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages  = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages = rec_percentages.fillna(0)


    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    rec_percentages = rec_percentages.sort_values("score", ascending = False)
    return rec_percentages.head(10).merge(movies, left_index =  True, right_on = "movieId")[["score", "title", "genres"]]
    

In [None]:
movie_name_input = widgets.Text(
    value = 'Toy Story', 
    description = "Movie Title:", 
    disabled = False
)
recommendation_list = widgets.Output()
def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))
movie_name_input.observe(on_type, names = "value")
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()