In [None]:
# 넷플릭스에 있는 영화와 TV 프로그램의 컨텐츠 기반 추천 시스템을 만드는 프로젝트
# 2가지 방법 사용
# 1) 캐스트/감독/국가/별점/장르 등의 특징 기반으로 사용한 경우
# 2) 해당 영화 혹은 티비 쇼를 묘사하는 단어를 특징 기반으로 사용한 경우

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
data = pd.read_csv('/content/drive/MyDrive/netflix_titles.csv')
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [None]:
print(len(data))
data.groupby('type').count()

7787


Unnamed: 0_level_0,show_id,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Movie,5377,5377,5214,4951,5147,5377,5377,5372,5377,5377,5377
TV Show,2410,2410,184,2118,2133,2400,2410,2408,2410,2410,2410


In [None]:
data = data.dropna(subset=['cast', 'country', 'rating'])
print(len(data)) # NaN 값이 포함된 행을 버림(default axis = 0)

6652


### #방법1) Movie :캐스트/감독/국가/별점/장르 등의 특징 기반 추천 엔진 개발
#### -> cast, director, country, genre, rating 을 binary vetor로 표현

In [None]:
movies = data[data['type'] == 'Movie'].reset_index()
movies = movies.drop(['index','show_id','type','date_added','release_year','duration','description'],
                     axis=1)
movies.head()

Unnamed: 0,title,director,cast,country,rating,listed_in
0,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,TV-MA,"Dramas, International Movies"
1,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,R,"Horror Movies, International Movies"
2,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,PG-13,"Action & Adventure, Independent Movies, Sci-Fi..."
3,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,PG-13,Dramas
4,122,Yasir Al Yasiri,"Amina Khalil, Ahmed Dawood, Tarek Lotfy, Ahmed...",Egypt,TV-MA,"Horror Movies, International Movies"


In [None]:
tv = data[data['type']=='TV Show'].reset_index()
tv = tv.drop(['index','show_id','type','date_added','release_year','duration','description'],
                     axis=1)
tv.head()

Unnamed: 0,title,director,cast,country,rating,listed_in
0,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,TV-MA,"International TV Shows, TV Dramas, TV Sci-Fi &..."
1,46,Serdar Akar,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,TV-MA,"International TV Shows, TV Dramas, TV Mysteries"
2,1983,,"Robert Więckiewicz, Maciej Musiał, Michalina O...","Poland, United States",TV-MA,"Crime TV Shows, International TV Shows, TV Dramas"
3,​SAINT SEIYA: Knights of the Zodiac,,"Bryson Baugus, Emily Neves, Blake Shepard, Pat...",Japan,TV-14,"Anime Series, International TV Shows"
4,#blackAF,,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,TV-MA,TV Comedies


In [None]:
# Feature 
# Cast

# actors 리스트에 각 영화에 출연진 목록을 리스트로 담음.
actors = []
for i in movies['cast']:
    actor = re.split(r', \s*', i)
    actors.append(actor)

# 모든 배우 출연진을 flat_list 리스트에 담음
flat_list = []
for sublist in actors:
    for item in sublist:
        flat_list.append(item)

# 중복 제거 및 정렬해서 actors_list 생성
actors_list = sorted(set(flat_list))
binary_actors = [[0]*0 for i in range(len(set(flat_list)))]

for i in movies['cast']:
    k = 0
    for j in actors_list:
        if j in i:
            binary_actors[k].append(1.0)
        else:
            binary_actors[k].append(0.0)
        k+=1

binary_actors = pd.DataFrame(binary_actors).transpose()      

In [None]:
print(len(movies))
print(len(actors_list))
print(len(binary_actors))
print(len(binary_actors[0]))
binary_actors.shape
# (row: 배우, col:배우가 출연한 영화)

4761
22622
4761
4761


(4761, 22622)

In [None]:
# Director
directors = []

for i in movies['director']:
    if pd.notna(i):
        director = re.split(r', \s*', i)
        directors.append(director)
        
flat_list2 = []
for sublist in directors:
    for item in sublist:
        flat_list2.append(item)
        
directors_list = sorted(set(flat_list2))
binary_directors = [[0]*0 for i in range(len(set(flat_list2)))]

for i in movies['director']:
    k=0
    for j in directors_list:
        if pd.isna(i):
            binary_directors[k].append(0.0)
        elif j in i:
            binary_directors[k].append(1.0)
        else:
            binary_directors[k].append(0.0)
        k+=1
        
binary_directors = pd.DataFrame(binary_directors).transpose()

In [None]:
        
print(len(movies))
print(len(flat_list2))
print(len(directors_list))
print(len(binary_directors))
binary_directors.shape


4761
5216
3833
4761


(4761, 3833)

In [None]:
# Country

countires = []

for i in movies['country']:
    country = re.split(r', \s*', i)
    countires.append(country)
    
flat_list3 = []
for sublist in countires:
    for item in sublist:
        flat_list3.append(item)

countires_list = sorted(set(flat_list3))

binary_countries = [[0]*0 for i in range(len(set(flat_list3)))]

for i in movies['country']:
    k = 0
    for j in countires_list:
        if j in i:
            binary_countries[k].append(1.0)
        else:
            binary_countries[k].append(0.0)
        k+=1
        
binary_countries = pd.DataFrame(binary_countries).transpose()

In [None]:
print(binary_countries.shape)

(4761, 105)


In [None]:
# Genres

genres = []

for i in movies['listed_in']:
    genre = re.split(r', \s*', i)
    genres.append(genre)
    
flat_list4 = []
for sublist in genres:
    for item in sublist:
        flat_list4.append(item)
        
genres_list = sorted(set(flat_list4))

binary_genres = [[0]*0 for i in range(len(set(flat_list4)))]

for i in movies['listed_in']:
    k=0
    for j in genres_list:
        if j in i:
            binary_genres[k].append(1.0)
        else:
            binary_genres[k].append(0.0)
        k+=1
        
binary_genres = pd.DataFrame(binary_genres).transpose()

In [None]:
print(binary_genres.shape)

(4761, 20)


In [None]:
# Rating

ratings = []

for i in movies['rating']:
    ratings.append(i)
    
ratings_list = sorted(set(ratings))

binary_ratings = [[0]*0 for i in range(len(set(ratings_list)))]

for i in movies['rating']:
    k=0
    for j in ratings_list:
        if j in i:
            binary_ratings[k].append(1.0)
        else:
            binary_ratings[k].append(0.0)
        k+=1
        
binary_ratings = pd.DataFrame(binary_ratings).transpose()

In [None]:
print(binary_ratings.shape)

(4761, 14)


In [None]:
# 모든 피처들 movies axis로 합치기
# (binary_actors, binary_directors, binary_countries, binary_genres, binary_ratings)

binary = pd.concat([binary_actors, binary_directors, binary_countries, binary_genres],
                   axis=1,
                   ignore_index=True) # ratings는 사용 안하네??

binary


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26570,26571,26572,26573,26574,26575,26576,26577,26578,26579
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### #방법1) TVshow: 캐스트/국가/별점/장르 등의 특징 기반 추천 엔진 개발
#### -> cast, country, genre, rating 을 binary vetor로 표현

In [None]:
# cast 벡터 만들기

actors2 = []

for i in tv['cast']:
    actor2 = re.split(r', \s*', i)
    actors2.append(actor2)
    
flat_list5 = []
for sublist in actors2:
    for item in sublist:
        flat_list5.append(item)
        
actors_list2 = sorted(set(flat_list5))

binary_actors2 = [[0] * 0 for i in range(len(set(flat_list5)))]

for i in tv['cast']:
    k = 0
    for j in actors_list2:
        if j in i:
            binary_actors2[k].append(1.0)
        else:
            binary_actors2[k].append(0.0)
        k+=1
        
binary_actors2 = pd.DataFrame(binary_actors2).transpose()
print(binary_actors2.shape)

(1891, 12665)


In [None]:
countries2 = []

for i in tv['country']:
    country2 = re.split(r', \s*', i)
    countries2.append(country2)
    
flat_list6 = []
for sublist in countries2:
    for item in sublist:
        flat_list6.append(item)
        
countries_list2 = sorted(set(flat_list6))

binary_countries2 = [[0] * 0 for i in range(len(set(flat_list6)))]

for i in tv['country']:
    k = 0
    for j in countries_list2:
        if j in i:
            binary_countries2[k].append(1.0)
        else:
            binary_countries2[k].append(0.0)
        k+=1
        
binary_countries2 = pd.DataFrame(binary_countries2).transpose()
print(binary_countries2.shape)

(1891, 64)


In [None]:
# genres

genres2 = []

for i in tv['listed_in']:
    genre2 = re.split(r', \s*', i)
    genres2.append(genre2)
    
flat_list7 = []
for sublist in genres2:
    for item in sublist:
        flat_list7.append(item)
        
genres_list2 = sorted(set(flat_list7))

binary_genres2 = [[0] * 0 for i in range(len(set(flat_list7)))]

for i in tv['listed_in']:
    k = 0
    for j in genres_list2:
        if j in i:
            binary_genres2[k].append(1.0)
        else:
            binary_genres2[k].append(0.0)
        k+=1
        
binary_genres2 = pd.DataFrame(binary_genres2).transpose()

print(binary_genres2.shape)

(1891, 22)


In [None]:
# ratings

ratings2 = []

for i in tv['rating']:
    ratings2.append(i)

ratings_list2 = sorted(set(ratings2))

binary_ratings2 = [[0] * 0 for i in range(len(set(ratings_list2)))]

for i in tv['rating']:
    k = 0
    for j in ratings_list2:
        if j in i:
            binary_ratings2[k].append(1.0)
        else:
            binary_ratings2[k].append(0.0)
        k+=1
        
binary_ratings2 = pd.DataFrame(binary_ratings2).transpose()
print(binary_ratings2.shape)

(1891, 9)


In [None]:
binary2 = pd.concat([binary_actors2, binary_countries2, binary_genres2], axis=1, ignore_index=True)
binary2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12741,12742,12743,12744,12745,12746,12747,12748,12749,12750
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1887,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Recommendation Engine

In [None]:
# CF(collaborative filtering)는 새로운 데이터가 들어왔을 때 추천을 하기 어렵다. (user based, item based)
'''
# cold start problem : 새로운 아이템에 대해서는 적용이 어럽다
# CB: 각각의 영화 별 특성을 피처로 만듦. -> 새로운 영화가 이전의 영화와 얼마나 비슷한지 유사도를 구해서 이를 기반으로 추천

'''

'\n# cold start problem : 새로운 아이템에 대해서는 적용이 어럽다\n# CB: 각각의 영화 별 특성을 피처로 만듦. -> 새로운 영화가 이전의 영화와 얼마나 비슷한지 유사도를 구해서 이를 기반으로 추천\n\n'

In [None]:
# search: 찾고자 하는 영화 인풋값. 

def recommender(search):        
    cs_list = []
    binary_list = []
    # movies 에서 찾는다
    if search in movies['title'].values:
        idx = movies[movies['title'] == search].index.item() # 인풋 search와 타이틀이 일치하는 데이터 인덱스 찾음
        for i in binary.iloc[idx]:
            binary_list.append(i) # search 영화 정보(행)를 binary_list 리스트에 추가한다.
        point1 = np.array(binary_list).reshape(1,-1)
        point1 = [val for sublist in point1 for val in sublist]
        '''
        for sublist in point1:
            for val in sublist:
                val
        '''
        for j in range(len(movies)): # movies의 모든 행
            binary_list2 = []
            for k in binary.iloc[j]: # binary(movie)벡터를 모두 펼쳐서 binary_list2에 담음
                binary_list2.append(k)
            point2 = np.array(binary_list2).reshape(1,-1)
            point2 = [val for sublist in point2 for val in sublist]
            dot_product = np.dot(point1, point2)
            norm_1 = np.linalg.norm(point1)
            norm_2 = np.linalg.norm(point2)
            cos_sim = dot_product / (norm_1 * norm_2)
            cs_list.append(cos_sim) # j행 영화(point2)와 search(point1)과의 코사인 유사도를 cs_list 리스트에 담는다.
            
        movies_copy = movies.copy()
        movies_copy['cos_sim'] = cs_list
        results = movies_copy.sort_values('cos_sim', ascending=False) # 카피한 movie 데이터를 cos_sim 기준 내림차순 정렬
        results = results[results['title'] != search]
        top_results = results.head(5)
        return(top_results)
    # 없으면, tv에서 찾는다. 위의 방식과 동일
    elif search in tv['title'].values:
        idx = tv[tv['title'] == search].index.item()
        for i in binary2.iloc[idx]:
            binary_list.append(i)
        point1 = np.array(binary_list).reshape(1, -1)
        point1 = [val for sublist in point1 for val in sublist]
        for j in range(len(tv)):
            binary_list2 = []
            for k in binary2.iloc[j]:
                binary_list2.append(k)
            point2 = np.array(binary_list2).reshape(1, -1)
            point2 = [val for sublist in point2 for val in sublist]
            dot_product = np.dot(point1, point2)
            norm_1 = np.linalg.norm(point1)
            norm_2 = np.linalg.norm(point2)
            cos_sim = dot_product / (norm_1 * norm_2)
            cs_list.append(cos_sim)
        tv_copy = tv.copy()
        tv_copy['cos_sim'] = cs_list
        results = tv_copy.sort_values('cos_sim', ascending=False)
        results = results[results['title'] != search]    
        top_results = results.head(5)
        return(top_results)
    else:
        return("Title not in dataset. Please check spelling.")
            

In [None]:
recommender('9')

Unnamed: 0,title,director,cast,country,rating,listed_in,cos_sim
520,Battle Drone,Mitch Gould,"Louis Mandylor, Daniel Southworth, Richard Ala...",United States,TV-MA,"Action & Adventure, Independent Movies, Sci-Fi...",0.333333
2456,Marvel's Iron Man & Hulk: Heroes United,"Leo Riley, Eric Radomski","Adrian Pasdar, Fred Tatasciore, Dee Bradley Ba...",United States,PG,"Action & Adventure, Sci-Fi & Fantasy",0.326599
4628,WHAT DID JACK DO?,David Lynch,David Lynch,United States,TV-14,"Dramas, Independent Movies",0.316228
4638,What's Eating Gilbert Grape,Lasse Hallström,"Johnny Depp, Leonardo DiCaprio, Juliette Lewis...",United States,PG-13,"Classic Movies, Dramas, Independent Movies",0.313112
60,2307: Winter's Dream,Joey Curtis,"Paul Sidhu, Arielle Holmes, Branden Coles, Kel...",United States,TV-MA,"Action & Adventure, Independent Movies, Sci-Fi...",0.313112


In [None]:
recommender('The Conjuring')

Unnamed: 0,title,director,cast,country,rating,listed_in,cos_sim
1868,Insidious,James Wan,"Patrick Wilson, Rose Byrne, Lin Shaye, Ty Simp...","United States, Canada, United Kingdom",PG-13,"Horror Movies, Thrillers",0.388922
968,Creep,Patrick Brice,"Mark Duplass, Patrick Brice",United States,R,"Horror Movies, Independent Movies, Thrillers",0.377964
1844,In the Tall Grass,Vincenzo Natali,"Patrick Wilson, Laysla De Oliveira, Avery Whit...","Canada, United States",TV-MA,"Horror Movies, Thrillers",0.370625
969,Creep 2,Patrick Brice,"Mark Duplass, Desiree Akhavan, Karan Soni",United States,TV-MA,"Horror Movies, Independent Movies, Thrillers",0.356348
1077,Desolation,Sam Patton,"Jaimi Paige, Alyshia Ochse, Toby Nichols, Clau...",United States,TV-MA,"Horror Movies, Thrillers",0.356348


In [None]:
recommender('After Life')

Unnamed: 0,title,director,cast,country,rating,listed_in,cos_sim
468,Extras,,"Ricky Gervais, Stephen Merchant, Ashley Jensen...","United Kingdom, United States",TV-MA,"British TV Shows, TV Comedies",0.526235
1468,The Blue Planet: A Natural History of the Oceans,Alastair Fothergill,David Attenborough,United Kingdom,TV-G,"British TV Shows, Docuseries, International TV...",0.452911
578,Grand Designs,,Kevin McCloud,United Kingdom,TV-14,"British TV Shows, International TV Shows, Real...",0.452911
1007,My Hotter Half,,Melvin Odoom,United Kingdom,TV-PG,"British TV Shows, International TV Shows, Real...",0.452911
62,Ainsley Eats the Streets,,Ainsley Harriott,United Kingdom,TV-PG,"British TV Shows, Docuseries, International TV...",0.452911


### #방법2) Movie & tv show 텍스트 묘사 기반 추천 엔진 개발


In [None]:
movies_des = data[data['type'] == 'Movie'].reset_index()
movies_des = movies_des[['title', 'description']]
movies_des.head()

Unnamed: 0,title,description
0,7:19,After a devastating earthquake hits Mexico Cit...
1,23:59,"When an army recruit is found dead, his fellow..."
2,9,"In a postapocalyptic world, rag-doll robots hi..."
3,21,A brilliant group of students become card-coun...
4,122,"After an awful accident, a couple admitted to ..."


In [None]:
tv_des = data[data['type'] == 'TV Show'].reset_index()
tv_des = tv_des[['title', 'description']]
tv_des.head()

Unnamed: 0,title,description
0,3%,In a future where the elite inhabit an island ...
1,46,A genetics professor experiments with a treatm...
2,1983,"In this dark alt-history thriller, a naïve law..."
3,​SAINT SEIYA: Knights of the Zodiac,Seiya and the Knights of the Zodiac rise again...
4,#blackAF,Kenya Barris and his family navigate relations...


In [None]:
import nltk
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

In [None]:
# description 전처리: 토큰화 / 불용어 제거 후 각 단어 토큰들을 binary화 해서 리스트로
filtered_movies = []
movies_words = []

for text in movies_des['description']:
    text_tokens = word_tokenize(text) # description 문장 -> 토큰화한 단어 리스트로 반환
    # text_tokens 소문자로 변환하고 stopwords 제거
    tokens_without_sw = [word.lower() for word in text_tokens if not word in stopwords.words()]
    movies_words.append(tokens_without_sw) # 각 영화에 사용된 단어 토큰 저장
    filtered = (" ").join(tokens_without_sw)
    filtered_movies.append(filtered) # 각 영화에 사됻된 단어토큰 합쳐서 다시 문장으로 저장(대소문자/불용어 제거 전처리 적용)
    
movies_words = [val for sublist in movies_words for val in sublist]
movies_words = sorted(set(movies_words))

# 필터링한 문장을 description_filtered'로 movies의 열에 새로 추가
movies_des['description_filtered'] = filtered_movies
movies_des.head()
    

Unnamed: 0,title,description,description_filtered
0,7:19,After a devastating earthquake hits Mexico Cit...,after devastating earthquake hits mexico city ...
1,23:59,"When an army recruit is found dead, his fellow...","when army recruit found dead , fellow soldiers..."
2,9,"In a postapocalyptic world, rag-doll robots hi...","in postapocalyptic world , rag-doll robots hid..."
3,21,A brilliant group of students become card-coun...,a brilliant group students become card-countin...
4,122,"After an awful accident, a couple admitted to ...","after awful accident , couple admitted grisly ..."


In [None]:
movie_word_binary = [[0]*0 for i in range(len(set(movies_words)))]

for des in movies_des['description_filtered']:
    k=0
    for word in movies_words:
        if word in des:
            movie_word_binary[k].append(1.0)
        else:
            movie_word_binary[k].append(0.0)
        k+=1
        
movie_word_binary = pd.DataFrame(movie_word_binary).transpose()
print(movie_word_binary.shape)

(4761, 14577)


In [None]:
# tv despcription에 대해서도 movie와 똑같은 filtering(stop words 제거 전처리)수행

filtered_tv = []
tv_words = []

for text in tv_des['description']:
    text_tokens = word_tokenize(text)
    tokens_without_sw = [word.lower() for word in text_tokens if not word in stopwords.words()]
    tv_words.append(tokens_without_sw)
    filtered = (" ").join(tokens_without_sw)
    filtered_tv.append(filtered)

tv_words = [val for sublist in tv_words for val in sublist]
tv_words = sorted(set(tv_words))
tv_des['description_filtered'] = filtered_tv
tv_des.head()

# tv 단어 binary 벡터로 변환

tv_word_binary = [[0]*0 for i in range(len(set(tv_words)))]

for des in tv_des['description_filtered']:
    k=0
    for word in tv_words:
        if word in des:
            tv_word_binary[k].append(1.0)
        else:
            tv_word_binary[k].append(0.0)
        k+=1
        
tv_word_binary = pd.DataFrame(tv_word_binary).transpose()
print(tv_word_binary.shape)

(1891, 8866)


In [None]:
# 방법1 에서와 같은 코사인 유사도 비교 recommender함수 정의

def recommender2(search):
    cs_list = []
    binary_list = []
    
    if search in movies_des['title'].values:
        idx = movies_des[movies_des['title']==search].index.item()
        for i in movie_word_binary.iloc[idx]:
            binary_list.append(i)
        point1 = np.array(binary_list).reshape(1, -1)
        point1 = [val for sublist in point1 for val in sublist]
        for j in range(len(movies_des)):
            binary_list2 = []
            for k in movie_word_binary.iloc[j]:
                binary_list2.append(k)
            point2 = np.array(binary_list2).reshape(1,-1)
            point2 = [val for sublist in point2 for val in sublist]
            dot_product = np.dot(point1, point2)
            norm_1 = np.linalg.norm(point1)
            norm_2 = np.linalg.norm(point2)
            cos_sim = dot_product / (norm_1*norm_2)
            cs_list.append(cos_sim)
            
        movies_copy = movies_des.copy()
        movies_copy['cos_sim'] = cs_list
        results = movies_copy.sort_values('cos_sim', ascending=False)
        results = results[results['title'] != search]
        top_results = results.head(5)
        return(top_results)
    
    elif search in tv_des['title'].values:
        idx = tv_des[tv_des['title'] == search].index.item()
        for i in tv_word_binary.iloc[idx]:
            binary_list.append(i)
        point1 = np.array(binary_list).reshape(1, -1)
        point1 = [val for sublist in point1 for val in sublist]
        for j in range(len(tv)):
            binary_list2 = []
            for k in tv_word_binary.iloc[j]:
                binary_list2.append(k)
            point2 = np.array(binary_list2).reshape(1, -1)
            point2 = [val for sublist in point2 for val in sublist]
            dot_product = np.dot(point1, point2)
            norm_1 = np.linalg.norm(point1)
            norm_2 = np.linalg.norm(point2)
            cos_sim = dot_product / (norm_1 * norm_2)
            cs_list.append(cos_sim)
        tv_copy = tv_des.copy()
        tv_copy['cos_sim'] = cs_list
        results = tv_copy.sort_values('cos_sim', ascending=False)
        results = results[results['title'] != search]    
        top_results = results.head(5)
        return(top_results)
    else:
        return("Title not in dataset. Please check spelling.")
            

In [None]:
pd.options.display.max_colwidth = 300
recommender2('The Conjuring')

Unnamed: 0,title,description,description_filtered,cos_sim
1632,Hard Lessons,"This drama based on real-life events tells the story of George McKenna, the tough, determined new principal of a notorious Los Angeles high school.","this drama based real-life events tells story george mckenna , tough , determined new principal notorious los angeles high school .",0.489419
3335,Sat Sri Akal,"Based on true events, this moving story centers on a Punjabi family whose celebration of their faith endures in the face of conflicting attitudes.","based true events , moving story centers punjabi family whose celebration faith endures conflicting attitudes .",0.47865
2549,Mirai,"Unhappy after his new baby sister displaces him, four-year-old Kun begins meeting people and pets from his family's history in their unique house.","unhappy new baby sister displaces , four-year-old kun begins meeting people pets family 's history unique house .",0.478345
3910,The Eyes of My Mother,"At the remote farmhouse where she once witnessed a traumatic childhood event, a young woman develops a grisly fascination with violence.","at remote farmhouse witnessed traumatic childhood event , young woman develops grisly fascination violence .",0.470605
3578,Standoff,"After witnessing an assassin's slaughter, a young girl holes up in a farmhouse with a suicidal vet, who must use wits and guts to fend off the killer.","after witnessing assassin 's slaughter , young girl holes farmhouse suicidal vet , must use wits guts fend killer .",0.460628


In [None]:
recommender2('Wild Child')

Unnamed: 0,title,description,description_filtered,cos_sim
1319,Fanatyk,"As a son deals with his own struggles, he must calm his father's obsession with fishing before his outlandish behavior ruins the entire family.","as deals struggles , must calm father 's obsession fishing outlandish behavior ruins entire family .",0.49794
1217,Either Me Or My Auntie,"A musician's marriage proposal to his girlfriend is denied by her mother, whose affinity for magic begins to meddle in their relationship even more.","a musician 's marriage proposal girlfriend denied mother , whose affinity magic begins meddle relationship even .",0.468979
2694,My Own Man,"When a man discovers he will be the father to a boy, his fear and insecurities send him on an emotional, humorous quest for his own manhood.","when discovers father boy , fear insecurities send emotional , humorous quest manhood .",0.453565
2554,Misfit,"After living in America for years, a teenage girl moves back to the Netherlands and is quickly singled out as a misfit by the popular clique at school.","after living america years , teenage girl moves back netherlands quickly singled misfit popular clique school .",0.448014
1288,Evvarikee Cheppoddu,"When caste differences throw a wrench into their otherwise blossoming relationship, a couple must somehow convince the girl’s father to let them marry.","when caste differences throw wrench otherwise blossoming relationship , couple must somehow convince girl ’ father let marry .",0.44663


In [None]:
'''
pd.groupby('key').count: 'key'항목으로 묶어서 수를 세어준다
pd.groupby('key').mean: 'key'항목으로 묶어서 다른 속성 값의 평균을 계산한다. 

pd.notna : dataframe이나 seires에 적용하여 dataframe이나 series에 있는 값들이 누락값(NaN, null 등)인지를 체크합니다.
-> 누락값이면 False, 누락값이 아닌 정상적인 값이 입력되어 있다면 True를 반환한다. -> bool 데이터프레임 반환

np.linalg.norm: linalg는 linear algebra 넘파이 라이브러리의 선형대수 함수 계산 툴
-> 단위행렬 (Unit matrix): np.eye(n)
-> 대각행렬 (Diagonal matrix): np.diag(x)
-> 내적 (Dot product, Inner product): np.dot(a, b)
-> 대각합 (Trace): np.trace(x)
-> 행렬식 (Matrix Determinant): np.linalg.det(x)
-> 역행렬 (Inverse of a matrix): np.linalg.inv(x)
-> 고유값 (Eigenvalue), 고유벡터 (Eigenvector): w, v = np.linalg.eig(x)
-> 특이값 분해 (Singular Value Decomposition): u, s, vh = np.linalg.svd(A)
-> 연립방정식 해 풀기 (Solve a linear matrix equation): np.linalg.solve(a, b)
-> 최소자승 해 풀기 (Compute the Least-squares solution): m, c = np.linalg.lstsq(A, y, rcond=None)[0]
등등...

stop words: 데이터 해석에서 큰 의미가 없는 토큰을 의미한다. 
보통 자연언어처리, 데이터 분석에서는 stopwords를 제거하고 사용한다. 
nltk는 관사/부정사/be동사 등 100여개의 단어를 미리 불용어로 지정해두고 사용한다.

word_tokenized: nltk에 정의된 단어 토크나이저. word 단위로 토큰화 한다.(dosent -> does, n't)
'''