In [1]:
# TF-IDF: scores how important a word is within a particular document.
# TF (Term Frequency): how often a given term appears in a given document.
#   TF = (occurrences of term t in document d) / (total number of words in d)
#   e.g. a term that appears 3 times in a 100-word document has TF = 3/100 = 0.03
# IDF (Inverse Document Frequency): how rarely a term appears across the documents.
#   IDF(t) = log(N / df(t)), with N = number of documents and
#   df(t) = number of documents that contain t.
# NOTE: scikit-learn's TfidfVectorizer defaults to a smoothed variant,
#   idf(t) = ln((1 + N) / (1 + df(t))) + 1, followed by L2 normalisation of each row,
#   so the numbers it produces differ from the textbook formula above.

In [2]:
# Toy corpus: four short Korean sentences, one string per document.
docs = [
    '먹고 싶은 사과',  # "an apple I want to eat"
    '먹고 싶은 바나나',  # "a banana I want to eat"
    '길고 노란 바나나 바나나',  # "long yellow banana banana" (the word banana appears twice)
    '저는 과일이 좋아요'  # "I like fruit"
]

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer created with default settings (not fitted yet).
vect = CountVectorizer()

In [4]:
# Learn the vocabulary from `docs` and build the document-term count matrix in one step.
countvect = vect.fit_transform(docs)
countvect  # displayed as a sparse matrix: one row per document, one column per term

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [5]:
# Convert the sparse count matrix to a dense array so the individual counts are visible.
countvect.toarray()

array([[0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 2, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 1]])

In [6]:
# Mapping from each learned term to its column index in the count matrix.
vect.vocabulary_

{'먹고': 3,
 '싶은': 6,
 '사과': 5,
 '바나나': 4,
 '길고': 1,
 '노란': 2,
 '저는': 7,
 '과일이': 0,
 '좋아요': 8}

In [7]:
# Terms in sorted order; for this corpus that is also the column order
# (the indices in vocabulary_ run 0-8 in the same sequence).
sorted(vect.vocabulary_)

['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']

In [8]:
import pandas as pd
# Wrap the dense counts in a DataFrame whose columns are labelled with the
# vocabulary terms (sorted, which matches the matrix column order).
countvect_df = pd.DataFrame(
    data=countvect.toarray(),
    columns=sorted(vect.vocabulary_),
)
countvect_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the documents' raw count vectors.
# With the second argument omitted, the rows are compared against themselves.
cosine_similarity(countvect_df)

array([[1.        , 0.66666667, 0.        , 0.        ],
       [0.66666667, 1.        , 0.47140452, 0.        ],
       [0.        , 0.47140452, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

In [10]:
# Run TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: this rebinds `vect`, which referred to the CountVectorizer before this cell runs.
vect = TfidfVectorizer()
# fit() learns the vocabulary and IDF weights from `docs`.
# NOTE(review): fit() appears to return the vectorizer itself, so `tfvect` and
# `vect` would be the same object — confirm.
tfvect = vect.fit(docs)

In [11]:
# TF-IDF weights as a table: one labelled row per document, one column per term.
tfidv_df = pd.DataFrame(
    tfvect.transform(docs).toarray(),
    index=['문서1','문서2','문서3','문서4'],
    columns=sorted(vect.vocabulary_),
)
tfidv_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
문서1,0.0,0.0,0.0,0.526405,0.0,0.667679,0.526405,0.0,0.0
문서2,0.0,0.0,0.0,0.57735,0.57735,0.0,0.57735,0.0,0.0
문서3,0.0,0.47212,0.47212,0.0,0.74445,0.0,0.0,0.0,0.0
문서4,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735


In [12]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the documents' TF-IDF vectors
# (second argument omitted, so the rows are compared against themselves).
cosine_similarity(tfidv_df)

array([[1.        , 0.60784064, 0.        , 0.        ],
       [0.60784064, 1.        , 0.42980824, 0.        ],
       [0.        , 0.42980824, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

In [13]:
# Refit TF-IDF with the vocabulary limited to at most 4 terms (max_features=4).
# Both `vect` and `tfvect` are rebound here, replacing the unrestricted vectorizer.
vect = TfidfVectorizer(max_features=4)
tfvect = vect.fit(docs)

In [14]:
# TF-IDF table for the reduced vocabulary, with the same document labels as rows.
tfidv_df = pd.DataFrame(
    tfvect.transform(docs).toarray(),
    index=['문서1','문서2','문서3','문서4'],
    columns=sorted(vect.vocabulary_),
)
tfidv_df

Unnamed: 0,과일이,먹고,바나나,싶은
문서1,0.0,0.707107,0.0,0.707107
문서2,0.0,0.57735,0.57735,0.57735
문서3,0.0,0.0,1.0,0.0
문서4,1.0,0.0,0.0,0.0


In [15]:
# Data (download link)
# https://drive.google.com/file/d/1qUg4IlPskuzUF7f9KD_HsKYQy12ZpqKg/view?usp=sharing

In [16]:
# !unzip /content/drive/MyDrive/data/the_movie_data.zip

In [17]:
# Load the movie metadata table from a hard-coded absolute path.
# NOTE(review): the path looks environment-specific — adjust it if the CSV lives elsewhere.
data = pd.read_csv('/content/movies_metadata.csv',low_memory=False)
data.head()  # preview the first five rows

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [18]:
# List the available columns; `overview` and `title` are the ones used below.
data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [19]:
# Keep only rows that have an overview (other columns may still contain NaN),
# renumber the remaining rows from 0, and keep the first 10,000 of them.
data = (
    data
    .dropna(subset=['overview'])
    .reset_index(drop=True)
    .iloc[:10000]
)

In [20]:
# Remove English stop words and cap the vocabulary at 10,000 terms
tfidf = TfidfVectorizer(stop_words='english',max_features=10000)
# Fit TF-IDF on the `overview` text and transform it in one step
tfidf_matrix = tfidf.fit_transform(data['overview'])
print(tfidf_matrix.shape)  # (number of movies, number of terms)

(10000, 10000)


In [21]:
from sklearn.metrics.pairwise import cosine_similarity
# Movie-by-movie cosine similarity of the overview TF-IDF vectors.
# The second argument is omitted, so every row is compared with every other row.
cosine_matrix = cosine_similarity(tfidf_matrix)

In [22]:
# Sanity check: the similarity matrix should be square (movies x movies).
cosine_matrix.shape

(10000, 10000)

In [25]:
import numpy as np
# Show the similarity matrix rounded to four decimals for readability;
# `cosine_matrix` itself is left unchanged.
cosine_matrix.round(4)

array([[1.    , 0.018 , 0.    , ..., 0.    , 0.    , 0.    ],
       [0.018 , 1.    , 0.0559, ..., 0.    , 0.    , 0.    ],
       [0.    , 0.0559, 1.    , ..., 0.    , 0.    , 0.    ],
       ...,
       [0.    , 0.    , 0.    , ..., 1.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 1.    , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 1.    ]])

In [26]:
# Build a dictionary that maps each row position to its movie title.
# (Despite the name, `movie2id` is keyed by the integer position, not by the title.)
movie2id = dict(enumerate(data['title']))
movie2id

{0: 'Toy Story',
 1: 'Jumanji',
 2: 'Grumpier Old Men',
 3: 'Waiting to Exhale',
 4: 'Father of the Bride Part II',
 5: 'Heat',
 6: 'Sabrina',
 7: 'Tom and Huck',
 8: 'Sudden Death',
 9: 'GoldenEye',
 10: 'The American President',
 11: 'Dracula: Dead and Loving It',
 12: 'Balto',
 13: 'Nixon',
 14: 'Cutthroat Island',
 15: 'Casino',
 16: 'Sense and Sensibility',
 17: 'Four Rooms',
 18: 'Ace Ventura: When Nature Calls',
 19: 'Money Train',
 20: 'Get Shorty',
 21: 'Copycat',
 22: 'Assassins',
 23: 'Powder',
 24: 'Leaving Las Vegas',
 25: 'Othello',
 26: 'Now and Then',
 27: 'Persuasion',
 28: 'The City of Lost Children',
 29: 'Shanghai Triad',
 30: 'Dangerous Minds',
 31: 'Twelve Monkeys',
 32: 'Babe',
 33: 'Carrington',
 34: 'Dead Man Walking',
 35: 'Across the Sea of Time',
 36: 'It Takes Two',
 37: 'Clueless',
 38: 'Cry, the Beloved Country',
 39: 'Richard III',
 40: 'Dead Presidents',
 41: 'Restoration',
 42: 'Mortal Kombat',
 43: 'To Die For',
 44: 'How To Make An American Quilt',
 

In [33]:
# Same position -> title mapping, taken straight from the Series (index -> value).
movie2id = data['title'].to_dict()
movie2id

{0: 'Toy Story',
 1: 'Jumanji',
 2: 'Grumpier Old Men',
 3: 'Waiting to Exhale',
 4: 'Father of the Bride Part II',
 5: 'Heat',
 6: 'Sabrina',
 7: 'Tom and Huck',
 8: 'Sudden Death',
 9: 'GoldenEye',
 10: 'The American President',
 11: 'Dracula: Dead and Loving It',
 12: 'Balto',
 13: 'Nixon',
 14: 'Cutthroat Island',
 15: 'Casino',
 16: 'Sense and Sensibility',
 17: 'Four Rooms',
 18: 'Ace Ventura: When Nature Calls',
 19: 'Money Train',
 20: 'Get Shorty',
 21: 'Copycat',
 22: 'Assassins',
 23: 'Powder',
 24: 'Leaving Las Vegas',
 25: 'Othello',
 26: 'Now and Then',
 27: 'Persuasion',
 28: 'The City of Lost Children',
 29: 'Shanghai Triad',
 30: 'Dangerous Minds',
 31: 'Twelve Monkeys',
 32: 'Babe',
 33: 'Carrington',
 34: 'Dead Man Walking',
 35: 'Across the Sea of Time',
 36: 'It Takes Two',
 37: 'Clueless',
 38: 'Cry, the Beloved Country',
 39: 'Richard III',
 40: 'Dead Presidents',
 41: 'Restoration',
 42: 'Mortal Kombat',
 43: 'To Die For',
 44: 'How To Make An American Quilt',
 

In [34]:
# Reverse lookup: movie title -> row position.
# Titles are not unique, so when a title occurs more than once the last
# position overwrites the earlier ones.
id2movie = {title: position for position, title in movie2id.items()}
id2movie

{'Toy Story': 0,
 'Jumanji': 1,
 'Grumpier Old Men': 2,
 'Waiting to Exhale': 3,
 'Father of the Bride Part II': 4,
 'Heat': 5,
 'Sabrina': 876,
 'Tom and Huck': 7,
 'Sudden Death': 8,
 'GoldenEye': 9,
 'The American President': 10,
 'Dracula: Dead and Loving It': 11,
 'Balto': 12,
 'Nixon': 13,
 'Cutthroat Island': 14,
 'Casino': 15,
 'Sense and Sensibility': 16,
 'Four Rooms': 17,
 'Ace Ventura: When Nature Calls': 18,
 'Money Train': 19,
 'Get Shorty': 20,
 'Copycat': 21,
 'Assassins': 22,
 'Powder': 23,
 'Leaving Las Vegas': 24,
 'Othello': 2715,
 'Now and Then': 26,
 'Persuasion': 27,
 'The City of Lost Children': 28,
 'Shanghai Triad': 29,
 'Dangerous Minds': 30,
 'Twelve Monkeys': 31,
 'Babe': 32,
 'Carrington': 33,
 'Dead Man Walking': 34,
 'Across the Sea of Time': 35,
 'It Takes Two': 36,
 'Clueless': 37,
 'Cry, the Beloved Country': 38,
 'Richard III': 7161,
 'Dead Presidents': 40,
 'Restoration': 41,
 'Mortal Kombat': 42,
 'To Die For': 43,
 'How To Make An American Quilt':

In [38]:
# Rank every other movie by its cosine similarity to 'Toy Story'.
idx = id2movie['Toy Story']
sin_scores = sorted(
    # (row position, similarity) pairs, leaving out the query movie itself
    ((other, score) for other, score in enumerate(cosine_matrix[idx]) if other != idx),
    key=lambda pair: pair[1],
    reverse=True,
)
sin_scores[:10]

[(2979, 0.4459238112286962),
 (8303, 0.2233385403549883),
 (6920, 0.2108074471558198),
 (1058, 0.19689781959829122),
 (5776, 0.19400634012535017),
 (1916, 0.17060012383134837),
 (483, 0.16369887578608464),
 (3039, 0.15050716355723778),
 (5597, 0.15035167699522478),
 (7591, 0.14809464568562838)]

In [42]:
# Translate the ten best-scoring row positions back into movie titles.
[(movie2id[position], score) for position, score in sin_scores[:10]]

[('Toy Story 2', 0.4459238112286962),
 ('The Champ', 0.2233385403549883),
 ('Rivers and Tides', 0.2108074471558198),
 ('Rebel Without a Cause', 0.19689781959829122),
 ('Class of 1984', 0.19400634012535017),
 ('Condorman', 0.17060012383134837),
 ('Malice', 0.16369887578608464),
 ('Man on the Moon', 0.15050716355723778),
 ('Heartbeeps', 0.15035167699522478),
 ('The First $20 Million Is Always the Hardest', 0.14809464568562838)]

In [51]:
# Recommender based on cosine similarity over TF-IDF vectors.
# Show the first 20 titles so a row number can be picked in the next step.
data['title'][:20]

Unnamed: 0,title
0,Toy Story
1,Jumanji
2,Grumpier Old Men
3,Waiting to Exhale
4,Father of the Bride Part II
5,Heat
6,Sabrina
7,Tom and Huck
8,Sudden Death
9,GoldenEye


In [54]:
# Ask for a movie number (row position in `data`) and recommend the 10 most similar movies.
movie_num = int(input('추천받을 영화 번호를 입력하세요'))
movie_title = data['title'][movie_num]  # raises KeyError if the number is not a valid row label
print(movie_title)
# Cosine-similarity recommendation.
# Use the entered row number directly as the index into the similarity matrix.
# Going number -> title -> id2movie[title] is wrong for titles that occur more
# than once: id2movie keeps only one position per title, so a different movie
# with the same title could be used as the query.
idx = movie_num
# (row position, similarity) for every movie except the query itself, best first
sin_scores = [(i, c) for i, c in enumerate(cosine_matrix[idx]) if i != idx]
sin_scores = sorted(sin_scores, key=lambda x: x[1], reverse=True)
pd.DataFrame(
    [(movie2id[position], score) for position, score in sin_scores[0:10]],
    columns=['제목','유사도 점수'],
)

추천받을 영화 번호를 입력하세요19
Money Train


Unnamed: 0,제목,유사도 점수
0,Shakedown,0.254589
1,Showtime,0.249649
2,Code of Silence,0.23773
3,Serpico,0.237401
4,Next of Kin,0.212369
5,Wolfen,0.207181
6,Romeo Is Bleeding,0.188724
7,Oxygen,0.188233
8,True Confessions,0.188093
9,Hear No Evil,0.182845


# Word2Vec 알고리즘

In [55]:
from gensim.models import Word2Vec

In [59]:
# Tiny tokenised corpus: each inner list is one sentence, already split into words.
sentence = [
    ["i","love","machine","learning"],
    ["i","enjoy","deep","learning"],
    ["i","love","coding","in","Python"],
]
# Train Word2Vec on the toy corpus (100-dimensional vectors, window of 5,
# min_count=1 so that words seen only once are kept).
# NOTE(review): with workers=4 the trained vectors may vary between runs — confirm
# against the gensim documentation if reproducible numbers are needed.
model = Word2Vec(sentence,vector_size=100, window=5,min_count=1,workers=4)

# Vector for the word 'love' (stored, not displayed)
vector_love = model.wv['love']
# Similarity between 'machine' and 'learning' (stored, not displayed)
similarity = model.wv.similarity('machine','learning')
# Words most similar to 'machine' (this is the cell's displayed output)
model.wv.most_similar('machine')


[('love', 0.19912061095237732),
 ('Python', 0.07497556507587433),
 ('coding', 0.060591842979192734),
 ('learning', 0.03364058583974838),
 ('i', 0.027057481929659843),
 ('deep', 0.008826158009469509),
 ('in', -0.06900332123041153),
 ('enjoy', -0.14454564452171326)]

### 영화 추천시스템 Word2Vec

In [62]:
# Load the ratings table from a hard-coded absolute path.
# NOTE: despite its name, `movie` holds the rows of ratings.csv, not movie metadata.
path = '/content/ratings.csv'
movie = pd.read_csv(path,low_memory=False)
movie.head()  # preview the first five rows

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [66]:
# Order the ratings chronologically and renumber the rows from 0.
movie = (
    movie
    .sort_values(by='timestamp')
    .reset_index(drop=True)
)
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,38150,1176,4.0,789652004
1,44717,1079,3.0,789652009
2,44717,47,5.0,789652009
3,44717,21,3.0,789652009
4,112461,57,4.0,822873600
