In [1]:
# Toy corpus: each document is a whitespace-separated string.
docs = [
    'python 데이터',
    '데이터 분석',
    '빅 데이터 분석',
    '데이터 전처리'
]

# Vocabulary = every distinct whitespace token in the corpus, in sorted order.
vocab = sorted({token for doc in docs for token in doc.split()})
vocab

['python', '데이터', '분석', '빅', '전처리']

In [2]:
from math import log

N = len(docs)

def tf(t, d):
    """Return the term frequency of term ``t`` in document ``d``.

    Args:
        t: The term (a single whitespace-delimited token).
        d: The document, as a whitespace-separated string.

    Returns:
        The number of tokens in ``d`` that are exactly equal to ``t``.
    """
    # Count whole tokens rather than substrings: str.count would also match
    # '데이터' inside '빅데이터', which disagrees with how the vocabulary is
    # built (via str.split).
    return d.split().count(t)

def idf(t, corpus=None):
    """Return the smoothed inverse document frequency of term ``t``.

    Computed as ``log(n / (df + 1))`` where ``n`` is the number of documents
    and ``df`` is the number of documents containing ``t`` as a whole token.

    Args:
        t: The term (a single whitespace-delimited token).
        corpus: Optional list of document strings. Defaults to the
            notebook-level ``docs`` list.

    Returns:
        The IDF as a float. Because of the +1 in the denominator, the value
        is negative when ``t`` occurs in every document.
    """
    if corpus is None:
        corpus = docs
    # Membership is tested on tokens, not substrings, so '데이터' does not
    # match a document that only contains '빅데이터'.
    df = sum(t in doc.split() for doc in corpus)
    return log(len(corpus) / (df + 1))  # +1 keeps the denominator non-zero

def tfidf(t, d):
    """Return the TF-IDF weight of term ``t`` in document ``d``.

    The weight is the product of ``tf(t, d)`` and ``idf(t)``; a larger value
    suggests the term may be more important for that document.
    """
    term_frequency = tf(t, d)
    inverse_document_frequency = idf(t)
    return term_frequency * inverse_document_frequency

In [3]:
import pandas as pd

# Term-frequency matrix: one row per document, one column per vocabulary term.
result = [[tf(term, docs[row]) for term in vocab] for row in range(N)]

print(result)
df_tf = pd.DataFrame(result, columns=vocab)
df_tf

[[1, 1, 0, 0, 0], [0, 1, 1, 0, 0], [0, 1, 1, 1, 0], [0, 1, 0, 0, 1]]


Unnamed: 0,python,데이터,분석,빅,전처리
0,1,1,0,0,0
1,0,1,1,0,0
2,0,1,1,1,0
3,0,1,0,0,1


In [4]:
# One IDF value per vocabulary term, shown as a single-column table.
result = [idf(term) for term in vocab]

df_idf = pd.DataFrame(result, index=vocab, columns=["IDF"])
df_idf

Unnamed: 0,IDF
python,0.693147
데이터,-0.223144
분석,0.287682
빅,0.693147
전처리,0.693147


In [5]:
# TF-IDF matrix: rows are documents, columns are vocabulary terms.
result = [[tfidf(term, docs[row]) for term in vocab] for row in range(N)]

df_tfidf = pd.DataFrame(result, columns=vocab)
df_tfidf

Unnamed: 0,python,데이터,분석,빅,전처리
0,0.693147,-0.223144,0.0,0.0,0.0
1,0.0,-0.223144,0.287682,0.0,0.0
2,0.0,-0.223144,0.287682,0.693147,0.0
3,0.0,-0.223144,0.0,0.0,0.693147


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'you know I want your love',
    'I like you',
    'what should I do'
]

# Learn the vocabulary first, then encode the same corpus as a
# document-term count matrix.
# Note: the printed vocabulary has no entry for the single-character token 'I'.
vector = CountVectorizer().fit(corpus)
print(vector.transform(corpus).toarray())
print(vector.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'python 데이터',
    '데이터 분석',
    '빅 데이터 분석',
    '데이터 전처리'
]

# NOTE(review): the printed vocabulary contains only four terms; the
# single-character token '빅' is missing, so documents 1 and 2 get identical
# rows. Presumably the default token pattern keeps only tokens of two or more
# characters; confirm against the scikit-learn docs.
tfidfv = TfidfVectorizer()
tfidfv.fit(corpus)
print(tfidfv.transform(corpus).toarray())
print(tfidfv.vocabulary_)

[[0.88654763 0.46263733 0.         0.        ]
 [0.         0.55193942 0.83388421 0.        ]
 [0.         0.55193942 0.83388421 0.        ]
 [0.         0.46263733 0.         0.88654763]]
{'python': 0, '데이터': 1, '분석': 2, '전처리': 3}


In [None]:
# Similarity measures: Euclidean distance, Jaccard similarity, cosine similarity

In [8]:
import numpy as np

def dist(x, y):
    """Return the Euclidean distance between two equal-length vectors.

    Args:
        x: First vector (NumPy array or any array-like such as a list/tuple).
        y: Second vector, same length as ``x``.

    Returns:
        ``sqrt(sum((x - y) ** 2))`` as a NumPy float.
    """
    # asarray lets plain lists/tuples be subtracted element-wise; NumPy
    # arrays pass through unchanged.
    diff = np.asarray(x) - np.asarray(y)
    return np.sqrt(np.sum(diff ** 2))

# Three example document vectors.
doc0 = np.array((1, 1, 0, 1))
doc1 = np.array((2, 3, 0, 1))
doc2 = np.array((1, 2, 3, 1))

# Distance from doc0 to each of the other two documents, one per line.
print(dist(doc0, doc1), dist(doc0, doc2), sep="\n")

2.23606797749979
3.1622776601683795


In [17]:
# Jaccard similarity: the share of words that occur in both documents,
# out of all distinct words in the two documents combined.

doc1 = "python 파이썬 데이터"
doc2 = "빅데이터 python 파이썬"

# Tokenize on whitespace.
tokenized_doc1, tokenized_doc2 = doc1.split(), doc2.split()

print(tokenized_doc1, tokenized_doc2, sep="\n")

['python', '파이썬', '데이터']
['빅데이터', 'python', '파이썬']


In [18]:
# Union: every distinct word that appears in either document.

union = set(tokenized_doc1) | set(tokenized_doc2)
print(union)

{'빅데이터', '파이썬', 'python', '데이터'}


In [24]:
# Intersection: words that appear in both documents.

intersection = set(tokenized_doc1) & set(tokenized_doc2)
print(intersection)

{'python', '파이썬'}


In [25]:
# Jaccard similarity = |intersection| / |union|
print(len(intersection)/len(union))

0.5


In [26]:
from numpy import dot

# Dot product of two small example vectors.
a, b = [0, 1, 1], [1, 0, 2]
dot(a, b)

2

In [27]:
from math import sqrt
from numpy.linalg import norm

a = [0, 1, 1]
b = [1, 0, 2]
# Each norm is printed next to its closed-form value for comparison.
print(norm(a), sqrt(2), sep="\n")
print(norm(b), sqrt(5), sep="\n")
# Product of the two norms: the denominator of cosine similarity.
print(norm(a) * norm(b))

1.4142135623730951
1.4142135623730951
2.23606797749979
2.23606797749979
3.1622776601683795


In [28]:
# Cosine similarity of a and b: dot product divided by the product of norms.
print(dot(a,b)/(norm(a)*norm(b)))

0.6324555320336759


In [29]:
def cos_sim(A, B):
    """Return the cosine similarity between vectors ``A`` and ``B``.

    Args:
        A: First vector (array-like).
        B: Second vector (array-like), same length as ``A``.

    Returns:
        ``dot(A, B) / (norm(A) * norm(B))``, or ``0.0`` when either vector
        has zero length (the angle is undefined in that case).
    """
    denominator = norm(A) * norm(B)
    # Guard against 0/0, which would otherwise yield nan plus a RuntimeWarning.
    if denominator == 0:
        return 0.0
    return dot(A, B) / denominator

import numpy as np

# Example count vectors; doc4 is doc1 scaled by 2.
doc1 = np.array([0, 1, 1, 1])
doc2 = np.array([1, 0, 1, 1])
doc3 = np.array([2, 0, 2, 3])
doc4 = np.array([0, 2, 2, 2])

# Pairwise cosine similarities, one per line.
print(
    cos_sim(doc1, doc2),
    cos_sim(doc1, doc3),
    cos_sim(doc2, doc3),
    cos_sim(doc2, doc4),
    sep="\n",
)

0.6666666666666667
0.7001400420140049
0.9801960588196069
0.6666666666666667


In [30]:
# Recommend movies with similar plot overviews

import pandas as pd

# Load the movie metadata and keep only the first 10,000 rows.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df = pd.read_csv(
    "/Users/ian/Desktop/Study/data/movies/movies_metadata.csv",
    low_memory=False,
).head(10000)

In [31]:
# Number of movies whose overview text is missing.
df['overview'].isnull().sum()

29

In [32]:
# Replace missing overviews with empty strings so every row holds text.
df['overview'] = df['overview'].fillna('')

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF matrix of the overviews, using the built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])
# Shape is (number of movies, vocabulary size).
print(tfidf_matrix.shape)

(10000, 32350)


In [35]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products of the TF-IDF rows.
# NOTE(review): presumably TfidfVectorizer L2-normalises rows by default, which
# makes this dot product equal to cosine similarity; confirm.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Lookup table: movie title -> row index in df.
# Series.drop_duplicates() would compare the (already unique) row indices, so
# duplicated titles would survive and indices[title] could return several rows.
# Deduplicate on the title index instead, keeping the first occurrence.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]
print(indices.head())

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64


In [37]:
# Look up the row index of a movie by its title.
idx = indices['Toy Story']
idx

0

In [38]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies whose overviews are most similar to ``title``.

    Args:
        title: Movie title to look up in the notebook-level ``indices`` Series.
        cosine_sim: Square similarity matrix aligned with the rows of ``df``.
        top_n: Number of recommendations to return (default 10).

    Returns:
        A Series of up to ``top_n`` titles, most similar first.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series; use its first entry.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly instead of assuming it sorts first:
    # ties at the top score, or an all-zero row for an empty overview, can
    # place another movie ahead of it.
    movie_indices = [pos for pos, _ in ranked if pos != idx][:top_n]
    return df['title'].iloc[movie_indices]

# Movies whose overview text is closest to that of 'Toy Story'.
get_recommendations('Toy Story')

2997                                    Toy Story 2
8327                                      The Champ
1071                          Rebel Without a Cause
3057                                Man on the Moon
1932                                      Condorman
485                                          Malice
5797                                  Class of 1984
7254                                 Africa Screams
6944                               Rivers and Tides
7615    The First $20 Million Is Always the Hardest
Name: title, dtype: object

In [None]:
# Content-based filtering

In [39]:
import pandas as pd

# Load the TMDB 5000 movies file (rebinds `df`, replacing the earlier metadata frame).
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df = pd.read_csv("/Users/ian/Desktop/Study/data/movies/tmdb_5000_movies.csv")
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [40]:
# Keep only the columns used below. The explicit .copy() makes the subset an
# independent frame, so the column assignments in the following cells do not
# raise SettingWithCopyWarning.
df = df[['id', 'keywords', 'genres', 'title', 'vote_average', 'popularity', 'vote_count', 'overview']].copy()

In [41]:
from ast import literal_eval

# The genres/keywords columns hold list-of-dict literals stored as text.
# Parse them with ast.literal_eval rather than eval: it accepts only Python
# literals, so text read from the CSV can never execute arbitrary code.
df['genres'] = df['genres'].apply(literal_eval)
df['keywords'] = df['keywords'].apply(literal_eval)

In [42]:
# Flatten each movie's list of genre dicts into one space-separated string of genre names.
df['genres'] = df['genres'].apply(
    lambda genre_dicts: " ".join(entry['name'] for entry in genre_dicts))

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

# Encode the genre strings as a dense document-term count matrix (one row per movie).
count_vector = CountVectorizer()
c_vector_genres = count_vector.fit_transform(df['genres']).toarray()

In [44]:
from sklearn.metrics.pairwise import cosine_similarity

# Despite its name, genre_c_sim holds row indices rather than similarity values:
# argsort orders each row ascending and [:, ::-1] reverses it, so each row lists
# movie positions from most to least genre-similar.
genre_c_sim = cosine_similarity(c_vector_genres, c_vector_genres).argsort()[:, ::-1]

In [45]:
def get_recommend_movie_list(data, movie_title, top=5):
    """Return the movies most similar in genre to ``movie_title``.

    Uses the notebook-level ``genre_c_sim`` array, whose rows list movie
    positions from most to least genre-similar.

    Args:
        data: DataFrame with ``title`` and ``vote_average`` columns, in the
            same row order that ``genre_c_sim`` was built from.
        movie_title: Title to look up; if several rows match, the first is used.
        top: Number of similar movies to select (default 5).

    Returns:
        The selected rows of ``data`` sorted by ``vote_average`` descending
        (empty if the title is not found).
    """
    # Work with row positions throughout: both genre_c_sim and .iloc are
    # positional, so this stays correct even without a default RangeIndex.
    matches = np.flatnonzero((data['title'] == movie_title).values)
    if matches.size == 0:
        return data.iloc[[]]
    target = matches[0]

    # Take one extra candidate so that removing the query movie itself still
    # leaves `top` recommendations.
    candidates = genre_c_sim[target, :top + 1]
    candidates = candidates[candidates != target][:top]

    # NOTE(review): the hard cap of 10 rows is kept from the original code; it
    # only takes effect when top > 10. Confirm whether it is intended.
    return data.iloc[candidates].sort_values('vote_average', ascending=False)[:10]

In [46]:
# Inspect the row for the query movie.
df[df['title'] == 'Toy Story']

Unnamed: 0,id,keywords,genres,title,vote_average,popularity,vote_count,overview
1541,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",Animation Comedy Family,Toy Story,7.7,73.640445,5269,"Led by Woody, Andy's toys live happily in his ..."


In [47]:
# Genre-based recommendations for 'Toy Story', ordered by vote_average.
get_recommend_movie_list(df, movie_title='Toy Story')

Unnamed: 0,id,keywords,genres,title,vote_average,popularity,vote_count,overview
231,585,"[{'id': 1299, 'name': 'monster'}, {'id': 3256,...",Animation Comedy Family,"Monsters, Inc.",7.5,106.815545,5996,"James Sullivan and Mike Wazowski are monsters,..."
1555,11836,"[{'id': 270, 'name': 'ocean'}, {'id': 658, 'na...",Animation Comedy Family,The SpongeBob SquarePants Movie,6.7,27.870755,574,There's trouble brewing in Bikini Bottom. Some...
173,65759,"[{'id': 3028, 'name': 'penguin'}, {'id': 4344,...",Animation Comedy Family,Happy Feet Two,5.8,17.7735,373,Mumble the penguin has a problem: his son Erik...
692,9982,"[{'id': 1357, 'name': 'fish'}, {'id': 1415, 'n...",Animation Family Comedy,Chicken Little,5.6,47.973995,944,When the sky really is falling and sanity has ...
766,9513,"[{'id': 212, 'name': 'london england'}, {'id':...",Animation Comedy Family,Garfield: A Tail of Two Kitties,5.1,16.930969,464,Garfield is back and this time Garfield and hi...


In [None]:
# Item-based collaborative filtering

In [48]:
import pandas as pd

# User x movie rating matrix; cells are NaN where a user has not rated a movie.
df = (
    pd.read_csv("/Users/ian/Desktop/Study/data/movies/ratings_small.csv")
    .pivot_table(values="rating", index="userId", columns="movieId")
)
df

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,4.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,4.0,...,,,,,,,,,,
5,,,4.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,,,,,,4.0,,,,,...,,,,,,,,,,
668,,,,,,,,,,,...,,,,,,,,,,
669,,,,,,,,,,,...,,,,,,,,,,
670,4.0,,,,,,,,,,...,,,,,,,,,,


In [49]:
df_ratings = pd.read_csv("/Users/ian/Desktop/Study/data/movies/ratings_small.csv")

# Rename the movie file's `id` column so it can be joined on `movieId`.
df_movies = pd.read_csv(
    "/Users/ian/Desktop/Study/data/movies/tmdb_5000_movies.csv"
).rename(columns={'id': 'movieId'})

# NOTE(review): this joins the ratings' `movieId` directly to the movie file's
# `id`; confirm that both columns use the same ID scheme.
df_ratings_movies = df_ratings.merge(df_movies, on='movieId')

# User x title rating matrix with unrated cells set to 0, then transposed so
# that each row is an item (movie title) and each column a user.
df2 = (
    df_ratings_movies
    .pivot_table('rating', index='userId', columns='title')
    .fillna(0)
    .T
)
df2

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15 Minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Blocks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"You, Me and Dupree",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Young Frankenstein,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
Zodiac,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eXistenZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between movies (the rows of df2).
movie_sim = cosine_similarity(df2, df2)

# Label both axes with the movie titles so similarities can be looked up by name.
df_movie_sim = pd.DataFrame(movie_sim, index=df2.index, columns=df2.index)
df_movie_sim

title,10 Things I Hate About You,12 Angry Men,1408,15 Minutes,16 Blocks,"20,000 Leagues Under the Sea",2001: A Space Odyssey,2046,21 Grams,25th Hour,...,Willy Wonka & the Chocolate Factory,World Trade Center,X-Men Origins: Wolverine,Y Tu Mamá También,You Only Live Twice,"You, Me and Dupree",Young Frankenstein,Zodiac,eXistenZ,xXx
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,1.000000,0.0,0.000000,0.182153,0.0,0.022069,0.085323,0.0,0.00000,0.103490,...,0.059856,0.0,0.161801,0.088076,0.0,0.0,0.097588,0.000000,0.000000,0.014121
12 Angry Men,0.000000,1.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1408,0.000000,0.0,1.000000,0.447214,0.0,0.173381,0.028245,0.0,0.00000,0.000000,...,0.146955,0.0,0.148968,0.140265,0.0,0.0,0.191675,0.000000,0.000000,0.000000
15 Minutes,0.182153,0.0,0.447214,1.000000,0.0,0.077538,0.050526,0.0,0.00000,0.129863,...,0.197160,0.0,0.216516,0.141138,0.0,0.0,0.085720,0.115684,0.121365,0.000000
16 Blocks,0.000000,0.0,0.000000,0.000000,1.0,0.000000,0.000000,0.0,0.00000,0.000000,...,0.000000,0.0,0.130347,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"You, Me and Dupree",0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,1.0,0.000000,0.000000,0.000000,0.000000
Young Frankenstein,0.097588,0.0,0.191675,0.085720,0.0,0.204590,0.115720,0.0,0.00000,0.000000,...,0.230622,0.0,0.423840,0.236086,0.0,0.0,1.000000,0.214856,0.110536,0.204346
Zodiac,0.000000,0.0,0.000000,0.115684,0.0,0.014016,0.222842,0.0,0.00000,0.075115,...,0.359021,0.0,0.288208,0.201826,0.0,0.0,0.214856,1.000000,0.163801,0.105379
eXistenZ,0.000000,0.0,0.000000,0.121365,0.0,0.000000,0.079525,0.0,0.07253,0.000000,...,0.127951,0.0,0.241299,0.070183,0.0,0.0,0.110536,0.163801,1.000000,0.044104


In [51]:
# The four movies most similar to "Zodiac". The [1:5] slice skips position 0,
# which is assumed to be "Zodiac" itself (self-similarity 1.0).
df_movie_sim["Zodiac"].sort_values(ascending=False)[1:5]

title
Crank                                    0.588452
Confessions of a Dangerous Mind          0.529170
Lucky You                                0.471890
The Mummy: Tomb of the Dragon Emperor    0.452902
Name: Zodiac, dtype: float64