<a href="https://colab.research.google.com/github/leejukyu/recommendation_system/blob/main/TF_IDF%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### ** TF-IDF 실습 **



In [None]:
%%time
path = "/content/drive/MyDrive/data"

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs


In [None]:
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

mecab-ko is already installed
mecab-ko-dic is already installed
mecab-python is already installed
Done.


In [None]:
import kss
import re

In [None]:
# 불용어 처리
removal_list =  "‘, ’, ◇, ‘, ”,  ’, ', ·, \“, ·, △, ●,  , ■, (, ), \", >>, `, /, -,∼,=,ㆍ<,>, .,?, !,【,】, …, ◆,%"
def cleansing_special(sentence):
    # 특수문자를 전처리를 하는 함수
    sentence = re.sub("[.,\'\"’‘”“!?]", "", sentence)
    sentence = re.sub("[^가-힣0-9a-zA-Z\\s]", " ", sentence)
    sentence = re.sub("\s+", " ", sentence)
    sentence = sentence.translate(str.maketrans(removal_list, ' '*len(removal_list)))
    sentence = sentence.strip()
    return sentence
def cleansing_numbers(sentence):
    # 숫자를 전처리(delexicalization) 하는 함수    
    sentence = re.sub('[0-9]+', 'NUM', sentence)
    sentence = re.sub('NUM\s+', "NUM", sentence)
    sentence = re.sub('[NUM]+', "NUM", sentence)
    return sentence

In [None]:
import os
from konlpy.tag import Mecab
mecab = Mecab()
all_sentences = []
with open(os.path.join(path, 'news_sample.txt'), 'r', encoding='utf-8') as f:
    for idx, line in enumerate(f.readlines()):
        print(f"---문서 {idx}번---")
        preprocessed = cleansing_numbers(line)
        preprocessed = cleansing_numbers(preprocessed)

        # 명사만 추출(nouns)
        preprocessed_news = ''.join(list(set(mecab.nouns(preprocessed))))

        # 문장으로 분리
        preprocessed_news = ''.join(kss.split_sentences(preprocessed))

        print(preprocessed_news)
        all_sentences.append(preprocessed_news)

Exception: ignored

In [None]:
all_sentences

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

word2id = defaultdict(lambda : 0)
# print(word2id)

tfidf_vectorizer = TfidfVectorizer() # 명사에 가중치
tfidf_matrix = tfidf_vectorizer.fit_transform(all_sentences)
for idx, feature in enumerate(tfidf_vectorizer.get_feature_names()):
    # print(idx, feature)
    word2id[feature] = idx
print(tfidf_matrix)

In [None]:
tfidf_vectorizer.vocabulary_

In [None]:
for idx, line in enumerate(all_sentences):
    print(f'--문서 {idx}번---')
    results = [(token , tfidf_matrix[idx, word2id[token]]) for token in line.split()]
    # 아이템을 요소별로 정렬
    print(results)
    results.sort(key=lambda element : element[1], reverse=True)
    print(results)
    print('\n')

### 영화추천
##### movies_metacdata.csv를 화용고 TF-IDF 알고리즘 활용

In [None]:
%%time
import pandas as pd
movie_data = pd.read_csv('/content/drive/MyDrive/data/movies_metadata.csv', encoding='utf-8')
print(movie_data)

       adult  ... vote_count
0      False  ...     5415.0
1      False  ...     2413.0
2      False  ...       92.0
3      False  ...       34.0
4      False  ...      173.0
...      ...  ...        ...
45461  False  ...        1.0
45462  False  ...        3.0
45463  False  ...        6.0
45464  False  ...        0.0
45465  False  ...        0.0

[45466 rows x 24 columns]
CPU times: user 524 ms, sys: 70.6 ms, total: 595 ms
Wall time: 610 ms




In [None]:
# 결측치 제거
overview_data = movie_data[movie_data['overview'].notnull()].reset_index(drop=True)
overview_data.shape

(44512, 24)

In [None]:
tfidf_vectorizer_overview = TfidfVectorizer(stop_words="english", max_features=10000)
overview_matrix = tfidf_vectorizer_overview.fit_transform(overview_data['overview'])
# print(overview_matrix)
pd.DataFrame(overview_matrix)

Unnamed: 0,0
0,"(0, 2620)\t0.13777030554792585\n (0, 682)\t..."
1,"(0, 2198)\t0.1609551690390955\n (0, 8948)\t..."
2,"(0, 9058)\t0.09826424986547001\n (0, 4384)\..."
3,"(0, 9728)\t0.14029436903788411\n (0, 1028)\..."
4,"(0, 5012)\t0.1922902664705427\n (0, 657)\t0..."
...,...
44507,"(0, 7613)\t0.6777056094355318\n (0, 3388)\t..."
44508,"(0, 8553)\t0.4752455146929435\n (0, 671)\t0..."
44509,"(0, 993)\t0.37185786784776004\n (0, 4297)\t..."
44510,"(0, 8551)\t0.1407509003946999\n (0, 5058)\t..."


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_matrix = cosine_similarity(overview_matrix)
cosine_matrix

array([[1.        , 0.01622149, 0.        , ..., 0.        , 0.00691458,
        0.        ],
       [0.01622149, 1.        , 0.05406299, ..., 0.        , 0.02645526,
        0.0123022 ],
       [0.        , 0.05406299, 1.        , ..., 0.        , 0.01746952,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.00691458, 0.02645526, 0.01746952, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.0123022 , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [None]:
movie2id = {}
for idx, content in enumerate(overview_data['title']):
    movie2id[idx] = content

id2movie = {}
for idx, content in enumerate(overview_data['title']):
    id2movie[content] = idx

In [None]:
overview_data['title'].unique()

array(['Toy Story', 'Jumanji', 'Grumpier Old Men', ...,
       'Century of Birthing', 'Satan Triumphant', 'Queerama'],
      dtype=object)

In [None]:
import numpy as np
idx = id2movie['Toy Story']
np.sort(cosine_matrix[1])[::-1]

similiar_scores = [(i, c) for i, c in enumerate(cosine_matrix[idx]) if i != idx]
similiar_scores = sorted(similiar_scores, key = lambda x: x[1], reverse = True)
similiar_scores[0:10]

[(15282, 0.543890726837358),
 (2979, 0.4598067529219477),
 (24316, 0.33002662508690145),
 (10271, 0.3249936921360821),
 (8303, 0.29889485118380993),
 (23646, 0.27206245876641316),
 (28893, 0.261773403202809),
 (42572, 0.2527459923476682),
 (41893, 0.2393563197889988),
 (37778, 0.22069124670177778)]

In [None]:
sim_title = [(movie2id[i], score) for i, score in similiar_scores[0:5]]
sim_title

idx_convert = {0:'첫번째', 1:'두번째', 2:'세번째', 3:'네번째', 4:'다섯번째'}
for i, data in enumerate(sim_title):
    print(f'{idx_convert[i]} 영화추천은 {data[0]}입니다. 유사도는 {data[1]}')

첫번째 영화추천은 Toy Story 3입니다. 유사도는 0.543890726837358
두번째 영화추천은 Toy Story 2입니다. 유사도는 0.4598067529219477
세번째 영화추천은 Small Fry입니다. 유사도는 0.33002662508690145
네번째 영화추천은 The 40 Year Old Virgin입니다. 유사도는 0.3249936921360821
다섯번째 영화추천은 The Champ입니다. 유사도는 0.29889485118380993


### Word2Vec을 이용한 추천시스템(영화추천)

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
import os

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
path = '/content/drive/MyDrive/data/movielens'

In [None]:
movie = pd.read_csv(os.path.join(path, 'ratings.csv'), low_memory=False)
movie = movie.sort_values(by='timestamp').reset_index(drop=True)
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,429,595,5.0,828124615
1,429,588,5.0,828124615
2,429,590,5.0,828124615
3,429,592,5.0,828124615
4,429,432,3.0,828124615


In [None]:
meta = pd.read_csv(os.path.join(path, 'movies_metadata.csv'), low_memory=False)
meta.columns
meta = meta.rename(columns={'id':'movieId'})
meta.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,movieId,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [None]:
movie['movieId'] = movie['movieId'].astype(str)
meta['movieId'] = meta['movieId'].astype(str)

In [None]:
movie = pd.merge(movie, meta[['movieId', 'original_title']], how='left', on='movieId')
movie.head()

Unnamed: 0,userId,movieId,rating,timestamp,original_title
0,429,595,5.0,828124615,To Kill a Mockingbird
1,429,588,5.0,828124615,Silent Hill
2,429,590,5.0,828124615,The Hours
3,429,592,5.0,828124615,The Conversation
4,429,432,3.0,828124615,


In [None]:
movie = movie[movie['original_title'].notnull()].reset_index(drop=True)
movie

Unnamed: 0,userId,movieId,rating,timestamp,original_title
0,429,595,5.0,828124615,To Kill a Mockingbird
1,429,588,5.0,828124615,Silent Hill
2,429,590,5.0,828124615,The Hours
3,429,592,5.0,828124615,The Conversation
4,429,421,4.0,828124615,The Life Aquatic with Steve Zissou
...,...,...,...,...,...
42175,331,1676,4.0,1537235373,Will Penny
42176,272,158238,4.0,1537475893,Stolen Seas
42177,210,122906,4.5,1537632293,About Time
42178,514,5247,2.5,1537757040,Mercy


In [None]:
agg = movie.groupby(['userId'])['original_title'].agg(['unique']) # 다중으로 함수를 쓰고 싶을때 agg
agg

Unnamed: 0_level_0,unique
userId,Unnamed: 1_level_1
1,"[Roman Holiday, The Wedding Planner, Der Tunne..."
2,"[The Million Dollar Hotel, Der Tunnel, Mere Br..."
3,"[American Pie, Rocky III, The Great Dictator, ..."
4,"[La Boum 2, La Cité des Enfants Perdus, Licens..."
5,"[The Hours, The Conversation, 48 Hrs., Rain Ma..."
...,...
606,"[The Great Dictator, Cold Mountain, Or, The Mo..."
607,"[Cars, Short Cuts, Young Frankenstein, Jarhead..."
608,"[Titanic, Speed 2: Cruise Control, Star Trek I..."
609,"[48 Hrs., The Conversation, The Hours, Termina..."


#### word2vec 적용

In [None]:
sentence = []

for user_sentence in agg['unique'].values:
    # print(user_sentence)
    sentence.append(list(map(str, user_sentence)))

In [None]:
from gensim.models import Word2Vec
embedding_model = Word2Vec(sentence, size=20, window=5, min_count=1, workers = 4, iter=200, sg=1)

In [None]:
embedding_model.wv.most_similar(positive=['Roman Holiday'], topn=10)

[('The Shawshank Redemption', 0.8313828110694885),
 ('Pieces of April', 0.800894558429718),
 ('El Crimen del Padre Amaro', 0.7898154258728027),
 ('Je ne suis pas là pour être aimé', 0.7889549732208252),
 ('Munich', 0.7806538343429565),
 ('Wattstax', 0.7563534379005432),
 ('La course du lièvre à travers les champs', 0.7541743516921997),
 ('Saw IV', 0.7525335550308228),
 ('Anna and the King', 0.7474038600921631),
 ('Chicken Run', 0.743503987789154)]

In [None]:
from gensim.models import doc2vec

In [None]:
meta = pd.read_csv(os.path.join(path, 'movies_metadata.csv'), low_memory=False)
meta = meta.rename(columns={'id':'movieId'})
meta = meta[meta['original_title'].notnull()].reset_index(drop=True)
meta = meta[meta['overview'].notnull()].reset_index(drop=True)
meta['overview']

0        Led by Woody, Andy's toys live happily in his ...
1        When siblings Judy and Peter discover an encha...
2        A family wedding reignites the ancient feud be...
3        Cheated on, mistreated and stepped on, the wom...
4        Just when George Banks has recovered from his ...
                               ...                        
44507          Rising and falling between a man and woman.
44508    An artist struggles to finish his work while a...
44509    When one of her hits goes wrong, a professiona...
44510    In a small town live two brothers, one a minis...
44511    50 years after decriminalisation of homosexual...
Name: overview, Length: 44512, dtype: object

In [None]:
import nltk
# nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import re
from tqdm import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
overview = []

for words in tqdm(meta['overview']):
    word_tokens = word_tokenize(words)
    sentence = re.sub('[A-Za-z0-9]+', ' ', str(words))
    sentence.strip()
    
    sent_tokens = sent_tokenize(sentence)
    result = ''
    for token in sent_tokens:
        if token not in stop_words:
            result += ''+token
    result = result.strip().lower()
    overview.append(result)
print(result)

100%|██████████| 44512/44512 [00:19<00:00, 2262.64it/s]

,                                ,  ,                        .





In [None]:
meta['pre_overview']=overview
meta['pre_overview']

0        ,  '                 '               .        ...
1        ,         --      '                  --       ...
2        -               .,        é                 , ...
3        ,        ,            ,         "   "         ...
4        '   ,            '    ...      '   ,  ,      ....
                               ...                        
44507                                                    .
44508                                                    .
44509            ,                                     ...
44510    ,                                    .,       ...
44511    ,                                ,  ,         ...
Name: pre_overview, Length: 44512, dtype: object

In [None]:
doc_vector = doc2vec.Doc2Vec(
    dm = 0,
    dbow_words=1,
    window = 1,
    size = 1,
    alpha=0.025,
    seed=1234,
    min_count = 5,
    min_alpha = 0.025,
    workers = 4,
    hs = 1,
    negative = 10
)

In [None]:
from collections import namedtuple
agg = meta[['movieId', 'original_title', 'pre_overview']]
ToggedDocument = namedtuple('ToggedDocument', 'words tags')
tagged_train_docs = [ToggedDocument((c), [d]) for c, d in agg[['original_title', 'pre_overview']].values]

In [None]:
doc_vector.build_vocab(tagged_train_docs)
print(str(doc_vector))

Doc2Vec(dbow+w,d1,n10,hs,w1,mc5,s0.001,t4)


In [None]:
# 벡터 문서 학습
from time import time

start = time()

for epoch in tqdm(range(5)):
    doc_vector.train(tagged_train_docs, total_examples=doc_vector.corpus_count, epochs=doc_vector.iter)
    doc_vector.alpha -= 0.002
    doc_vector.min_alpha = doc_vector.alpha

end = time()
print(f'During Time: {end-start}')

100%|██████████| 5/5 [01:43<00:00, 20.76s/it]

During Time: 103.79265713691711





In [None]:
doc_vector.docvecs.most_similar(['Toy Story'], topn=10)

TypeError: ignored