In [3]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import requests
from bs4 import BeautifulSoup

# 문서 유사도

문서 유사도를 측정할 때 많이 사용되는 알고리즘이 코사인 유사도와 유클리드 거리.

문서간 단어들의 차이를 어떻게 계산할 것인가? --> 유클리드, 코사인 유사도 등.

문서간 차이를 보려면 문서에 등장하는 단어를 수치화하는 작업이 선행되어야 한다.

단어 수치화 --> DocumentTermMatrix(DTM), Word2Vec 등

## 코사인 유사도

유사도(Similarity): 두 객체가 얼마나 닮았는지 나타내는 측정값. 닮을수록 유사도는 높다.

In [4]:
def cos_sim(x, y):
    return np.dot(x, y) / (norm(x) * norm(y))

doc1 = np.array([1, 1, 0, 1])
doc2 = np.array([1, 0, 1, 1])
doc3 = np.array([2, 0, 2, 2])

print(cos_sim(doc1, doc2))
print(cos_sim(doc1, doc3))
print(cos_sim(doc2, doc3))

0.6666666666666667
0.6666666666666667
1.0000000000000002


### 영화들 줄거리를 코사인 유사도로 비교

In [5]:
data = pd.read_csv("data/the-movies-dataset/movies_metadata.csv")
data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [6]:
data.columns    # 'overview'라는 컬럼이 줄거리 컬럼

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [7]:
data.head()["overview"]
data.tail()["overview"]

45461          Rising and falling between a man and woman.
45462    An artist struggles to finish his work while a...
45463    When one of her hits goes wrong, a professiona...
45464    In a small town live two brothers, one a minis...
45465    50 years after decriminalisation of homosexual...
Name: overview, dtype: object

#### 불용어 처리

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
tfidf = TfidfVectorizer()    # default는 "stop_words=None"
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [10]:
tfidf = TfidfVectorizer(stop_words="english")
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

#### 결측값 처리

In [11]:
data["overview"].isnull().value_counts()    # overview 컬럼에 결측값 954개

False    44512
True       954
Name: overview, dtype: int64

In [12]:
data["overview"] = data["overview"].fillna("")    # NaN의 경우를 ""로 대체 (tfidf 작업시 NaN 있으면 에러 발생)
data["overview"].isnull().value_counts()

False    45466
Name: overview, dtype: int64

In [14]:
data = data.head(20000)

#### tfidf

In [15]:
tfidf_mat = tfidf.fit_transform(data["overview"])
tfidf_mat.shape     # 45466편의 영화의 overview를 vygusgksms eksdjrk 75827개 

(20000, 47487)

In [17]:
from sklearn.metrics.pairwise import linear_kernel
cos_sim = linear_kernel(tfidf_mat, tfidf_mat)

In [22]:
cos_sim

array([[1.        , 0.01575748, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01575748, 1.        , 0.04907345, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.04907345, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.08375766],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.08375766, 0.        ,
        1.        ]])

In [25]:
pd.Series(data.index, index=data["title"])

title
Toy Story                                                                       0
Jumanji                                                                         1
Grumpier Old Men                                                                2
Waiting to Exhale                                                               3
Father of the Bride Part II                                                     4
Heat                                                                            5
Sabrina                                                                         6
Tom and Huck                                                                    7
Sudden Death                                                                    8
GoldenEye                                                                       9
The American President                                                         10
Dracula: Dead and Loving It                                                    11
Balto     

In [29]:
idx = pd.Series(data.index, index=data["title"]).drop_duplicates()    # drop_duplicates: 중복 제거
idx.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

In [33]:
print(idx["Rambo"])
print(cos_sim[12356])

12356
[0.02696439 0.         0.02085587 ... 0.         0.         0.        ]


#### 코사인 유사도가 비슷한 영화 10편 출력

In [45]:
def get_recommendations(title, cos_sim=cos_sim):
    idx_title = idx[title]    # idx_title은 번호가 나온다 (idx 시리즈 참고)
    sim_score = list(enumerate(cos_sim[idx_title]))
    sim_score = sorted(sim_score, key=lambda x: x[1], reverse=True)    # 코사인 유사도가 높은 순서대로 정렬
    # key: 정렬 기준    # 람다 함수를 통해 튜플의 인자에 접근한다
    mi = sim_score[1:11]    # 영화 자기 자신 (이 경우 Rambo) 을 제외한 유사도 상위 10개
    res = [i[0] for i in mi]    # sim_score가 튜플들이 들어가있으니까 거기서 인덱스번호만 추출해서 리스트에 넣기
    print(data["title"].iloc[res])

In [46]:
get_recommendations("Rambo")

2290                         First Blood
2291                           Rambo III
2289          Rambo: First Blood Part II
2693                   Universal Soldier
3832                          Billy Jack
9632     Strawberries in the Supermarket
1165                      Apocalypse Now
12252              Journey from the Fall
16638                     Little Soldier
10030                         The Search
Name: title, dtype: object


# 로그인해서 웹 스크래핑

1. 로그인: http://www.hanbit.co.kr/member/login_proc.php, 블랙박스(m_id="jiuney"&m_passwd="Qwer1029")
2. 개인정보 페이지로 이동: http://www.hanbit.co.kr/myhanbit/myhanbit.html
3. 

In [48]:
login_info = {"m_id": "jiuney", "m_passwd": "Qwer1029"}
url_login = "http://www.hanbit.co.kr/member/login_proc.php"

In [49]:
# post 방식으로 서버에 연결
import requests
session = requests.session()    # session 객체 생성
session.post(url_login)

<Response [200]>

In [50]:
res = session.post(url_login, data=login_info)
res    # <Response [200]> 은 정상 접속

<Response [200]>

In [51]:
url_mypage = "http://www.hanbit.co.kr/myhanbit/myhanbit.html"
res = session.get(url_mypage)
res

<Response [200]>

In [52]:
res.text

'<!DOCTYPE html>\r\n<html lang="ko">\r\n<head>\r\n<!--[if lte IE 8]>\r\n<script>\r\n  location.replace(\'/support/explorer_upgrade.html\');\r\n</script>\r\n<![endif]-->\r\n<!-- Google Tag Manager -->\r\n<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({\'gtm.start\':\r\nnew Date().getTime(),event:\'gtm.js\'});var f=d.getElementsByTagName(s)[0],\r\nj=d.createElement(s),dl=l!=\'dataLayer\'?\'&l=\'+l:\'\';j.async=true;j.src=\r\n\'https://www.googletagmanager.com/gtm.js?id=\'+i+dl;f.parentNode.insertBefore(j,f);\r\n})(window,document,\'script\',\'dataLayer\',\'GTM-W9D5PM3\');</script>\r\n<!-- End Google Tag Manager -->\r\n<meta charset="utf-8"/>\r\n<title>한빛출판네트워크</title>\r\n<link rel="shortcut icon" href="http://www.hanbit.co.kr/images/common/hanbit.ico"> \r\n<meta http-equiv="X-UA-Compatible" content="IE=Edge" />\r\n<meta property="og:type" content="website"/>\r\n<meta property="og:title" content="한빛출판네트워크"/>\r\n<meta property="og:description" content="출판사, IT전문서, 대학교재, 경제경영, 어린이/유아,

In [56]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(res.text, "html.parser")
soup.select_one("#container > div > div.sm_mymileage > dl.mileage_section1 > dd > span").get_text()

'2,000'

In [61]:
mi = soup.select_one(".mileage_section1 > dd > span").get_text()
mi

'2,000'

In [58]:
soup.select_one("#container > div > div.sm_mymileage > dl.mileage_section2 > dd > span").get_text()

'0'

In [62]:
ec = soup.select_one(".mileage_section2 > dd > span").get_text()
ec

'0'

In [63]:
print("마일리지: " + mi + ", 이코인: " + ec)

마일리지: 2,000, 이코인: 0


# 랜덤포레스트

* 기준이 부족할 때 -> too biased -> underfitting
* 기준이 너무 많을 때 -> high variances -> overfitting
* bias와 variance는 반대개념이 아니다 (https://towardsdatascience.com/bias-and-variance-in-linear-models-e772546e0c30)
* 우리가 추구하는 것은 low bias, low variance