In [2]:
import requests
from bs4 import BeautifulSoup
from urllib import request
import numpy as np
import pandas as pd


def get_url():
    url_list = []
    for i in range(2004, 2021):
        url_list.append('https://movie.daum.net/boxoffice/yearly?year=' + str(i))
    return url_list


class Movie_Url:
    def __init__(self, movie_url):
        self.movie_url = movie_url
        self.lst = self.navigate_url()
        self.name_list = self.crawl_movie_name()
        self.rating_list = self.crawl_movie_rating()
        self.genre_list, self.director_list, self.actor_list = self.crawl_movie_detail()

    def navigate_url(self):
        response = requests.get(self.movie_url)
        html = BeautifulSoup(response.content, 'html.parser')
        info = html.find('div', {'class': 'main_detail'})
        return info.findAll('li')

    def crawl_movie_name(self):
        name_list = []
        for lt in self.lst:
            name = lt.find('strong').getText()
            name_list.append(name)
        return name_list

    def crawl_movie_rating(self):
        rating_list=[]
        for lt in self.lst:
            rating = lt.find('em').getText()
            rating_list.append(rating)
        return rating_list

    def crawl_movie_image(self):
        for i, lt in enumerate(self.lst):
            img = str(lt.find('img'))
            img_url = img[img.find('http'):img.find('onload')-2]

            request.urlretrieve(img_url, "./image/"+str(self.movie_url[-4:])+str(i+1)+".png")
        print('Image Downloaded')

    def crawl_movie_detail(self):
        base_url = 'https://movie.daum.net'
        genre_list = []
        director_list = []
        actor_list = []

        for lt in self.lst:
            stlt = str(lt)
            stlt_2 = stlt[stlt.find('href=')+6:]
            stlt_3 = stlt_2[:stlt_2.find('>')-1]

            new_url = base_url+str(stlt_3)
            response = requests.get(new_url)
            html = BeautifulSoup(response.content, 'html.parser')
            new_info = html.find('div', {'class': 'movie_summary'})
            genre_list.append(new_info.findAll('dd')[0].getText())
            director_list.append(html.find('dd', {'class': 'type_ellipsis'}).find('a').getText())
            try:
                actor_list.append([x.getText() for x in html.findAll('dd',{'class': 'type_ellipsis'})[1]
                              .findAll('a')])
            except IndexError:
                actor_list.append('X')

        genre_list = [genre.split('/') for genre in genre_list]

        return genre_list, director_list, actor_list

    def make_movie_df(self):
        df_movie = pd.DataFrame()
        df_movie['movie_name'] = self.name_list
        df_movie['movie_rating'] = self.rating_list
        df_movie['movie_genre'] = self.genre_list
        df_movie['movie_director'] = self.director_list
        df_movie['movie_actor'] = self.actor_list

        return df_movie

def object_column_preprocess(df):
    df = df.reset_index(drop=True)
    df['movie_genre_split'] = df['movie_genre'].apply(lambda x: (' ').join(x))
    df['movie_director_split'] = df['movie_director'].apply(lambda x: x.replace(' ', ''))
    df['movie_actor_split'] = df['movie_actor'].apply(
        lambda x: (' ').join([ac.replace(' ', '') for ac in x]))

    return df


url_list = get_url()
for i, year_url in enumerate(url_list):
    movie = Movie_Url(year_url)
    # movie.crawl_movie_image()
    if i == 0:
        movie_df = movie.make_movie_df()
    else:
        movie_df = pd.concat([movie_df, movie.make_movie_df()], axis=0)

    print(i)

movie_df = object_column_preprocess(movie_df)
# movie_2004 = Movie_Url(url_list[0])
# df_2004 = movie_2004.make_movie_df()
# print(df_2004)


0
1


KeyboardInterrupt: 

In [3]:
import pandas as pd
import numpy as np

In [4]:
movie_df=pd.read_csv('movie_df.csv')

In [10]:
movie_df

Unnamed: 0.1,Unnamed: 0,movie_name,movie_rating,movie_genre,movie_director,movie_actor,movie_genre_split,movie_director_split,movie_actor_split
0,0,태극기 휘날리며,9.1,['전쟁'],강제규,"['장동건', '원빈']",전쟁,강제규,장동건 원빈
1,1,트로이,8.4,"['액션', '로맨스', '멜로']",볼프강 페터슨,"['브래드 피트', '올랜도 블룸']",액션 로맨스 멜로,볼프강페터슨,브래드피트 올랜도블룸
2,2,내 머리 속의 지우개,8.7,"['로맨스', '멜로', '드라마']",이재한,"['정우성', '손예진']",로맨스 멜로 드라마,이재한,정우성 손예진
3,3,귀신이 산다,6.4,"['코미디', '판타지', '공포']",김상진,"['차승원', '장서희']",코미디 판타지 공포,김상진,차승원 장서희
4,4,투모로우,8.3,"['액션', '스릴러', 'SF', '어드벤처', '드라마']",롤랜드 에머리히,"['데니스 퀘이드', '제이크 질렌할']",액션 스릴러 SF 어드벤처 드라마,롤랜드에머리히,데니스퀘이드 제이크질렌할
...,...,...,...,...,...,...,...,...,...
845,845,이 멋진 세계에 축복을! 붉은 전설,7.7,"['애니메이션', '액션', '코미디', '판타지']",칸자키 타카오미,"['후쿠시마 쥰', '아마미야 소라', '타카하시 리에', '카야노 아이']",애니메이션 액션 코미디 판타지,칸자키타카오미,후쿠시마쥰 아마미야소라 타카하시리에 카야노아이
846,846,사마에게,9.7,['다큐멘터리'],와드 알-카팁,X,다큐멘터리,와드알-카팁,X
847,847,핑크퐁 시네마 콘서트 : 우주대탐험,7.1,['애니메이션'],변희선,"['조경이', '김서영']",애니메이션,변희선,조경이 김서영
848,848,슈퍼 베어,6.6,"['액션', '어드벤처', '애니메이션']",왕치,"['김기두', '서반석', '이다은', '박성영']",액션 어드벤처 애니메이션,왕치,김기두 서반석 이다은 박성영


In [7]:
mv_df=movie_df.drop(['movie_genre','movie_director','movie_actor'],axis=1)
mv_df

Unnamed: 0.1,Unnamed: 0,movie_name,movie_rating,movie_genre_split,movie_director_split,movie_actor_split
0,0,태극기 휘날리며,9.1,전쟁,강제규,장동건 원빈
1,1,트로이,8.4,액션 로맨스 멜로,볼프강페터슨,브래드피트 올랜도블룸
2,2,내 머리 속의 지우개,8.7,로맨스 멜로 드라마,이재한,정우성 손예진
3,3,귀신이 산다,6.4,코미디 판타지 공포,김상진,차승원 장서희
4,4,투모로우,8.3,액션 스릴러 SF 어드벤처 드라마,롤랜드에머리히,데니스퀘이드 제이크질렌할
...,...,...,...,...,...,...
845,845,이 멋진 세계에 축복을! 붉은 전설,7.7,애니메이션 액션 코미디 판타지,칸자키타카오미,후쿠시마쥰 아마미야소라 타카하시리에 카야노아이
846,846,사마에게,9.7,다큐멘터리,와드알-카팁,X
847,847,핑크퐁 시네마 콘서트 : 우주대탐험,7.1,애니메이션,변희선,조경이 김서영
848,848,슈퍼 베어,6.6,액션 어드벤처 애니메이션,왕치,김기두 서반석 이다은 박성영


In [15]:
txt_info=mv_df['movie_genre_split']+str(' ')+mv_df['movie_director_split']+str(' ')+mv_df['movie_actor_split']

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
txt_info

0                                          전쟁 강제규 장동건 원빈
1                           액션 로맨스 멜로 볼프강페터슨 브래드피트 올랜도블룸
2                                 로맨스 멜로 드라마 이재한 정우성 손예진
3                                 코미디 판타지 공포 김상진 차승원 장서희
4               액션 스릴러 SF 어드벤처 드라마 롤랜드에머리히 데니스퀘이드 제이크질렌할
                             ...                        
845    애니메이션 액션 코미디 판타지 칸자키타카오미 후쿠시마쥰 아마미야소라 타카하시리에 카...
846                                       다큐멘터리 와드알-카팁 X
847                                    애니메이션 변희선 조경이 김서영
848                     액션 어드벤처 애니메이션 왕치 김기두 서반석 이다은 박성영
849                         드라마 페르난도메이렐레스 조나단프라이스 안소니홉킨스
Length: 850, dtype: object

In [53]:
txt_info.to_csv('txt_info.txt', index=False, header=None, sep="\t")

In [56]:
t=pd.read_csv('txt_info.txt',header=None)


Unnamed: 0,0
0,전쟁 강제규 장동건 원빈
1,액션 로맨스 멜로 볼프강페터슨 브래드피트 올랜도블룸
2,로맨스 멜로 드라마 이재한 정우성 손예진
3,코미디 판타지 공포 김상진 차승원 장서희
4,액션 스릴러 SF 어드벤처 드라마 롤랜드에머리히 데니스퀘이드 제이크질렌할
...,...
845,애니메이션 액션 코미디 판타지 칸자키타카오미 후쿠시마쥰 아마미야소라 타카하시리에 카...
846,다큐멘터리 와드알-카팁 X
847,애니메이션 변희선 조경이 김서영
848,액션 어드벤처 애니메이션 왕치 김기두 서반석 이다은 박성영


In [34]:
cv=CountVectorizer(ngram_range=(1,1))
cv_mv=cv.fit_transform(txt_info)

In [58]:
cv_mv=cv_mv.toarray()

In [63]:
cv_data=pd.DataFrame(cv_mv)

In [73]:
cv_data.to_csv('cv_data.txt',index=False,header=None)

In [70]:
cv_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1656,1657,1658,1659,1660,1661,1662,1663,1664,1665
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
845,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
847,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
848,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
from sklearn.metrics.pairwise import cosine_similarity
cs_mv=cosine_similarity(cv_mv,cv_mv)

In [66]:
cs_mv_df=pd.DataFrame(cs_mv)

In [75]:
cs_mv_df.to_csv('cs_mv_df.txt',index=False,header=None)

In [74]:
a=pd.read_csv('cv_data.txt')
a

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.1652,0.1653,0.1654,0.1655,0.1656,0.1657,0.1658,0.1659,0.1660,0.1661
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
845,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
847,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
index = 6
print(mv_df['movie_name'][index])
mv_df.iloc[np.argsort(cs_mv[6])[::-1][:20]]


해리포터와 아즈카반의 죄수


Unnamed: 0.1,Unnamed: 0,movie_name,movie_rating,movie_genre_split,movie_director_split,movie_actor_split
6,6,해리포터와 아즈카반의 죄수,8.4,판타지 어드벤처,알폰소쿠아론,다니엘래드클리프 엠마왓슨 루퍼트그린트
835,835,해리포터와 아즈카반의 죄수,8.4,판타지 어드벤처,알폰소쿠아론,다니엘래드클리프 엠마왓슨 루퍼트그린트
53,53,해리포터와 불의 잔,9.2,판타지 어드벤처,마이크뉴웰,다니엘래드클리프 엠마왓슨 루퍼트그린트
155,155,해리 포터와 불사조 기사단,8.2,판타지 어드벤처,데이빗예이츠,다니엘래드클리프 엠마왓슨 루퍼트그린트
358,358,해리포터와 죽음의 성물2,9.2,판타지 어드벤처,데이빗예이츠,다니엘래드클리프 루퍼트그린트 엠마왓슨
261,261,해리 포터와 혼혈 왕자,6.9,판타지 어드벤처,데이빗예이츠,다니엘래드클리프 루퍼트그린트 엠마왓슨
315,315,해리 포터와 죽음의 성물1,7.9,판타지,데이빗예이츠,다니엘래드클리프 엠마왓슨 루퍼트그린트
129,129,킹콩,8.7,액션 판타지 어드벤처,피터잭슨,나오미왓츠
419,419,호빗: 뜻밖의 여정,8.1,어드벤처 판타지,피터잭슨,이안맥켈런 마틴프리먼
63,63,킹콩,8.7,액션 판타지 어드벤처,피터잭슨,나오미왓츠
