In [16]:
import re
import requests
import bs4
import time
import pandas as pd

from bs4 import BeautifulSoup
from collections import namedtuple
from pickle import dump, load

In [6]:
home_url = 'https://movie.douban.com'

def get_URL(url):
    headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0'}
    response = requests.get(url, headers=headers)
    return response.text

def get_urlSoup(url):
    headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    return soup

def parse_movieURL(url, home='https://movie.douban.com'):
    text = get_URL(url)
    movie_pattern = re.compile('/subject/[0-9]+?/')
    movie_index = movie_pattern.findall(text)
    movies = set([home + x for x in movie_index])
    return movies

def collect_movieURL(home_url, movies_prime = set(), max_iter = 5):
    if len(movies_prime)==0:
        movies = parse_movieURL(url=home_url, home=home_url)
    else:
        movies = movies_prime
    new_movies_temp = movies
    for step in range(max_iter):
        new_movies = set()
        nobj = len(new_movies_temp)
        totalNum = len(movies)
        for sub_step, url in enumerate(new_movies_temp):
            movies_temp = parse_movieURL(url) - movies
            new_movies = (new_movies | movies_temp)
            movies = movies | new_movies
            print('Global iter %d, local step %d/%d, fetch %d new movies!, total: %d ' % 
              (step+1, sub_step+1, nobj, len(movies_temp), len(movies)))
        new_movies_temp = new_movies
        if len(movies)-totalNum < 10:
            break
    return movies

# movies = parse_movieURL(home_url)
movies_new = collect_movieURL(home_url, movies, max_iter=1)
# a = parse_movieURL(home_url)
# b = parse_movieURL('https://movie.douban.com/subject/22266126/')
# b-a

Global iter 1, local step 1/6768, fetch 0 new movies!, total: 6768 
Global iter 1, local step 2/6768, fetch 6 new movies!, total: 6774 
Global iter 1, local step 3/6768, fetch 7 new movies!, total: 6781 
Global iter 1, local step 4/6768, fetch 0 new movies!, total: 6781 
Global iter 1, local step 5/6768, fetch 3 new movies!, total: 6784 
Global iter 1, local step 6/6768, fetch 0 new movies!, total: 6784 
Global iter 1, local step 7/6768, fetch 4 new movies!, total: 6788 
Global iter 1, local step 8/6768, fetch 3 new movies!, total: 6791 
Global iter 1, local step 9/6768, fetch 0 new movies!, total: 6791 
Global iter 1, local step 10/6768, fetch 0 new movies!, total: 6791 
Global iter 1, local step 11/6768, fetch 4 new movies!, total: 6795 
Global iter 1, local step 12/6768, fetch 5 new movies!, total: 6800 
Global iter 1, local step 13/6768, fetch 2 new movies!, total: 6802 
Global iter 1, local step 14/6768, fetch 0 new movies!, total: 6802 
Global iter 1, local step 15/6768, fetch 4 

In [11]:
class movie_detail:
    def __init__(self, url):
        self.soup = self.get_movie_detail(url)
        assert isinstance(self.soup, bs4.BeautifulSoup)
        people_info = self.soup.find_all('div', attrs={'id':'info'})[0]

        # 标题
        try:
            self.title = self.soup.title.string.strip()
        except:
            self.title = 'none'

        # 导演
        try:
            director_temp = people_info.find_all('a', attrs={'rel':'v:directedBy'})[0]
            director_name = director_temp.string
            director_url = self.wrap_url(director_temp['href'])
            self.director = {director_name:director_url}
        except:
            self.director = {'none':'none'}

        # 编剧
        try:
            pattern_composer = re.compile('''<a href="(/celebrity/[0-9]+?/)">(.+?)</a>''')
            composer_temp = pattern_composer.findall(str(people_info))
            self.composer = {}
            for (x, y) in composer_temp:
                self.composer[y] = self.wrap_url(x)
        except:
            self.composer = {'none':'none'}

        # 主演
        try:
            actors_temp = people_info.find_all('a', attrs={'rel':'v:starring'})
            self.actors = {}
            for actor in actors_temp:
                name = actor.string
                url = actor['href']
                self.actors[name] = self.wrap_url(url)
        except:
            self.actors = {'none':'none'}

        # 电影类型
        try:
            movie_class_temp = people_info.find_all('span', attrs={'property':'v:genre'})
            self.movie_class = ','.join([x.string for x in movie_class_temp])
        except:
            self.movie_class = 'none'

        # 制片国家/地区
        try:
            pattern_place = re.compile('<span class="pl">制片国家/地区:</span>(.+?)<br/>')
            self.place = pattern_place.findall(str(people_info))[0].strip()
        except:
            self.place = 'none'

        # 语言
        try:
            pattern_language = re.compile('<span class="pl">语言:</span>(.+?)<br/>')
            self.language = pattern_language.findall(str(people_info))[0].strip()
        except:
            self.language = 'none'

        # 上映时间
        try:
            self.time = people_info.find_all('span', attrs={'property':'v:initialReleaseDate'})[0].string.strip()
        except:
            self.time = 'none'

        # 片长
        try:
            self.length = people_info.find_all('span', attrs={'property':'v:runtime'})[0].string.strip()
        except:
            self.length = 'none'

        # 别名
        try:
            pattern_alias = re.compile('<span class="pl">又名:</span>(.+?)<br/>')
            self.alias = pattern_alias.findall(str(people_info))[0].strip()
        except:
            self.alias = 'none'

        # imdb链接
        try:
            self.IMDB_url = people_info.find_all('a', attrs={'rel':'nofollow'})[0]['href']
        except:
            self.IMDB_url = 'none'
    
    def get_movie_detail(self, url):
        headers = {'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0'}
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        return soup
    
    def wrap_url(self, url):
        return 'https://movie.douban.com' + url

    @property
    def get_title(self):
        return self.title
    
    @property
    def get_director(self):
        return ','.join(self.director.keys())
    
    @property
    def get_composer(self):
        return ','.join(self.composer.keys())
    
    @property
    def get_actors(self):
        return ','.join(self.actors.keys())
    
    @property
    def get_movie_class(self):
        return self.movie_class
    
    @property
    def get_place(self):
        return self.place
    
    @property
    def get_language(self):
        return self.language
    
    @property
    def get_time(self):
        return self.time
    
    @property
    def get_length(self):
        return self.length
    
    @property
    def get_alias(self):
        return self.alias
    
    @property
    def get_IMDB(self):
        return self.IMDB_url
    
    @property
    def show(self):
        print('title : %s' % self.title)
        print('alias : %s' % self.alias)
        print('director : %s' % list(self.director.keys())[0])
        print('language : %s' % self.language)

In [10]:
dump(movies_new, open('~/douban_movieURL.pickle', 'wb'))

In [2]:
movieURL = load(open('/home/da/douban_movieURL.pickle', 'rb'))

In [8]:
movie_urls = list(movieURL)[0:15]

In [9]:
movie_urls

['https://movie.douban.com/subject/25861907/',
 'https://movie.douban.com/subject/26856347/',
 'https://movie.douban.com/subject/10604893/',
 'https://movie.douban.com/subject/4728304/',
 'https://movie.douban.com/subject/1300117/',
 'https://movie.douban.com/subject/26898906/',
 'https://movie.douban.com/subject/2132732/',
 'https://movie.douban.com/subject/1298988/',
 'https://movie.douban.com/subject/22993097/',
 'https://movie.douban.com/subject/26820458/',
 'https://movie.douban.com/subject/6815121/',
 'https://movie.douban.com/subject/10437779/',
 'https://movie.douban.com/subject/2344687/',
 'https://movie.douban.com/subject/25827381/',
 'https://movie.douban.com/subject/2154245/']

In [12]:
a = movie_urls[0]
a

'https://movie.douban.com/subject/25861907/'

In [31]:
list_index = []
list_title = []
list_director = []
list_composer = []
list_actors = []
list_movie_class = []
list_place = []
list_language = []
list_time = []
list_length = []
list_alias = []
list_IMDBurl = []
index_pattern = re.compile('https://movie.douban.com/subject/([0-9]+?)/')


for step, url in enumerate(movieURL):
    try:
        movie_data = movie_detail(url)
        list_title.append(movie_data.get_title)
        list_director.append(movie_data.get_director)
        list_composer.append(movie_data.get_composer)
        list_actors.append(movie_data.get_actors)
        list_movie_class.append(movie_data.get_movie_class)
        list_place.append(movie_data.get_place)
        list_language.append(movie_data.get_language)
        list_time.append(movie_data.get_time)
        list_length.append(movie_data.get_length)
        list_alias.append(movie_data.get_alias)
        list_IMDBurl.append(movie_data.get_IMDB)
        movie_index = index_pattern.findall(url)
        list_index.append(movie_index[0])
        print('Current step : %d, fetch movie: %s' % (step, movie_data.title))
    except:
        print('Current step : %d, %s unable to find!' % (step, url))
        pass
    time.sleep(0.3)
#     if (step+1) % 10==0:
#     print('Current step : %d, fetch movie: %s' % (step, movie_data.title))
    

Current step : 0, https://movie.douban.com/subject/25861907/ unable to find!
Current step : 1, fetch movie: 禁地之恐怖医院 (豆瓣)
Current step : 2, fetch movie: 四大名捕大结局 (豆瓣)
Current step : 3, fetch movie: 艾菲·格蕾 (豆瓣)
Current step : 4, fetch movie: 千钧一发 (豆瓣)
Current step : 5, fetch movie: 乡村爱情进行曲 (豆瓣)
Current step : 6, fetch movie: 虎猛威龙 (豆瓣)
Current step : 7, fetch movie: 冲破黑暗谷 (豆瓣)
Current step : 8, fetch movie: 在远处永远守候着你 (豆瓣)
Current step : 9, fetch movie: 我是哪吒 (豆瓣)
Current step : 10, fetch movie: 梦回唐朝 (豆瓣)
Current step : 11, fetch movie: 新世界 (豆瓣)
Current step : 12, fetch movie: 嬉皮未成年 (豆瓣)
Current step : 13, fetch movie: 为你抛却 (豆瓣)
Current step : 14, fetch movie: 新精武门 (豆瓣)
Current step : 15, fetch movie: 剧场版 银河机攻队：觉醒的基因 (豆瓣)
Current step : 16, fetch movie: 板牙东京竞速 (豆瓣)
Current step : 17, https://movie.douban.com/subject/24753388/ unable to find!
Current step : 18, fetch movie: 茶泡饭之味 (豆瓣)
Current step : 19, fetch movie: 我们的田野 (豆瓣)
Current step : 20, fetch movie: 差馆 (豆瓣)
Current step : 21, fetch mo

In [32]:
len(list_index)

14262

In [33]:
result_table = pd.DataFrame({
        'id':list_index,
        'title':list_title,
        'director':list_director,
        'composer':list_composer, 
        'actors':list_actors,
        'movie_class':list_movie_class,
        'place':list_place,
        'language':list_language,
        'time':list_time,
        'length':list_length,
        'alias':list_alias,
        'IMDB':list_IMDBurl},columns = ['id', 'title', 'alias', 'director', 'composer',
                                       'actors', 'movie_class', 'place', 'language', 'time', 
                                       'length', 'IMDB'])

In [34]:
result_table

Unnamed: 0,id,title,alias,director,composer,actors,movie_class,place,language,time,length,IMDB
0,26856347,禁地之恐怖医院 (豆瓣),none,孙小西,,"张政勇,裴小瑞,阳蕾","惊悚,恐怖",中国大陆,汉语普通话,2016-08-17,61分钟,none
1,10604893,四大名捕大结局 (豆瓣),四大名捕3 / The Four Final Battle,陈嘉上,"谭广源,陈嘉上,王思敏,陈淑贤,温瑞安,吕冠南","向恬冉,刘俊纬,曹炳琨,郑中基,刘亦菲,邹兆龙,邓萃雯,苏有朋,包贝尔,李子雄,吴秀波,黄秋...","动作,爱情,悬疑,武侠,古装",中国大陆 / 香港,汉语普通话 / 粤语,2014-08-22(中国大陆),107分钟,http://www.imdb.com/title/tt3919278
2,4728304,艾菲·格蕾 (豆瓣),none,理查德·莱克斯顿,艾玛·汤普森,"大卫·苏切特,汤姆·斯图里奇,克劳迪娅·卡汀娜,罗彼·考特拉尼,德里克·雅各比,格雷·怀斯,...","剧情,传记",英国,英语,2014-10-10(英国),108分钟,http://www.imdb.com/title/tt1605798
3,1300117,千钧一发 (豆瓣),变种异煞 / 自然人 / 戛塔卡 / 伽蒂卡 / 太空梦 / 基因代码,安德鲁·尼科尔,安德鲁·尼科尔,"劳恩·迪恩,乌玛·瑟曼,山德·贝克利,艾伦·阿金,戈尔·维达尔,伊桑·霍克,裘德·洛","剧情,科幻",美国,英语 / 世界语,1997-10-24(美国),106分钟,http://www.imdb.com/title/tt0119177
4,26898906,乡村爱情进行曲 (豆瓣),乡村爱情9,孟令宇,张继,"刘小光,毕畅,小沈阳,王小虎,王小利,于月仙,唐鉴军,赵本山,贺树峰,周弋楠,金玫玫,蔡维利","剧情,喜剧",中国大陆,汉语普通话,2017-02-01(中国大陆),none,none
5,2132732,虎猛威龙 (豆瓣),Hu meng wei long / Red wolf,袁和平 Woo-ping Yuen,,"钟丽缇 Christy Chung,袁祥仁 Cheung-Yan Yuen,何家劲",动作,香港,英语 / 粤语,none,92 分钟,http://www.imdb.com/title/tt0113350
6,1298988,冲破黑暗谷 (豆瓣),巨星汤美 / Tommy by 'The Who',肯·罗素,肯·罗素,"奥列佛·里德,蒂娜·特纳,罗杰·达尔特雷,安-玛格丽特,埃尔顿·约翰,杰克·尼科尔森,埃里克...","剧情,歌舞,奇幻",英国,英语,1975-03-19,111 分钟,http://www.imdb.com/title/tt0073812
7,22993097,在远处永远守候着你 (豆瓣),none,长泽雅彦,狗饲恭子,"冈田奈奈,六角精児,中野裕太,伽奈,徳井義実,倉科カナ,清水くるみ",剧情,日本,日语,2013-06-08(日本),120分钟,http://xn--n8jva7am3awjz8bztr157g.com
8,26820458,我是哪吒 (豆瓣),I am NeZha,舒展,"王禹博,邹超疑,李珏,舒展","陶典,王柏超,吴轶飞,胡艺,姚彦泽,梁达伟,邹亮,彭博,孟祥龙,韩娇娇,张焕昭,刘垚","剧情,动画",中国大陆,汉语普通话,2016-10-01(中国大陆),83分钟,http://www.animex.com.cn/clapro_complex.aspx?F...
9,6815121,梦回唐朝 (豆瓣),none,蔡晶盛,,"王力可,倪虹洁,郑恺,谭耀文,隋俊波,佟丽娅,张世,郭德纲","剧情,古装",中国大陆,汉语普通话,2013-01-19,none,none


In [36]:
print('save result to excel...')
writer = pd.ExcelWriter('douban_movies.xlsx')
result_table.to_excel(writer, sheet_name='douban', encoding = 'utf-8', index = False)
writer.save()
print('task done.')

save result to excel...
task done.


In [37]:
dump(result_table, open('douban_movies.pickle', 'wb'))