In [26]:
import getpass
import os

import pymysql
import requests
from lxml import etree

In [27]:
# Redefine the movie_info class
class movie_info:
    """Scrape a single movie page and expose its fields through methods.

    The page is downloaded and parsed once in ``__init__``; every accessor
    then runs an XPath query against the parsed tree kept in ``self.html``.
    """

    def __init__(self, url, timeout=10):
        """Download ``url`` and parse the response text into ``self.html``.

        Args:
            url: Address of the movie page to fetch.
            timeout: Seconds ``requests`` waits on the server before raising.
                Without a timeout a stalled connection would block forever.
        """
        headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36'
        }
        res = requests.get(url, headers=headers, timeout=timeout)
        self.html = etree.HTML(res.text)

    def _first(self, query):
        """Return the first XPath match for ``query``.

        Raises:
            IndexError: If the query matches nothing. The message contains
                the query so the missing page element can be identified.
        """
        matches = self.html.xpath(query)
        if not matches:
            raise IndexError('no match for XPath: ' + query)
        return matches[0]

    def _joined(self, query):
        """Return all XPath matches for ``query`` joined by '/' ('' if none)."""
        # rstrip keeps the result free of trailing slashes even when the last
        # match is empty or itself ends with '/'.
        return '/'.join(self.html.xpath(query)).rstrip('/')

    def title(self):
        """Return the movie title as a string."""
        return self._first('//h1//span[@property]/text()')

    def director(self):
        """Return the first listed director as a string."""
        return self._first('//span/a[@rel = "v:directedBy"]/text()')

    def dates(self):
        """Return the release dates as one '/'-separated string."""
        return self._joined('//span[@property = "v:initialReleaseDate"]/text()')

    def runtime(self):
        """Return the running times as one '/'-separated string."""
        return self._joined('//span[@property = "v:runtime"]/text()')

    def star(self):
        """Return the rating score as a float."""
        return float(self._first('//strong/text()'))

    def comments(self):
        """Return the number of people who rated the movie, as an int."""
        return int(self._first('//div/a[@href = "comments" and @class="rating_people"]/span/text()'))

    def types(self):
        """Return the genres as one '/'-separated string."""
        return self._joined('//span[@property = "v:genre"]/text()')

    def summary(self):
        """Return the plot summary: every text fragment stripped and concatenated."""
        fragments = self.html.xpath('//div//span[@property = "v:summary"]/text()')
        return ''.join(fragment.strip() for fragment in fragments)

    def ratings(self):
        """Return the per-star rating percentages as a list of strings."""
        return self.html.xpath('//div[@class = "ratings-on-weight"]/div/span[@class = "rating_per"]/text()')

    def image(self):
        """Return the poster URL as a string, with 'jpg' swapped for 'webp'."""
        poster_src = self._first('//img[@title = "点击看更多海报"]/@src')
        # NOTE: str.replace swaps every 'jpg' substring, not only the extension.
        return poster_src.replace('jpg', 'webp')

In [28]:
# Movie page to scrape (subject id 1292720).
movie_url = 'https://movie.douban.com/subject/1292720/'
# Constructing movie_info downloads and parses the page immediately.
movie = movie_info(movie_url)

In [34]:
# Connect to the MySQL database. Settings are read from the environment so the
# password is never stored in the notebook; host, user and database fall back
# to the values this notebook was originally written against.
conn = pymysql.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    # Prompt interactively when MYSQL_PASSWORD is unset or empty.
    password=os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: "),
    database=os.environ.get("MYSQL_DATABASE", "doubanmovie"),  # database name
    # MySQL's "utf8" is the 3-byte subset; utf8mb4 covers all of Unicode.
    charset="utf8mb4",
)

cursor = conn.cursor()  # cursor object used to execute SQL statements

In [35]:
# Destructive: removes the MOVIEINFO table (and all of its rows) if it exists.
cursor.execute("DROP TABLE IF EXISTS MOVIEINFO")

0

In [39]:
# Create the MOVIEINFO table: one row per movie, keyed by an auto-incrementing
# `rank`, with r5..r1 columns for the per-star rating shares.
sql = """CREATE TABLE MOVIEINFO(
    `rank` INT auto_increment PRIMARY KEY,
    title VARCHAR(750),
    director CHAR(250),
    release_date VARCHAR(750),
    runtime VARCHAR(750),
    star CHAR(10),
    comments CHAR(15),
    type VARCHAR(750),
    summary VARCHAR(750),
    image CHAR(100),
    r5 CHAR(10),
    r4 CHAR(10),
    r3 CHAR(10),
    r2 CHAR(10),
    r1 CHAR(10)
    )
"""
try:
    cursor.execute(sql)
except pymysql.MySQLError as e:
    # Only database errors (e.g. the table already exists) are reported and
    # tolerated; anything else is a programming mistake and should surface.
    print(e)
    # Roll back if an error occurs.
    # NOTE(review): MySQL implicitly commits DDL, so this is defensive only.
    conn.rollback()