In [None]:
import pandas as pd
import datetime
import requests
from bs4 import BeautifulSoup as bs
import time

## 共用參數

In [None]:
# First page of each Yahoo Movies listing; the pagination bar on these pages
# is what get_page() reads to learn how many pages exist.
in_theater = 'https://movies.yahoo.com.tw/movie_intheaters.html?page=1'
coming_soon = 'https://movies.yahoo.com.tw/movie_comingsoon.html?page=1'

# Request headers sent with every HTTP call: a desktop Chrome User-Agent.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.114 Safari/537.36'
    ),
}

## 爬取總頁面數function

In [None]:
def get_page(url):
    """Return the highest page number shown in a listing page's pagination bar.

    Parameters
    ----------
    url : str
        URL of a listing page to fetch.

    Returns
    -------
    int
        The largest numeric label among the links inside the
        ``div.page_numbox`` list. Returns 1 when the page has no pagination
        box or the box contains no numeric links, so callers can still loop
        over ``range(1, pages + 1)``.

    Raises
    ------
    requests.RequestException
        If the request times out, the connection fails, or the server
        answers with an HTTP error status.
    """
    # requests applies no timeout by default; bound the wait so a stalled
    # connection cannot hang the notebook forever.
    res = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    res.raise_for_status()
    soup = bs(res.content, 'html.parser')

    box = soup.find('div', class_='page_numbox')
    link_list = box.find('ul') if box is not None else None
    if link_list is None:
        # No pagination markup found: treat the listing as a single page.
        return 1

    page_numbers = []
    for link in link_list.find_all('a'):
        try:
            page_numbers.append(int(link.text))
        except ValueError:
            # Links whose text is not an integer are not page numbers.
            continue
    return max(page_numbers, default=1)

## 爬蟲主程式

In [None]:
start = time.time()
columns = ['片名', '英文片名', '上映日期', '期待度', '網友評分', '影片簡介']


def scrape_listing(first_page_url, has_rating=True):
    """Collect one record per movie card across every page of a listing.

    Parameters
    ----------
    first_page_url : str
        URL of the listing's first page (ending in ``?page=1``). It is
        passed to ``get_page`` to learn the page count, and its part before
        ``?`` is reused to build each page URL.
    has_rating : bool, default True
        Whether to read the ``data-num`` attribute of ``span.count`` into
        the '網友評分' field. When False the key is omitted from the record.

    Returns
    -------
    list of dict
        One dict per ``div.release_info_text`` element, keyed by the names
        in ``columns``.

    Raises
    ------
    requests.RequestException
        On timeout, connection failure, or HTTP error status.
    """
    base_url = first_page_url.split('?', 1)[0]
    records = []
    for page_no in range(1, get_page(first_page_url) + 1):
        page_url = '{}?page={}'.format(base_url, page_no)
        # Uses the shared `headers` from the configuration cell; the timeout
        # keeps a stalled connection from blocking the whole run.
        res = requests.get(page_url, headers=headers, timeout=10)
        # Surface HTTP errors instead of silently scraping zero movies.
        res.raise_for_status()
        soup = bs(res.content, 'html.parser')
        for card in soup.find_all('div', class_='release_info_text'):
            record = {
                '片名': card.find('a', class_='gabtn').text,
                '英文片名': card.find('div', class_='en').text,
                '上映日期': card.find('div', class_='release_movie_time').text,
                '期待度': card.find('div', class_='leveltext').find('span').text,
                '影片簡介': card.find('div', class_='release_text').text.strip(),
            }
            if has_rating:
                record['網友評分'] = card.find('span', class_='count')['data-num']
            records.append(record)
    return records


# In-theater movies first, then coming-soon ones; the coming-soon pass does
# not read a rating, so those rows get NaN in '網友評分'.
records = scrape_listing(in_theater, has_rating=True)
records += scrape_listing(coming_soon, has_rating=False)

# Build the frame once from all records: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole frame on every row.
movie = pd.DataFrame(records, columns=columns)

end = time.time()
final = end - start
print('總共{}秒爬完'.format(final))



## 處理數據

In [None]:
# Trim surrounding whitespace from both title columns.
for title_col in ('片名', '英文片名'):
    movie[title_col] = movie[title_col].str.strip()

# Remove the CR/LF pairs embedded in the synopsis text.
movie['影片簡介'] = movie['影片簡介'].str.replace('\r\n', '')

# Drop the label prefix from the release-date text, then parse what is left
# as a year-first date.
release_text = movie['上映日期'].str.replace('上映日期 ： ', '')
movie['上映日期'] = pd.to_datetime(release_text, yearfirst=True)

# Ratings become floats; rows with no rating end up as 0.
movie['網友評分'] = movie['網友評分'].astype('float').fillna(0)

# Optional export (left disabled): sort by rating and expectation, descending,
# and write the table to an Excel file.

In [None]:
import plotly.express as px

In [None]:
# Keep only rated movies released inside the chosen date window.
has_rating = movie['網友評分'] > 0
in_window = movie['上映日期'].between('2021-03-01', '2021-04-06')
plot = movie[has_rating & in_window]

# Bar chart of rating per title, tallest bars first, with slanted tick labels.
fig = px.bar(plot, x='片名', y='網友評分')
fig.update_xaxes(categoryorder='total descending', tickangle=45)
# The y axis starts at 2 and ends at the highest rating in the full table.
fig.update_yaxes(showgrid=False, dtick=0.5, range=[2, movie['網友評分'].max()])
fig

In [None]:
# Debug check: prints the name of the module this code is executing in
# (presumably '__main__' when run directly in a notebook kernel — confirm).
print(__name__)