In [1]:
import pandas as pd 
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import math
import numpy as np
import os
from datetime import datetime, timedelta

In [2]:
# Fragments of the IMDb search URL. request() joins them as
#   url_base + <YYYY-MM-DD> + url_middle_base + <start index> + url_last_base
url_base = "https://www.imdb.com/search/title/?title_type=feature&year="
url_middle_base = "&sort=release_date,asc&start="
url_last_base = "&ref_=adv_nxt"

# Column names (and their order) of the scraped movie table.
columns = [
    "title",
    "date",
    "run_time",
    "genre",
    "rating",
    "introduction",
    "director",
    "stars",
    "num_votes",
]

In [4]:
def find_next_scrape_day(file_names, default="2023-01-01"):
    """Return the day after the latest already-scraped day, as "YYYY-MM-DD".

    Parameters
    ----------
    file_names : iterable of str
        File names (or bare stems) found in the data folder. Only the part
        before the first "." is considered, and only names that parse as
        "%Y-%m-%d" count as scraped days; anything else is ignored.
    default : str
        Value returned when no name parses as a date.

    Returns
    -------
    str
        The next day to scrape, formatted "%Y-%m-%d".
    """
    scraped_days = []
    for file_name in file_names:
        stem = file_name.split(".")[0]
        try:
            scraped_days.append(datetime.strptime(stem, "%Y-%m-%d"))
        except ValueError:
            # Not a per-day file (e.g. a stray note or hidden file): skip it
            # instead of letting it reset the whole scrape to the default.
            continue
    if not scraped_days:
        return default
    return (max(scraped_days) + timedelta(days=1)).strftime("%Y-%m-%d")


current_path = os.getcwd()
data_path = os.path.join(current_path, "data")

# Make sure the output folder exists so a first run starts cleanly and the
# per-day CSV files written later have somewhere to go.
os.makedirs(data_path, exist_ok=True)

# Stems of every regular file currently in the data folder.
already_scrape_day = [
    path.split(".")[0]
    for path in os.listdir(data_path)
    if os.path.isfile(os.path.join(data_path, path))
]

# Resume right after the latest day that already has a file.
next_scrape_day = find_next_scrape_day(already_scrape_day)

# Date range to scrape
date_range = pd.date_range(start=next_scrape_day, end="2023-12-31")

In [5]:
# Request html from the website
def request(day, start_movie, timeout=30):
    """Download and parse one IMDb search-result page.

    Parameters
    ----------
    day : object with ``strftime``
        Release day to query; formatted as "%Y-%m-%d" into the URL.
    start_movie : int
        Value placed in the ``start=`` query parameter.
    timeout : float
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the scrape indefinitely.

    Returns
    -------
    BeautifulSoup
        The page parsed with the ``html5lib`` parser.

    Raises
    ------
    requests.RequestException
        On timeout, connection failure, or an HTTP error status. Failing
        loudly matters here: an error page would otherwise parse as "zero
        movies" and an empty CSV would be written for the day.
    """
    url_day = url_base + day.strftime("%Y-%m-%d") + url_middle_base + str(start_movie) + url_last_base
    response = requests.get(url_day, timeout=timeout)
    response.raise_for_status()
    html_data = response.content.decode('utf-8')
    return BeautifulSoup(html_data, 'html5lib')

In [6]:
# how many movies released on that day
def movie_by_day(soup):
    """Read the total number of listed movies from a parsed result page.

    Takes the text of the first ``span`` inside the first ``div.desc`` under
    ``soup.body``, picks its second-to-last whitespace-separated token,
    removes commas and converts it to ``int``.

    Returns 0 when any step of that lookup or conversion fails.
    """
    try:
        desc_blocks = soup.body.find_all("div", {"class": "desc"})
        summary_text = desc_blocks[0].find_all("span")[0].text
        count_token = summary_text.split()[-2]
        return int(count_token.replace(',', ''))
    except:
        return 0

In [7]:
# scrape title 
def scrape_title(content):
    """Return the movie title found in one list item.

    The title is the text of the first ``a`` inside the first
    ``h3.lister-item-header`` of ``content``. Returns "NA" when the lookup
    fails for any reason.
    """
    try:
        headers = content.find_all("h3", {"class": "lister-item-header"})
        links = headers[0].find_all("a")
        return links[0].text
    except:
        return "NA"

In [8]:
# scrape run time 
def scrape_run_time(content):
    """Return the first token of the runtime text of one list item.

    Looks at the first ``span.runtime`` inside the first ``p.text-muted`` of
    ``content`` and returns the first whitespace-separated token of its text.
    Returns "NA" when the lookup fails for any reason.
    """
    try:
        muted_paragraphs = content.find_all("p", {"class": "text-muted"})
        runtime_spans = muted_paragraphs[0].find_all("span", {"class": "runtime"})
        runtime_text = runtime_spans[0].text
        return runtime_text.split()[0]
    except:
        return "NA"

In [9]:
# scrape genre
def scrape_genre(content):
    """Return the genre text of one list item.

    Uses the first ``span.genre`` inside the first ``p.text-muted`` of
    ``content`` and returns its text with surrounding whitespace removed.
    Returns "NA" when the lookup fails for any reason.
    """
    try:
        muted_paragraphs = content.find_all("p", {"class": "text-muted"})
        genre_spans = muted_paragraphs[0].find_all("span", {"class": "genre"})
        return genre_spans[0].text.strip()
    except:
        return "NA"

In [10]:
# scrape rating
def scrape_rating(content):
    """Return the rating text of one list item.

    The value is the text of the first ``strong`` inside the first
    ``div.ratings-bar`` of ``content``, returned as-is (a string).
    Returns "NA" when the lookup fails for any reason.
    """
    try:
        rating_bars = content.find_all("div", {"class": "ratings-bar"})
        strong_tags = rating_bars[0].find_all("strong")
        return strong_tags[0].text
    except:
        return "NA"

In [11]:
# scrape intro
def scrape_introduction(content):
    """Return the plot summary of one list item, or "NA" when there is none.

    The summary is the text of the second ``p.text-muted`` in ``content``,
    stripped of surrounding whitespace. A trailing "See full summary »" link
    text is removed, and the placeholder "Add a Plot" is mapped to "NA".

    Returns "NA" when the paragraph cannot be found.
    """
    # Exact link text that may follow a summary (the gap is a
    # non-breaking space).
    summary_suffix = "See full summary\xa0»"
    try:
        intro = content.find_all("p", {"class": "text-muted"})[1].text.strip()
    except Exception:
        return "NA"

    # Remove the suffix as a whole string. str.rstrip(chars) treats its
    # argument as a *set of characters* and would also eat trailing letters
    # of ordinary plots (e.g. "...a family" -> "...a fami").
    if intro.endswith(summary_suffix):
        intro = intro[:-len(summary_suffix)].rstrip()

    if intro == "Add a Plot":
        return "NA"
    return intro

In [12]:
# scrape director
def scrape_director(content):
    """Return the director name(s) of one list item, or "NA".

    Reads the text of the first ``p`` with an empty class in ``content``,
    removes newlines, and splits it on " | " into labelled segments
    ("<label>:<names>"). The names of the first segment whose label starts
    with "Director" are returned, stripped of surrounding whitespace.

    Returns "NA" when the paragraph is missing or no such segment exists.
    """
    try:
        credits_text = content.find_all("p", {"class": ""})[0].text.replace("\n", "").strip()
    except Exception:
        return "NA"

    for segment in credits_text.split(" | "):
        # Split on the first colon only, so the label decides which segment
        # this is and a colon inside a name does not truncate the value.
        label, separator, names = segment.partition(":")
        if separator and label.strip().startswith("Director"):
            return names.strip()
    return "NA"

In [13]:
# scrape stars
def scrape_stars(content):
    """Return the star name(s) of one list item, or "NA".

    Reads the text of the first ``p`` with an empty class in ``content``,
    removes newlines, and splits it on " | " into labelled segments
    ("<label>:<names>"). The names of the first segment whose label starts
    with "Star" are returned, stripped of surrounding whitespace.

    Returns "NA" when the paragraph is missing or no such segment exists.
    """
    try:
        credits_text = content.find_all("p", {"class": ""})[0].text.replace("\n", "").strip()
    except Exception:
        return "NA"

    for segment in credits_text.split(" | "):
        # Match on the label only: a plain substring test would also hit a
        # director segment such as "Director:Ringo Starr".
        label, separator, names = segment.partition(":")
        if separator and label.strip().startswith("Star"):
            return names.strip()
    return "NA"

In [14]:
# scrape votes 
def scrape_votes(content):
    """Return the vote-count text of one list item.

    The value is the text of the first ``span`` with ``name="nv"`` inside the
    first ``p.sort-num_votes-visible`` of ``content``, returned unchanged
    (a string). Returns "NA" when the lookup fails for any reason.
    """
    try:
        vote_paragraphs = content.find_all("p", {"class": "sort-num_votes-visible"})
        vote_spans = vote_paragraphs[0].find_all("span", {"name": "nv"})
        return vote_spans[0].text
    except:
        return "NA"

In [15]:
# merge all data to row
def merge_to_row(title,day,run_time,genre_name,rating_imdb,intro,director,star,vote):
    """Pack the scraped fields of one movie into a single-row DataFrame.

    The arguments are stored, in order, under the columns
    title, date, run_time, genre, rating, introduction, director, stars,
    num_votes. The returned frame has exactly one row, with index label 0.
    """
    field_order = ["title","date","run_time","genre","rating","introduction","director","stars","num_votes"]
    field_values = (title, day, run_time, genre_name, rating_imdb, intro, director, star, vote)

    # Pair each column name with its value; the values are scalars, so the
    # explicit index gives the frame its single row.
    record = dict(zip(field_order, field_values))
    return pd.DataFrame(record, columns = field_order, index = [0])

In [None]:
# concat all the file and merge into one
def concatenate_csv_files(data_path, output_file="2023.csv"):
    """Merge every ``.csv`` file in ``data_path`` into one CSV file.

    Parameters
    ----------
    data_path : str
        Folder whose ``*.csv`` files (read as UTF-16) are merged.
    output_file : str
        Path of the merged file, written as UTF-16 without the index.
        The default keeps the original behavior of writing "2023.csv"
        relative to the current working directory.

    Returns
    -------
    None

    Notes
    -----
    Files are read in sorted name order so the merged row order is the same
    on every run and platform (``os.listdir`` order is arbitrary). When the
    folder holds no CSV file, a message is printed and nothing is written.
    """
    # Get a sorted list of all CSV files in the input folder
    csv_files = sorted(file for file in os.listdir(data_path) if file.endswith('.csv'))

    # pd.concat raises on an empty list, so stop early with a clear message.
    if not csv_files:
        print(f"No CSV files found in '{data_path}'; nothing to concatenate.")
        return

    # Read each CSV file into a DataFrame
    dfs = [
        pd.read_csv(os.path.join(data_path, csv_file), encoding = "utf-16")
        for csv_file in csv_files
    ]

    # Concatenate all DataFrames in the list
    concatenated_df = pd.concat(dfs, ignore_index=True)

    # Save the concatenated DataFrame to a new CSV file
    concatenated_df.to_csv(output_file, index=False, encoding = "utf-16")
    print(f"All CSV files in '{data_path}' have been concatenated and saved to {output_file}!")

In [16]:
# Scrape every day in date_range and write one CSV file per day into data_path.

MOVIES_PER_PAGE = 50      # page size used for the start offset and page count
PAGE_DELAY_SECONDS = 3    # pause before each additional page of the same day
DAY_DELAY_SECONDS = 5     # pause after finishing a day


def scrape_page(soup, day_label):
    """Return one record (dict keyed by column name) per movie on a parsed page.

    ``soup`` is a parsed result page; every ``div.lister-item-content`` in it
    is treated as one movie. ``day_label`` is stored unchanged in the "date"
    field of every record.
    """
    return [
        {
            "title": scrape_title(content),
            "date": day_label,
            "run_time": scrape_run_time(content),
            "genre": scrape_genre(content),
            "rating": scrape_rating(content),
            "introduction": scrape_introduction(content),
            "director": scrape_director(content),
            "stars": scrape_stars(content),
            "num_votes": scrape_votes(content),
        }
        for content in soup.find_all("div", {"class": "lister-item-content"})
    ]


for day in date_range:
    # Stored as a plain "YYYY-MM-DD" string so the value written to the CSV
    # does not depend on dtype inference.
    day_label = day.strftime("%Y-%m-%d")

    # The first page also reports how many movies the day has in total.
    soup = request(day, 1)
    num_of_movie = movie_by_day(soup)
    max_page = math.ceil(num_of_movie / MOVIES_PER_PAGE)

    records = scrape_page(soup, day_label)

    # Remaining pages, if any (the range is empty when max_page <= 1).
    for current_page in range(2, max_page + 1):
        time.sleep(PAGE_DELAY_SECONDS)
        start_movie = (current_page - 1) * MOVIES_PER_PAGE + 1
        records.extend(scrape_page(request(day, start_movie), day_label))

    # Build the frame once from all records instead of growing it row by row;
    # a day without movies still produces a header-only file.
    df = pd.DataFrame(records, columns=columns)
    df.to_csv(os.path.join(data_path, day_label + ".csv"), encoding = "utf-16", index = None)

    # print current scraping day
    print("Finish for day: " + day_label)
    time.sleep(DAY_DELAY_SECONDS)

Finish for day: 2023-05-24
Finish for day: 2023-05-25
Finish for day: 2023-05-26
Finish for day: 2023-05-27
Finish for day: 2023-05-28
Finish for day: 2023-05-29
Finish for day: 2023-05-30
Finish for day: 2023-05-31
Finish for day: 2023-06-01
Finish for day: 2023-06-02
Finish for day: 2023-06-03
Finish for day: 2023-06-04
Finish for day: 2023-06-05
Finish for day: 2023-06-06
Finish for day: 2023-06-07
Finish for day: 2023-06-08
Finish for day: 2023-06-09
Finish for day: 2023-06-10
Finish for day: 2023-06-11
Finish for day: 2023-06-12
Finish for day: 2023-06-13
Finish for day: 2023-06-14
Finish for day: 2023-06-15
Finish for day: 2023-06-16
Finish for day: 2023-06-17
Finish for day: 2023-06-18
Finish for day: 2023-06-19
Finish for day: 2023-06-20
Finish for day: 2023-06-21
Finish for day: 2023-06-22
Finish for day: 2023-06-23
Finish for day: 2023-06-24
Finish for day: 2023-06-25
Finish for day: 2023-06-26
Finish for day: 2023-06-27
Finish for day: 2023-06-28
Finish for day: 2023-06-29
F

In [30]:
# Merge all per-day CSV files found in data_path into a single "2023.csv"
# (a relative path, so it is written to the current working directory).
concatenate_csv_files(data_path)

All CSV files in 'c:\Users\minhh\Documents\VSCode\studysession\Data Science\imdb_scrpae\data' have been concatenated and saved to 2023.csv!
