Data source: Wikipedia's yearly lists of American films, indexed at https://en.wikipedia.org/wiki/Category:Lists_of_American_films_by_year

In [1]:
# Common prefix of the per-year list pages; appending a year yields that
# year's "List of American films of <year>" URL.
american_movies_by_year_link_base = 'https://en.wikipedia.org/wiki/List_of_American_films_of_'

# One list-page URL per year from 2005 through 2021 (the range's upper bound
# is exclusive).
american_movies_by_year_links = [
    f'{american_movies_by_year_link_base}{year}'
    for year in range(2005, 2022)
]

In [19]:
import requests
import urllib.request
import time
import re
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from urllib.request import urlopen
from tqdm import tqdm

In [42]:
def get_movie_titles_and_urls_for_year(year):
    """Collect film titles and article URLs from one year's Wikipedia list page.

    The page URL is ``american_movies_by_year_link_base + str(year)``. Only
    tables with class ``wikitable`` whose header cells include both 'Title'
    and 'Production company' are read. In each such table every row after
    the first is inspected, and the film is taken from the ``<a>`` inside
    the row's first ``<i>`` element.

    Args:
        year: Year whose list page should be scraped; converted with ``str``.

    Returns:
        A tuple ``(titles, urls, error_count)``. ``titles`` and ``urls`` are
        parallel lists (same length, same order) of link texts and absolute
        ``https://en.wikipedia.org`` URLs. ``error_count`` is the number of
        rows skipped because no linked title could be extracted.

    Raises:
        Any exception raised by ``urlopen`` while fetching the page is
        propagated unchanged.
    """
    list_page_url = american_movies_by_year_link_base + str(year)
    # The parser consumes the response while the soup is built, so the
    # connection can be released immediately afterwards.
    with urlopen(list_page_url) as response:
        soup = BeautifulSoup(response, 'html5lib')

    all_movie_titles = []
    all_movie_urls = []
    error_count = 0

    for table in soup.find_all('table', class_='wikitable'):
        all_headers = [th.text.replace("\n", "") for th in table.find_all("th")]
        if 'Title' not in all_headers or 'Production company' not in all_headers:
            continue

        # The first row is skipped (presumably the header row).
        for row in table.tbody.find_all('tr')[1:]:
            try:
                link = row.find_all('i')[0].a
                movie_url = 'https://en.wikipedia.org' + link['href']
                movie_title = link.text
            except (IndexError, TypeError, KeyError):
                # IndexError: the row has no <i> element.
                # TypeError:  the <i> has no <a>, so `link` is None.
                # KeyError:   the <a> has no href attribute.
                error_count += 1
                continue

            # Append both values only after both were extracted so the two
            # lists can never get out of step.
            all_movie_urls.append(movie_url)
            all_movie_titles.append(movie_title)

    return all_movie_titles, all_movie_urls, error_count

def get_plot_data(movie_page_url):
    """Fetch a film's page and return the text of its "Plot" section.

    The section is located via the element with ``id='Plot'`` and its
    enclosing ``<h2>``. The consecutive ``<p>`` siblings that directly follow
    that heading are collected; collection stops at the first sibling that is
    not a ``<p>`` or when there are no more siblings.

    Args:
        movie_page_url: Absolute URL of the page to fetch.

    Returns:
        The paragraph texts concatenated, each one preceded by a newline
        character, or ``None`` when the page has no such heading or no
        paragraph directly follows it.

    Raises:
        Any exception raised by ``urlopen`` while fetching the page is
        propagated unchanged.
    """
    # Release the connection as soon as the document has been parsed.
    with urlopen(movie_page_url) as response:
        soup = BeautifulSoup(response, 'html5lib')

    plot_anchor = soup.find(id='Plot')
    heading = plot_anchor.find_parent('h2') if plot_anchor is not None else None
    if heading is None:
        return None

    paragraphs = []
    sibling_element = heading.find_next_sibling()
    # find_next_sibling() returns None after the last sibling; checking for
    # it keeps the paragraphs gathered so far instead of discarding them.
    while sibling_element is not None and sibling_element.name == 'p':
        paragraphs.append(sibling_element.text)
        sibling_element = sibling_element.find_next_sibling()

    if not paragraphs:
        return None
    return ''.join("\n" + text for text in paragraphs)


In [43]:
# Accumulators filled across all scraped years.
all_movie_titles, all_movie_urls = [], []
total_error_count = 0

print('Collecting movie titles and urls...')

# Scrape each year's list page, 2005 through 2021 inclusive.
for year in tqdm(range(2005, 2022)):
    movie_titles, movie_urls, error_count = get_movie_titles_and_urls_for_year(year)
    all_movie_titles += movie_titles
    all_movie_urls += movie_urls
    total_error_count = total_error_count + error_count


Collecting movie titles and urls...


100%|██████████| 17/17 [00:18<00:00,  1.06s/it]


In [44]:

# Each kept record is [title, url, plot]; films for which no plot text could
# be extracted are left out.
final_data = []

print('Collecting movie plots...')

# zip() has no length, so tqdm cannot infer how many items to expect; passing
# the total explicitly gives a percentage and ETA for this long-running loop.
for movie_title, movie_url in tqdm(
    zip(all_movie_titles, all_movie_urls),
    total=min(len(all_movie_titles), len(all_movie_urls)),
):
    plot = get_plot_data(movie_url)
    if plot is not None:
        final_data.append([movie_title, movie_url, plot])

Collecting movie plots...


3709it [1:08:13,  1.10s/it]


In [46]:
# Build the result table; the column order mirrors each [title, url, plot]
# record in final_data.
df = pd.DataFrame(
    data=final_data,
    columns=['title', 'url', 'plot'],
)
# Trailing expression: shown as the cell output, (rows, columns).
df.shape

(3192, 3)

In [None]:
# Write the collected plots to a CSV file in the current working directory.
# NOTE(review): no `index` argument is given, so the DataFrame index is
# written as an extra leading column — confirm this is intended.
file_path = 'movie_plots.csv'
df.to_csv(path_or_buf=file_path)