In [115]:
import requests
from bs4 import BeautifulSoup as BS
import bs4

In [15]:
def wiki_page(year):
    headers = {'user-agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
    url = f'https://en.wikipedia.org/wiki/{year}_in_film'
    page = requests.get(url, headers = headers)
    return BS(page.content, 'html.parser')
    

In [170]:
page = wiki_page(2017)

In [17]:
page.prettify()

'<!DOCTYPE html>\n<html class="client-nojs" dir="ltr" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <title>\n   2018 in film - Wikipedia\n  </title>\n  <script>\n   document.documentElement.className=document.documentElement.className.replace(/(^|\\s)client-nojs(\\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"2018_in_film","wgTitle":"2018 in film","wgCurRevisionId":914884414,"wgRevisionId":914884414,"wgArticleId":39649311,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 Swedish-language sources (sv)","CS1 errors: markup","Wikipedia semi-protected pages","Wikipedia pending changes protected pages","Articles with short description","Use mdy dates from April 2018","Film by year","2018 in film","2018-related lists","Media timelines by year"],"wgBreakFrames":!1,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTabl

In [241]:
def get_four_tables(page, year):
    try:
        section_title_id = f"{year}_films"
    
        jan_to_mar_table = page.find(id=section_title_id).findNext('table')
        apr_to_jun_table = jan_to_mar_table.findNext('table')
        jul_to_sep_table = apr_to_jun_table.findNext('table')
        oct_to_dec_table = jul_to_sep_table.findNext('table')
        tables = [jan_to_mar_table, apr_to_jun_table, jul_to_sep_table, oct_to_dec_table]
    
    except AttributeError:
        section_title_id=f"{year}_wide-release_films"
        
        jan_to_mar_table = page.find(id=section_title_id).findNext('table')
        apr_to_jun_table = jan_to_mar_table.findNext('table')
        jul_to_sep_table = apr_to_jun_table.findNext('table')
        oct_to_dec_table = jul_to_sep_table.findNext('table')
        tables = [jan_to_mar_table, apr_to_jun_table, jul_to_sep_table, oct_to_dec_table]
    
    except:
        print(f"Could not find tables for year {year}")
        tables = []
    
    return tables

In [58]:
def get_rows(table):
    rows = []
    for i in range(1, len(table.select('tr'))):
        rows.append(table.select('tr')[i])
    return rows

In [93]:
# def get_title(row):
#     title_or_link = row.select('td i')[0].contents[0]
#     if title_or_link.select('a'):
#         title = str(title_or_link.find('a').text)
#     else:
#         title = str(title_or_link)
#     return title

In [249]:
def get_title(row):
    
    if row.select('td i'):
        if len(row.select('td i')[0].contents) <= 1:
            title_or_link = row.select('td i')[0].contents[0]
        else:
            print(f"Title starting with {row.select('td i')[0].contents[0]} is formatted oddly. Cannot grab.")
            return None
    else:
        return None
    
    if type(title_or_link) is bs4.element.Tag:
        title = str(title_or_link.contents[0])
        #title = str(title_or_link.text)
    elif type(title_or_link) is bs4.element.NavigableString:
        title = str(title_or_link)
    else:
        title = None
        
    return title

In [228]:
def get_movie_titles(table):
    rows = get_rows(table)
    movie_titles = []
    for row in rows:
        movie_title = get_title(row)
        if movie_title:
            movie_titles.append(movie_title)
        else:
            print('Could Not Find Movie Title in Row (Skipping)')
    return movie_titles

In [229]:
def parse_page(page, year):
    tables = get_four_tables(page, year)
    movie_titles = []
    for table in tables:
        movie_titles += get_movie_titles(table)
    return movie_titles

In [230]:
def scrape_movies(year):
    movie_titles = parse_page(wiki_page(year), year)
    titles_and_years = list(map(lambda x: (x, year), movie_titles))
    return titles_and_years

In [231]:
def scrape_wiki(start_year, end_year):
    result = []
    for year in range(start_year, end_year+1):
        result += scrape_movies(year)
    return result

In [250]:
movies = scrape_wiki(2004,2018)

Title starting with <a href="/wiki/The_Illusionist_(2006_film)" title="The Illusionist (2006 film)">The Illusionist</a> is formatted oddly. Cannot grab.
Could Not Find Movie Title in Row (Skipping)
Title starting with <a href="/wiki/Babel_(film)" title="Babel (film)">Babel</a> is formatted oddly. Cannot grab.
Could Not Find Movie Title in Row (Skipping)
Title starting with <a href="/wiki/Volver" title="Volver">Volver</a> is formatted oddly. Cannot grab.
Could Not Find Movie Title in Row (Skipping)
Title starting with <a href="/wiki/Bobby_(2006_film)" title="Bobby (2006 film)">Bobby</a> is formatted oddly. Cannot grab.
Could Not Find Movie Title in Row (Skipping)
Title starting with <a href="/wiki/For_Your_Consideration_(film)" title="For Your Consideration (film)">For Your Consideration</a> is formatted oddly. Cannot grab.
Could Not Find Movie Title in Row (Skipping)
Title starting with <a href="/wiki/Dreamgirls_(film)" title="Dreamgirls (film)">Dreamgirls</a> is formatted oddly. Canno

Title starting with <a href="/wiki/Brideshead_Revisited_(film)" title="Brideshead Revisited (film)">Brideshead Revisited</a> is formatted oddly. Cannot grab.
Could Not Find Movie Title in Row (Skipping)
Title starting with <a href="/wiki/Frozen_River" title="Frozen River">Frozen River</a> is formatted oddly. Cannot grab.
Could Not Find Movie Title in Row (Skipping)
Title starting with <a href="/wiki/The_Midnight_Meat_Train" title="The Midnight Meat Train">The Midnight Meat Train</a> is formatted oddly. Cannot grab.
Could Not Find Movie Title in Row (Skipping)
Title starting with <a href="/wiki/Hamlet_2" title="Hamlet 2">Hamlet 2</a> is formatted oddly. Cannot grab.
Could Not Find Movie Title in Row (Skipping)
Title starting with <a href="/wiki/Towelhead_(film)" title="Towelhead (film)">Towelhead</a> is formatted oddly. Cannot grab.
Could Not Find Movie Title in Row (Skipping)
Title starting with <a href="/wiki/Appaloosa_(film)" title="Appaloosa (film)">Appaloosa</a> is formatted oddly.

In [242]:
movies

[('Chasing Liberty', 2004),
 ("My Baby's Daddy", 2004),
 ('Along Came Polly', 2004),
 ("Teacher's Pet", 2004),
 ('Torque', 2004),
 ('The Butterfly Effect', 2004),
 ('Win a Date with Tad Hamilton!', 2004),
 ('The Big Bounce', 2004),
 ('The Perfect Score', 2004),
 ('You Got Served', 2004),
 ('Barbershop 2: Back in Business', 2004),
 ('Catch That Kid', 2004),
 ('Miracle', 2004),
 ('Pinocchio 3000', 2004),
 ('50 First Dates', 2004),
 ('Against the Ropes', 2004),
 ('Confessions of a Teenage Drama Queen', 2004),
 ('EuroTrip', 2004),
 ('Welcome to Mooseport', 2004),
 ('The Passion of the Christ', 2004),
 ('Club Dread', 2004),
 ('Twisted', 2004),
 ('Hidalgo', 2004),
 ('Starsky & Hutch', 2004),
 ('Agent Cody Banks 2: Destination London', 2004),
 ('Secret Window', 2004),
 ('Spartan', 2004),
 ('Dawn of the Dead', 2004),
 ('Eternal Sunshine of the Spotless Mind', 2004),
 ('Taking Lives', 2004),
 ('Jersey Girl', 2004),
 ('The Ladykillers', 2004),
 ('Never Die Alone', 2004),
 ('Scooby-Doo 2: Monster

In [247]:
import csv

with open('movies.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerows(movies)