In [347]:
from BeautifulSoup import BeautifulSoup
import bs4
import json
import requests
import argparse
import csv
import operator
import codecs
import cStringIO

# STEP 1: Generate List of Animated Movies Released Since 1980

In [308]:
def scrape_movie_lists(url, timeout=30):
    """
    Scrape one of Wikipedia's "List of animated feature films of <year>"
    pages and return the movie titles it lists.

    The parsing is specific to the layout of those pages: titles are read
    from the first table with class "wikitable", taking the link inside
    the italic (<i>) tag of the first data cell of every row.

    Parameters
    ----------
    url : str
        Address of the Wikipedia list page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list
        Text of every linked movie title, in table order.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out or returns an HTTP error status.
    IndexError
        If the page contains no "wikitable" table.
    """
    # Get web page; fail loudly on 4xx/5xx instead of parsing an error page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Create soup object from page content
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    tables = soup.findAll('table', 'wikitable', limit=1)
    if not tables:
        raise IndexError("No 'wikitable' table found at %s" % url)

    list_of_movies = []

    # Get movie titles: one per table row, taken from the row's first <td>.
    # Rows without any <td> simply yield no cells and are skipped.
    for row in tables[0].findAll("tr"):
        for td in row.findAll('td', limit=1):
            # Filter by the i tag since the "small" tag includes foreign languages
            i_tag = td.i
            if i_tag is None:
                # No italic title in this cell; td.i is None and has no ".a"
                continue
            a_tag = i_tag.a
            # Only linked titles are collected; unlinked ones are ignored
            if isinstance(a_tag, bs4.element.Tag):
                list_of_movies.append(a_tag.get_text())

    return list_of_movies

In [342]:
# Smoke-test the scraper on the 2016 page and show the first ten titles.
test = scrape_movie_lists(
    'https://en.wikipedia.org/wiki/List_of_animated_feature_films_of_2016'
)
print('\n'.join(test[:10]))

Accel World: Infinite Burst
Ajin -Shōtotsu-
Ajin: Shōgeki
Albert
The Angry Birds Movie
Another Day of Life
The Ape Story
Ballerina
Big Fish & Begonia
BoBoiBoy: The Movie


In [264]:
# To start, I will look at all animated films released in 1980 or later.
# Wikipedia keeps one list page per release year and the addresses differ
# only in the trailing year, so the URLs are generated from a template
# instead of being typed out one by one.

FIRST_YEAR = 1980
LAST_YEAR = 2016  # inclusive
URL_TEMPLATE = "https://en.wikipedia.org/wiki/List_of_animated_feature_films_of_{0}"

urls = [URL_TEMPLATE.format(year) for year in range(FIRST_YEAR, LAST_YEAR + 1)]

In [309]:
# Call the function scrape_movie_lists for each year, starting with 1980,
# and collect every title in a single flat list:

movie_titles_all = []
for url in urls:
    # extend() adds this year's titles in place, so the list stays flat,
    # nothing is re-copied per year, and an empty `urls` simply leaves
    # the result empty.
    movie_titles_all.extend(scrape_movie_lists(url))
print(len(movie_titles_all))

2629


In [343]:
# Preview the first ten titles of the combined list, one per line.
print('\n'.join(movie_titles_all[:10]))

15 Sonyeon Uju Pyoryugi
Animalympics
Be Forever Yamato
Bloody Lady
Bon Voyage, Charlie Brown (and Don't Come Back!!)
Cyborg 009: Legend of the Super Galaxy
Doksuri 5 Hyeongje
Doraemon: The Motion Picture
Eleven Hungry Cats
Fumoon


In [345]:
# UnicodeWriter recipe, copied from the examples in the Python 2 `csv` module documentation:

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.

    Each row is first formatted by a regular csv.writer into an in-memory
    buffer as UTF-8, then re-encoded into the target encoding and written
    to "f".
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        """
        f        -- stream that receives the encoded CSV data; only its
                    write() method is used by this class
        dialect  -- csv dialect handed to the underlying csv.writer
        encoding -- name of the encoding used for the data written to f
        kwds     -- extra keyword options forwarded to csv.writer
        """
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        # Incremental encoder instance for the target encoding, reused for every row
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """
        Format one row and write it to the target stream.

        Every item of "row" has .encode("utf-8") called on it, so items
        without an encode() method raise AttributeError
        (assumes cells are unicode text -- TODO confirm with callers).
        """
        # The csv writer is fed UTF-8 encoded cells and fills the queue
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of the iterable "rows" via writerow(), in order."""
        for row in rows:
            self.writerow(row)
        

In [348]:
# Persist the scraped titles to movie_list.csv, one title per row:

with open('movie_list.csv', 'wb') as output:
    writer = UnicodeWriter(output, lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC)
    # Each title becomes a single-column row
    writer.writerows([title] for title in movie_titles_all)