## Scraping Wikipedia: American films of 1995

In [1]:
import os
from random import randint
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Raise pandas' display row limit to 500 so long frames are shown in full
# instead of being truncated in cell output.
pd.set_option('display.max_rows', 500)

In [2]:
# Download the list page and parse it.
url = 'https://en.wikipedia.org/wiki/List_of_American_films_of_1995'
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page and
# reporting a misleading table count below.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# find all tables of class wikitable
tables = soup.find_all(class_='wikitable')
len(tables)

8

In [3]:
# Collect the data from every wikitable into one DataFrame (df_final).
# Each table contributes one frame whose columns are 'path' (the href of the
# first link in the row's first cell, or 'no link') followed by the table's
# own header texts.
df_list = []

for table in tables:
    rows = table.find_all('tr')  # every row of this table, header included
    if not rows:  # nothing to read a header from; skip instead of IndexError
        continue

    # Header row: 'path' first, then the stripped text of each <th>.
    headers_list = ['path'] + [th.text.strip() for th in rows[0].find_all('th')]

    rows_dict = {}
    for n, row in enumerate(rows[1:], start=1):  # n matches the original row index
        cells = row.find_all('td')

        # Link of the first cell; fall back to 'no link' when the row has no
        # cells, the cell has no <a>, or the <a> carries no href.
        link = cells[0].find('a') if cells else None
        path = link.get('href', 'no link') if link is not None else 'no link'

        rows_dict['row{}'.format(n)] = [path] + [cell.text.strip() for cell in cells]

    df = pd.DataFrame.from_dict(rows_dict, orient='index', columns=headers_list)
    df_list.append(df)

# Stack the per-table frames and renumber the rows 0..N-1.
df_final = pd.concat(df_list).reset_index(drop=True)


In [4]:
# Go to the article of each film and collect its plot text into
# df_final['Plot']; rows that cannot be scraped keep the placeholder 'None'.
WIKI_ROOT = 'https://en.wikipedia.org'
# Section ids tried in this order; the first one found on the page wins.
PLOT_SECTION_IDS = ('Plot', 'Premise', 'Introduction', 'Summary',
                    'Overview', 'Plot_summary', 'Synopsis')


def find_plot_heading(film_soup):
    """Return the parent tag of the first matching section span, or None.

    Looks for a <span> whose id is one of PLOT_SECTION_IDS, in order.
    """
    for section_id in PLOT_SECTION_IDS:
        anchor = film_soup.find('span', id=section_id)
        if anchor is not None:
            return anchor.parent
    return None


def collect_section_text(heading):
    """Concatenate the text of the consecutive <p> siblings after `heading`.

    Stops at the first sibling that is not a <p>, or when there are no more
    siblings (find_next_sibling() returns None at the end of the parent).
    """
    paragraphs = []
    sibling = heading.find_next_sibling()
    while sibling is not None and sibling.name == 'p':
        paragraphs.append(sibling.text)
        sibling = sibling.find_next_sibling()
    return ''.join(paragraphs)


df_final['Plot'] = 'None'

for i in range(len(df_final)):
    path = df_final.loc[i, 'path']
    # Rows without a site-relative link ('no link', anchors, ...) cannot be
    # turned into an article URL; skip them before sleeping or requesting.
    if not isinstance(path, str) or not path.startswith('/'):
        continue

    sleep(randint(1, 3))  # random pause between requests
    url = WIKI_ROOT + path
    try:
        page = requests.get(url, timeout=30)
    except requests.RequestException:
        continue  # network problem for this film: move on to the next row

    soup = BeautifulSoup(page.content, 'html.parser')
    heading = find_plot_heading(soup)
    if heading is None:
        continue  # none of the known section ids exist on this page

    df_final.loc[i, 'Plot'] = collect_section_text(heading)

# Make sure the output folder exists so the scrape is not lost at the end.
os.makedirs('scraped_csv', exist_ok=True)
df_final.to_csv('scraped_csv/wiki_films.csv', index=False)

In [7]:
# Show the scraped table without the helper 'path' column (df_final itself is unchanged).
df_final.drop(columns='path')

Unnamed: 0,Title,Director,Cast,Genre,Note,Plot
0,12 Monkeys,Terry Gilliam,"Bruce Willis, Madeleine Stowe, Brad Pitt, Chri...",Science fiction,,"A deadly virus, released in 1996, wipes out al..."
1,3 Ninjas Knuckle Up,Sang-ok Shin,"Victor Wong, Charles Napier, Michael Treanor, ...",Comedy,,"Rocky (Michael Treanor), Colt (Max Elliott Sla..."
2,Above Suspicion,Steven Schachter,"Christopher Reeve, Kim Cattrall, Joe Mantegna",Thriller,HBO,Christopher Reeve stars as a paralyzed police ...
3,Ace Ventura: When Nature Calls,Steve Oedekerk,"Jim Carrey, Ian McNeice, Sophie Okonedo, Bob G...",Comedy,sequel,"In the Himalayas, after a failed rescue missio..."
4,The Addiction,Abel Ferrara,"Lili Taylor, Christopher Walken, Annabella Sci...",Horror,,"Kathleen Conklin, an introverted graduate stud..."
5,Amanda and the Alien,Jon Kroll,"John Diehl, Michael Dorn",Sci-fi comedy,,"Amanda Patterson, a typical Gen X girl and emp..."
6,The Amazing Panda Adventure,Christopher Cain,"Stephen Lang, Ryan Slater, Yi Ding",Family,,Ryan Tyler's father Michael sends him a plane ...
7,The American President,Rob Reiner,"Michael Douglas, Annette Bening, Martin Sheen,...",Romantic comedy,script by Aaron Sorkin; 5 Golden Globe nominat...,Popular Democratic President Andrew Shepherd i...
8,Angela,Rebecca Miller,Miranda Stuart Rhyne,Drama,,Angela is a 10-year-old girl trying to cope wi...
9,Angels and Insects,Philip Haas,"Mark Rylance, Patsy Kensit, Kristin Scott Thomas",Drama,,"William Adamson (Mark Rylance), a naturalist, ..."
