In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests
from IPython.core.display import display, HTML

In [2]:
def all_lowercase(df):
    for col in df.columns:
        if type(df[col][4]) == str:
            df[col] = df[col].str.lower()

# BAFTA

In [3]:
bafilmurl = 'https://en.wikipedia.org/wiki/BAFTA_Award_for_Best_Film'
bafilmsoup = BeautifulSoup(requests.get(bafilmurl).text, 'lxml')
bafilmtable = bafilmsoup.find_all(class_='wikitable') 

In [4]:
baftafilm = []

for i in range(len(bafilmtable)):
    rows = [row for row in bafilmtable[i].find_all('tr')]
    for row in rows:
        if len(row.find_all('th')) != 0:
            continue
        if len(row.find_all('td')) == 1:
            year = row.find('td').text[:4]
            if year == 'Best':
                year = 2019
            year = int(year)
        else:
            if len(baftafilm)<2:
                win = True
            elif baftafilm[-1][0] == year:
                win = False
            else:
                win = True
            link = row.find_all('a')[0]
            baftafilm.append([year] + [link.text] + [win])

In [5]:
bestfilm = pd.DataFrame(baftafilm, columns=['year_film','movie_title','bawin'])
bestfilm['category'] = 'picture'

In [6]:
bestfilm

Unnamed: 0,year_film,movie_title,bawin,category
0,1947,The Best Years of Our Lives,True,picture
1,1948,Hamlet,True,picture
2,1948,Crossfire,False,picture
3,1948,The Fallen Idol,False,picture
4,1948,Monsieur Vincent,False,picture
...,...,...,...,...
477,2019,1917,True,picture
478,2019,The Irishman,False,picture
479,2019,Joker,False,picture
480,2019,Once Upon a Time in Hollywood,False,picture


In [7]:
cats2 = ['Direction','Original_Screenplay','Adapted_Screenplay']
bafta2 = {'Direction':[],'Original_Screenplay':[],'Adapted_Screenplay':[]}
cats2dict = {'Direction':'director','Original_Screenplay':'screenplay',
             'Adapted_Screenplay':'screenplay'}

for x in cats2:
    ba2url = f'https://en.wikipedia.org/wiki/BAFTA_Award_for_Best_{x}'
    ba2soup = BeautifulSoup(requests.get(ba2url).text, 'lxml')
    ba2table = ba2soup.find_all(class_='wikitable') 

    for i in range(len(ba2table)):
        rows = [row for row in ba2table[i].find_all('tr')]
        for row in rows:
            if len(row.find_all('th')) != 0:
                continue
            if len(row.find_all('td')) == 1:
                year = row.find('td').text[:4]
                if year == 'Best':
                    year = 2019
                year = int(year)
            else:
                if len(bafta2[x])<1:
                    win = True
                elif bafta2[x][-1][0] == year:
                    win = False
                else:
                    win = True
                items = row.find_all('td')
                if x == 'Direction':
                    link = row.find_all('a')[1]
                else:
                    link = row.find_all('a')[0]
                bafta2[x].append([year] + [link.text] + [win] + [cats2dict[x]])

In [8]:
ba2 = bafta2['Direction']+bafta2['Original_Screenplay']+bafta2['Adapted_Screenplay']

In [9]:
best_dir_and_scrply = pd.DataFrame(ba2, columns=['year_film','movie_title','bawin','category'])

In [10]:
best0 = pd.concat([bestfilm,best_dir_and_scrply]).reset_index().drop(columns='index')
best0['nominee'] = best0.movie_title

In [11]:
cats3 = ['Actor_in_a_Leading_Role','Actress_in_a_Leading_Role',
         'Actor_in_a_Supporting_Role','Actress_in_a_Supporting_Role']
bafta3 = {'Actor_in_a_Leading_Role':[],'Actress_in_a_Leading_Role':[],
          'Actor_in_a_Supporting_Role':[],'Actress_in_a_Supporting_Role':[]}
cats3dict = {'Actor_in_a_Leading_Role':'actor','Actress_in_a_Leading_Role':'actress',
             'Actor_in_a_Supporting_Role':'supporting actor',
             'Actress_in_a_Supporting_Role':'supporting actress'}
cats3ab = {'Actor_in_a_Leading_Role':[1,8],'Actress_in_a_Leading_Role':[0,8],
             'Actor_in_a_Supporting_Role':[0,6],
             'Actress_in_a_Supporting_Role':[0,6]}

for x in cats3:
    ba3url = f'https://en.wikipedia.org/wiki/BAFTA_Award_for_Best_{x}'
    ba3soup = BeautifulSoup(requests.get(ba3url).text, 'lxml')
    ba3table = ba3soup.find_all(class_='wikitable')
    
    a = cats3ab[x][0]
    b = cats3ab[x][1]

    for i in range(a,b):
        rows = [row for row in ba3table[i].find_all('tr')]
        for row in rows:
            if len(row.find_all('th')) != 0:
                continue
            if len(row.find_all('td')) == 1:
                text = row.find('td').text[:4]
                if text == 'Best':
                    cat = row.find('td').text
                else:
                    year = int(text)
            else:
                if len(bafta3[x])<1:
                    win = True
                elif (bafta3[x][-1][0] == year) & (bafta3[x][-1][1] == cat):
                    win = False
                else:
                    win = True
                items = row.find_all('td')
                if len(row.find_all('a')) == 0:
                    continue
                actor = row.find_all('a')[0].text
                if len(row.find_all('i')) > 0:
                    film = row.find_all('i')[0].text
                else:
                    film = bafta3[x][-1][3]
                if film == actor:
                    actor = bafta3[x][-1][2]
                bafta3[x].append([year] + [cat] + [actor] + [film] + [win] + [cats3dict[x]])

In [12]:
ba3 = bafta3['Actor_in_a_Leading_Role'] + bafta3['Actor_in_a_Supporting_Role'] + bafta3[
    'Actress_in_a_Leading_Role'] + bafta3['Actress_in_a_Supporting_Role']

In [13]:
best_act = pd.DataFrame(ba3, columns=[
    'year_film','award','nominee','movie_title','bawin','category'])
best_act.drop(columns = 'award', inplace = True)
bafta = pd.concat([best0, best_act])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [14]:
bafta.nominee = bafta.nominee.str.lower()
bafta.movie_title = bafta.movie_title.str.lower()

# Guild

## DGA

In [15]:
dgaurl = 'https://en.wikipedia.org/wiki/Directors_Guild_of_America_Award_for_Outstanding_Directing_–_Feature_Film'
dgasoup = BeautifulSoup(requests.get(dgaurl).text, 'lxml')
dgatable = dgasoup.find_all(class_='wikitable') 

In [16]:
dgadir = []

for i in range(len(dgatable)):
    rows = [row for row in dgatable[i].find_all('tr')]
    for row in rows:
        if len(row.find_all('th')) != 0:
            continue
        if len(row.find_all('td')) == 4:
            year = row.find('td').text[:4]
            year = int(year)
            link = row.find_all('a')[2]
            win = True
        else:
            link = row.find_all('a')[1]
            win = False
        dgadir.append([year] + [link.text] + [win])

In [17]:
dga = pd.DataFrame(dgadir, columns=['year_film','movie_title','guildwin'])
dga['category'] = 'director'

## PGA

In [18]:
pgaurl = 'https://en.wikipedia.org/wiki/Producers_Guild_of_America_Award_for_Best_Theatrical_Motion_Picture'
pgasoup = BeautifulSoup(requests.get(pgaurl).text, 'lxml')
pgatable = pgasoup.find_all(class_='wikitable') 

In [19]:
pgafilm = []

for i in range(4):
    rows = [row for row in pgatable[i].find_all('tr')]
    for row in rows:
        if len(row.find_all('th')) != 0:
            continue
        if len(row.find_all('td')) == 4:
            year = row.find('td').text[:4]
            year = int(year)
            link = row.find_all('a')[1]
            win = True
        else:
            link = row.find_all('a')[0]
            win = False
        pgafilm.append([year] + [link.text] + [win])

In [20]:
pga = pd.DataFrame(pgafilm, columns=['year_film','movie_title','guildwin'])
pga['category'] = 'picture'

## WGA

In [21]:
wgacats = ['Adapted','Original']
wgadict = {'Adapted':[],'Original':[]}

for x in wgacats:
    wgaurl = f'https://en.wikipedia.org/wiki/Writers_Guild_of_America_Award_for_Best_{x}_Screenplay'
    wgasoup = BeautifulSoup(requests.get(wgaurl).text, 'lxml')
    wgatable = wgasoup.find_all(class_='wikitable') 

    for i in range(len(wgatable)):
        rows = [row for row in wgatable[i].find_all('tr')]           
        for row in rows:
            if len(row.find_all('td')) == 1:
                year = row.find('a').text
                year = int(year)
                if row.find('th') != None:
                    cat = row.find('th').text
            elif len(row.find_all('td')) == 3:
                year = row.find('a').text
                year = int(year)
                link = row.find_all('a')[3]
                win = True
                wgadict[x].append([year] + [cat] + [link.text] + [win])
            elif len(row.find_all('th')) == 1:
                cat = row.find('th').text
            elif len(row.find_all('th')) > 1:
                pass
            else:
                link = row.find_all('a')[0]
                if len(wgadict[x])<1:
                    win = True
                elif (wgadict[x][-1][0] == year) & (wgadict[x][-1][1] == cat):
                    win = False
                else:
                    win = True
                wgadict[x].append([year] + [cat] + [link.text] + [win])

In [22]:
wgascrply = wgadict['Adapted'] + wgadict['Original']

In [23]:
wga = pd.DataFrame(wgascrply, columns=['year_film','type','movie_title','guildwin'])
wga['category'] = 'screenplay'
wga.drop(columns='type',inplace=True)

## SAG

In [35]:
sag = pd.read_csv('screen_actor_guild_awards.csv')
sag.columns = ['year_film', 'category', 'nominee', 'movie_title', 'sagwin']
sag.year_film.iloc[5756] = '2018'
sag.movie_title.iloc[5756] = 'roman j israel esq'
sag = sag.iloc[:5757]
sag.year_film = sag['year_film'].str.split(' - ').str[0].astype(int) - 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [42]:
thedict = {'female':'actress','male':'actor','makeup':'makeup','star of the year':'new',
           'award':'honorary','achievement':'honorary','stunt':'stunt','cast':'picture'}

sag.category[sag.category.str.find('series')>-1] = 'tv'
sag.category[(sag.category.str.find('female')>-1) &
             (sag.category.str.find('support')>-1)] = 'supporting actress'
sag.category[(sag.category.str.find('male')>-1) &
             (sag.category.str.find('support')>-1)] = 'supporting actor'

for old in thedict:
    sag.category[sag.category.str.find(old)>-1] = thedict[old]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


## Clean and Export

In [37]:
all_lowercase(sag)
all_lowercase(dga)
all_lowercase(pga)
all_lowercase(wga)

In [38]:
sag.nominee = sag.nominee.str.lower().str.replace('.','')

In [39]:
dga.movie_title = dga.movie_title.str.replace(', ',' ').str.replace('.','').str.replace('  ',' ')
pga.movie_title = pga.movie_title.str.replace(', ',' ').str.replace('.','').str.replace('  ',' ')
wga.movie_title = wga.movie_title.str.replace(', ',' ').str.replace('.','').str.replace('  ',' ')

In [40]:
for row in range(len(sag)):
    if sag.category[row] == 'picture':
        sag.nominee[row] = sag.movie_title[row]

In [41]:
dga['nominee'] = dga['movie_title']
wga['nominee'] = wga['movie_title']
pga['nominee'] = pga['movie_title']

# Export to CSV

In [51]:
# dga.to_csv('data/dga.csv',index=False)
# wga.to_csv('data/wga.csv',index=False)
# pga.to_csv('data/pga.csv',index=False)
# sag.to_csv('data/sag.csv',index=False)
# bafta.to_csv('data/bafta.csv',index=False)