In [1]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Column names shared by the winners and nominees tables that
# AwardWikiCrawler accumulates (one list of values per column).
AWARD_COLS = ['award_company', 'award_type', 'movie', 'person', 'year']


In [2]:
# Lookup from an award category name to a shorter label.
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
ref = dict(
    best_director='director',
    best_actress='actor',
    best_actor='actor',
    best_picture='picture',
)

def get_cell(cell):
    """Return the display text of a table cell.

    The text of the first ``<a>`` element inside ``cell`` is preferred; when
    the cell contains no link, the cell's own ``text`` is returned instead.

    Args:
        cell: An element exposing ``find_all(name)`` and ``text`` (the callers
            in this notebook pass BeautifulSoup ``td`` tags).

    Returns:
        The chosen text as a string.
    """
    links = cell.find_all('a')
    # Only the "no link in this cell" case falls back to the raw text; any
    # other failure is a real error and is allowed to propagate.
    return links[0].text if links else cell.text
    
class AwardWikiCrawler():
    """Collect award winners and nominees from Wikipedia award tables.

    Parsed rows are accumulated column-wise in ``self.dfs``, which holds two
    tables (``'award_won'`` and ``'award_nominated'``), each a dict mapping
    every name in ``AWARD_COLS`` to a list of values. Either table can be
    passed straight to ``pd.DataFrame``.

    Row parsers are named ``<award_type>_<award_company>`` and are looked up
    by that name through ``crawler[name]``.
    """

    # ``class`` attribute value used to find the result tables on the page of
    # each supported award company.
    TABLE_CLASSES = {
        'oscars': 'wikitable sortable',
        'golden_globe': 'wikitable',
    }

    def __init__(self):
        """Start with empty winners and nominees tables."""
        self.dfs = {
            'award_won': {c: [] for c in AWARD_COLS},
            'award_nominated': {c: [] for c in AWARD_COLS}
        }

    def __getitem__(self, k):
        """Return the attribute (normally a row parser) named ``k``.

        ``best_actress`` is rewritten to ``best_actor`` first, so actress
        pages are parsed with the actor parsers.
        """
        return getattr(self, k.replace('best_actress', 'best_actor'))

    def _append_to_dfs(self, key, d):
        """Append one record to table ``key``.

        Columns of ``AWARD_COLS`` missing from ``d`` are stored as ``None`` so
        all column lists stay the same length. Returns ``self``.
        """
        table = self.dfs[key]
        for col in AWARD_COLS:
            table[col].append(d.get(col))
        return self

    @staticmethod
    def _extract(cells, layout):
        """Build a record from ``cells`` according to ``layout``.

        Args:
            cells: The ``td`` cells of one table row.
            layout: Mapping of field name to cell index.

        Returns:
            Dict of field name to cell text; fields whose index lies beyond
            the end of ``cells`` are left out.
        """
        return {
            field: get_cell(cells[index])
            for field, index in layout.items()
            if index < len(cells)
        }

    def load_data(self, url, award_type, award_company, timeout=30):
        """Download one award page and append its rows to ``self.dfs``.

        Args:
            url: Address of the page to fetch.
            award_type: Award category, e.g. ``'best_actor'``; stored on each
                record and used to pick the row parser.
            award_company: ``'oscars'`` or ``'golden_globe'``; selects the
                table class to search for and the row parser.
            timeout: Seconds passed to ``requests.get`` so a stalled
                connection cannot hang the notebook.

        Returns:
            ``self``.

        Raises:
            KeyError: ``award_company`` is not in ``TABLE_CLASSES``.
            requests.HTTPError: The server answered with an error status.
            Exception: Any error raised while parsing a row is re-raised after
                the offending row has been printed.
        """
        # Resolve the table class first so an unsupported company fails
        # before any network traffic happens.
        table_class = self.TABLE_CLASSES[award_company]
        is_oscars = award_company == 'oscars'
        parse_row = self[f'{award_type}_{award_company}']

        response = requests.get(url, timeout=timeout)
        # Do not try to parse an error page as if it were the award list.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # The current year carries over from row to row (and table to table)
        # until a row supplies a new one.
        year = None
        for table in soup.find_all('table', {'class': table_class}):
            # Fall back to the table itself when the markup has no <tbody>.
            body = table.tbody if table.tbody is not None else table
            # The first row of each table is skipped (presumably the header).
            for row in body.find_all('tr')[1:]:
                try:
                    to_append = 'award_nominated'

                    if is_oscars and row.th:
                        # A row with a <th> cell carries the year link and is
                        # recorded in the winners table.
                        year = row.th.find_all('a')[0].text
                        to_append = 'award_won'

                    if is_oscars and len(row.find_all('td')) == 1:
                        # A single-cell row only announces the year; keep the
                        # part before any '/' and record nothing.
                        year = row.td.find_all('a')[0].text.split('/')[0]
                        continue

                    assemble = parse_row(row)
                    if not is_oscars and 'year' in assemble:
                        # Golden Globe rows that include a year cell are
                        # recorded in the winners table.
                        year = assemble.pop('year')
                        to_append = 'award_won'

                    record = {
                        'award_type': award_type,
                        'award_company': award_company,
                        'year': year,
                        **assemble
                    }
                except Exception:
                    # Show the row that could not be parsed, then re-raise
                    # with the original traceback intact.
                    print(row)
                    raise

                self._append_to_dfs(to_append, record)

        return self

    def best_actor_oscars(self, row):
        """Parse an Oscars acting row: person in cell 0, movie in cell 2."""
        return self._extract(row.find_all('td'), {'person': 0, 'movie': 2})

    def best_director_oscars(self, row):
        """Parse an Oscars directing row: person in cell 0, movie in cell 1."""
        return self._extract(row.find_all('td'), {'person': 0, 'movie': 1})

    def best_picture_oscars(self, row):
        """Parse an Oscars best-picture row: movie in cell 0."""
        return self._extract(row.find_all('td'), {'movie': 0})

    def best_actor_golden_globe(self, row):
        """Parse a Golden Globe acting row.

        Four-cell rows start with a year cell; three-cell rows do not. Rows
        with any other number of cells yield an empty dict.
        """
        cells = row.find_all('td')
        layouts = {
            4: {'year': 0, 'person': 1, 'movie': 3},
            3: {'person': 0, 'movie': 2},
        }
        return self._extract(cells, layouts.get(len(cells), {}))

    def best_director_golden_globe(self, row):
        """Parse a Golden Globe directing row.

        Three-cell rows start with a year cell; two-cell rows do not. Rows
        with any other number of cells yield an empty dict.
        """
        cells = row.find_all('td')
        layouts = {
            3: {'year': 0, 'person': 1, 'movie': 2},
            2: {'person': 0, 'movie': 1},
        }
        return self._extract(cells, layouts.get(len(cells), {}))

    def best_picture_golden_globe(self, row):
        """Parse a Golden Globe best-picture row.

        Four-cell rows start with a year cell; three-cell rows do not. Rows
        with any other number of cells yield an empty dict.
        """
        cells = row.find_all('td')
        layouts = {
            4: {'year': 0, 'movie': 1, 'person': 2},
            3: {'movie': 0, 'person': 1},
        }
        return self._extract(cells, layouts.get(len(cells), {}))

# Crawler instance used by the cells below; every load_data call adds rows
# to the tables held in its ``dfs`` attribute.
award_wiki_crawler = AwardWikiCrawler()

In [3]:
# One entry per Wikipedia page to crawl; the keys match the keyword
# arguments of AwardWikiCrawler.load_data.
AWARD_LIST = [
    {
        'award_company': 'oscars',
        'award_type': 'best_director',
        'url': 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Director'
    },
    {
        'award_company': 'oscars',
        'award_type': 'best_actor',
        'url': 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor'
    },
    {
        'award_company': 'oscars',
        'award_type': 'best_actress',
        'url': 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actress'
    },
    {
        'award_company': 'oscars',
        'award_type': 'best_picture',
        'url': 'https://en.wikipedia.org/wiki/Academy_Award_for_Best_Picture'
    },
    {
        'award_company': 'golden_globe',
        'award_type': 'best_director',
        'url': 'https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Director'
    },
    {
        'award_company': 'golden_globe',
        'award_type': 'best_actor',
        'url': 'https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Motion_Picture_Drama'
    },
    {
        'award_company': 'golden_globe',
        'award_type': 'best_actress',
        'url': 'https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Actress_in_a_Motion_Picture_%E2%80%93_Drama'
    },
    {
        'award_company': 'golden_globe',
        'award_type': 'best_picture',
        'url': 'https://en.wikipedia.org/wiki/Golden_Globe_Award_for_Best_Motion_Picture_%E2%80%93_Drama'
    }
]

# Start from a fresh crawler so that re-running this cell does not append
# every page a second time to the tables accumulated on the instance.
award_wiki_crawler = AwardWikiCrawler()
for award_source in AWARD_LIST:
    try:
        award_wiki_crawler.load_data(**award_source)
    except Exception:
        # Show which page failed, then re-raise with the original traceback.
        print(award_source)
        raise
    

In [4]:
# Directory that receives one CSV per result table; created on demand so the
# cell also works in a fresh checkout.
file_path = 'files/'
os.makedirs(file_path, exist_ok=True)

# Table name -> DataFrame, kept for the inspection cells below.
dfs = {}
for k, data in award_wiki_crawler.dfs.items():
    df = pd.DataFrame(data)
    print(f'========= {k} ========')
    # DataFrame.info() prints its summary itself and returns None, so it is
    # not wrapped in print().
    df.info()
    print('========= end ========')
    df.to_csv(f'{file_path}{k}.csv')
    dfs[k] = df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   award_company  598 non-null    object
 1   award_type     598 non-null    object
 2   movie          597 non-null    object
 3   person         597 non-null    object
 4   year           598 non-null    object
dtypes: object(5)
memory usage: 23.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2832 entries, 0 to 2831
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   award_company  2832 non-null   object
 1   award_type     2832 non-null   object
 2   movie          2816 non-null   object
 3   person         2243 non-null   object
 4   year           2832 non-null   object
dtypes: object(5)
memory usage: 110.8+ KB
None


In [5]:
# Short aliases for the winners and nominees frames.
won, nom = dfs['award_won'], dfs['award_nominated']

In [6]:
# Winner rows for which no movie was captured.
won[won['movie'].isna()]

Unnamed: 0,award_company,award_type,movie,person,year
281,oscars,best_actress,,,2021


In [7]:
# Nominee rows for which no movie was captured.
nom[nom['movie'].isna()]

Unnamed: 0,award_company,award_type,movie,person,year
389,oscars,best_actor,,Charles Laughton,1935
390,oscars,best_actor,,Franchot Tone,1935
462,oscars,best_actor,,Burt Lancaster,1953
504,oscars,best_actor,,Peter O'Toole,1964
536,oscars,best_actor,,Laurence Olivier,1972
582,oscars,best_actor,,Albert Finney,1983
820,oscars,best_actress,,Bette Davis,1950
858,oscars,best_actress,,Elizabeth Taylor,1959
1778,golden_globe,best_director,,,1974
1915,golden_globe,best_director,,,2006


In [8]:
# Number of nominee rows per award category.
nom['award_type'].value_counts()

best_picture     877
best_director    656
best_actress     654
best_actor       645
Name: award_type, dtype: int64

In [10]:
# Winner rows of the best-actor category, across both award companies.
won.loc[won['award_type'].eq('best_actor')]

Unnamed: 0,award_company,award_type,movie,person,year
94,oscars,best_actor,The Last Command,Emil Jannings,1927
95,oscars,best_actor,In Old Arizona,Warner Baxter,1928
96,oscars,best_actor,Disraeli,George Arliss,1929
97,oscars,best_actor,A Free Soul,Lionel Barrymore,1930
98,oscars,best_actor,The Champ,Wallace Beery,1931
...,...,...,...,...,...
435,golden_globe,best_actor,Darkest Hour,Gary Oldman,2017
436,golden_globe,best_actor,Bohemian Rhapsody,Rami Malek,2018
437,golden_globe,best_actor,Joker,Joaquin Phoenix,2019
438,golden_globe,best_actor,Ma Rainey's Black Bottom,Chadwick Boseman,2020


In [None]:
# 'golden_globe' is a value of the award_company column, not a column name,
# so the nominee rows are selected by filtering on that column.
nom[nom['award_company'] == 'golden_globe']