# Web-scraping with Python

The idea is to visit each movie listed on the Wikipedia page https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films and to store the movie's infobox table (the one shown under its poster). The table contains each movie's actors, directors, running time, budget, etc. <br>
The subsequent steps reformat some of the data and fetch the IMDb score for each movie.<br>
Credits to https://www.youtube.com/watch?v=Ewgy-G9cmbg, where I found this task for the Disney movie dataset.

In [1]:
from bs4 import BeautifulSoup as bs
import requests

In [2]:
r = requests.get('https://en.wikipedia.org/wiki/Tenet_(film)')
soup = bs(r.content)
contents = soup.prettify()   #print(contents)  - nice print

In [3]:
infobox = soup.find(class_='infobox vevent')   #locating the table under the movie's poster
info_rows = infobox.find_all('tr')

In [4]:
def get_content_value(row_data):
    """
    Extracts the value stored in one infobox cell.

    row_data is the cell element of an infobox row (a BeautifulSoup tag).
    Returns a list of strings when the cell holds several entries
    (either <li> items or pieces separated by <br>), otherwise a single string.
    Non-breaking spaces are turned into ordinary spaces in every branch,
    so that words are never glued together (e.g. '150\xa0minutes').
    """
    if row_data.find('li'):
        return [li.get_text(' ', strip=True).replace('\xa0', ' ')
                for li in row_data.find_all('li')]

    if row_data.find('br'):   #In case actors are separated by <br> tag
        return [text.replace('\xa0', ' ') for text in row_data.stripped_strings]

    # Plain cell: replace (not delete) the non-breaking space, as in the <li> branch.
    return row_data.get_text(' ', strip=True).replace('\xa0', ' ')

In [5]:
#Example of saving movie's data
movie_info = {}
for index, row in enumerate(info_rows):
    if index == 0:
        # The first row of the infobox carries the movie title.
        movie_info['title'] = row.find('th').get_text(' ',strip=True)
    elif index == 1:
        continue   # second row is skipped on purpose (presumably the poster row)
    else:
        header = row.find('th')
        cell = row.find('td')
        # Rows without both a header and a value cell have nothing to store;
        # skipping them avoids an AttributeError on None.
        if header is None or cell is None:
            continue
        content_key = header.get_text(' ',strip=True)
        content_value = get_content_value(cell)
        movie_info[content_key] = content_value
movie_info

# Solving the task for every Oscar movie

In [6]:
def clean_tags(soup):
    """
    Strips reference markers from the parsed page.

    Every <sup> and <span> element found in soup is removed from the tree
    in place; nothing is returned.
    """
    unwanted = soup.find_all(['sup','span'])
    for node in unwanted:
        node.decompose()

def get_info_box(url, timeout=30):
    """
    Downloads a page and returns its infobox table as a dictionary.

    url      - address of the movie page.
    timeout  - seconds to wait for the server before giving up.

    The first infobox row is stored under the key 'title'; every further row
    that has both a header (<th>) and a value cell (<td>) is stored as
    header text -> value produced by get_content_value.

    Raises ValueError when the page has no element with class 'infobox vevent',
    and whatever requests raises on connection problems or timeouts.
    """
    r = requests.get(url, timeout=timeout)   # never hang forever on a stalled connection
    soup = bs(r.content)

    infobox = soup.find(class_='infobox vevent')
    if infobox is None:
        raise ValueError(f'No infobox found at {url}')
    info_rows = infobox.find_all('tr')
    clean_tags(soup)   # the rows belong to the same tree, so they are cleaned too

    movie_info = {}
    for index, row in enumerate(info_rows):
        header = row.find('th')
        if index == 0:
            movie_info['title'] = header.get_text(' ',strip=True)
            continue

        cell = row.find('td')
        # Rows such as the poster or section headings lack a header or a value.
        if header is None or cell is None:
            continue
        movie_info[header.get_text(' ',strip=True)] = get_content_value(cell)
    return movie_info


In [7]:
get_info_box('https://en.wikipedia.org/wiki/Hair_Love')

In [None]:
from urllib.parse import urljoin

a = requests.get('https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films')
soup1 = bs(a.content)
infobox1 = soup1.select('.wikitable.sortable i a') #taking all the movies from the table

base_path = 'https://en.wikipedia.org/'
movie_info_list = []

for index,movie in enumerate(infobox1):
    if index % 50 == 0:   #progress indicator every 50 movies
        print(index)
    try:
        # urljoin avoids the doubled slash that plain concatenation of
        # base_path and a '/wiki/...' href would produce.
        full_path = urljoin(base_path, movie['href'])
        movie_info_list.append(get_info_box(full_path))
    except Exception as e:  #Checking errors: report the movie and keep going
        print(movie.get_text())
        print(e)

In [None]:
len(movie_info_list)

# Save/reload Movie data

In [8]:
import json

def save_data(title,data):
    """
    Writes data to the file named title as UTF-8 JSON,
    indented by two spaces and with non-ASCII characters kept as they are.
    """
    with open(title, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=2, ensure_ascii=False)
        
def load_data(title):
    """
    Reads the UTF-8 JSON file named title and returns the decoded object.
    """
    with open(title, encoding='utf-8') as in_file:
        raw_text = in_file.read()
    return json.loads(raw_text)

In [None]:
save_data('oscar_data_cleaned.json',movie_info_list)

# Clean data

In [9]:
movie_info_list = load_data('../input/oscar-data-clean-json/oscar_data_cleaned.json')

### Subtasks
- ~~Clean up refs~~
- ~~Convert running time into integers~~
- ~~Convert dates into datetime object~~
- ~~Split up the long strings~~
- ~~Convert Budget & Box offices to numbers~~

In [None]:
movie_info_list[-10]['Running time']


In [10]:
#Convert running time into integers
def minutes_to_integer(running_time):
    """
    Transforms the running time to integer.

    running_time is either a string such as '150 minutes' or a list of such
    strings (only the first entry is used). The leading run of digits is
    returned as an int. None is returned for 'N/A', for an empty list and
    for text that does not start with a number.
    """
    import re   # imported here because the notebook imports re only in a later cell

    if isinstance(running_time,list):
        if not running_time:
            return None
        running_time = running_time[0]

    if not isinstance(running_time,str) or running_time == 'N/A':
        return None

    # Taking just the leading digits also copes with values like '150minutes'
    # or '1:45', where everything after the first number is ignored.
    leading_digits = re.match(r'\s*(\d+)', running_time)
    if leading_digits is None:
        return None
    return int(leading_digits.group(1))
deleted_titles = ['Funny Girl','Fanny and Alexander','War and Peace','Mister Roberts',\
                  'The Walls of Malapaga','Wings']  #Movies with non-ordinary time length

# Filter first, then convert. Calling remove() on the list while looping over it
# makes the loop jump over the element that follows every removed movie.
# Slice assignment keeps the same list object.
movie_info_list[:] = [movie for movie in movie_info_list
                      if movie['title'] not in deleted_titles]

for movie in movie_info_list:
    movie['Running time (int)'] = minutes_to_integer(movie.get('Running time','N/A'))

In [None]:

#[movie.get('Budget','N/A') for movie in movie_info_list]

In [11]:
import re
#Converts budgets and box offices into integers
# Unit words that may follow a dollar figure.
amounts = r"thousand|million|billion"
# A figure with optional thousands separators and optional decimals.
number = r"\d+(,\d{3})*\.*\d*"

# Dollar figure, optionally a range ("$200-205", "$200–205", "$200—205",
# "$200 to 205"), followed by a unit word. The en dash is included because
# otherwise such a range only matches value_re and loses its unit word.
word_re = fr"\${number}(-|–|—|\sto\s)?({number})?\s({amounts})"
# Bare dollar figure without a unit word.
value_re = fr"\${number}"

def word_to_value(word):
    """
    Returns the multiplier for a unit word:
    'thousand', 'million' or 'billion'. Any other word raises KeyError.
    """
    multipliers = dict(thousand=10**3, million=10**6, billion=10**9)
    return multipliers[word]

def parse_word_syntax(string):
    """
    Converts an amount written with a unit word (e.g. '$790 Million')
    into a float: first number found in the string times the unit's multiplier.
    """
    digits = re.search(number,string).group()
    magnitude = float(digits.replace(',',''))

    # Case-insensitive search so that 'Million' is recognised as well;
    # lower-cased because word_to_value expects lower-case keys.
    unit_match = re.search(amounts,string,flags = re.I)
    unit = unit_match.group().lower()

    multiplier = word_to_value(unit)
    return magnitude*multiplier

def parse_value_syntax(string):
    """
    Converts an amount written only with digits (e.g. '$3,500,000')
    into a float, dropping the thousands separators.
    """
    found = re.search(number,string)
    without_commas = found.group().replace(',','')
    return float(without_commas)

def money_conversion(money):
    """
    Transforms string containing budget to numerical representation.

    money is a string or a list of strings (only the first entry is used).
    Returns a float, or None when the value is 'N/A', an empty list,
    not a string, or contains no dollar amount.
    """
    if isinstance(money,list):  #If there are several values in a list
        if not money:           #An empty list has no first value to take
            return None
        money = money[0]

    if not isinstance(money,str) or money == 'N/A':
        return None

    word_syntax = re.search(word_re,money,flags=re.I)   #If money has words
    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    value_syntax = re.search(value_re,money) #If it doesn't have words
    if value_syntax:
        return parse_value_syntax(value_syntax.group())

    return None
print(money_conversion('$790 Million'))

In [12]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget','N/A'))
    movie['Box office (float)'] = money_conversion(movie.get('Box office','N/A'))

In [13]:
movie_info_list[300]

In [14]:
print([movie.get('Release date','N/A') for movie in movie_info_list])

In [15]:
#Converts dates into datetimes
from datetime import datetime
dates = [movie.get('Release date','N/A') for movie in movie_info_list]

def clean_date(date):
    """
    Returns the part of date that precedes the first '(',
    with surrounding whitespace removed.
    """
    before_bracket, _, _ = date.partition('(')
    return before_bracket.strip()
    
def date_conversion(date):
    """
    Transforms release date into datetime format.

    date is a string or a list of strings (only the first entry is used).
    Two layouts are recognised: 'July 4, 1999' and '4 July 1999'.
    Returns a datetime, or None for 'N/A', an empty list, or a date in
    any other layout.
    """
    if isinstance(date,list):
        if not date:   #An empty list has no first entry to take
            return None
        date = date[0]
    if date == 'N/A':
        return None

    date_str = clean_date(date)

    fmts = ['%B %d, %Y','%d %B %Y']
    for fmt in fmts:
        try:
            return datetime.strptime(date_str,fmt)
        except ValueError:   #Layout did not match - try the next one
            continue
    return None

In [16]:
for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date','N/A'))

In [17]:
movie_info_list[0]

# Saving data into pickle format

`datetime` objects cannot be serialized to **json**, so **pickle** is used instead.

In [18]:
import pickle

def save_pickle(name,data):
    """
    Pickles data into the file called name,
    using the highest protocol available.
    """
    with open(name, mode='wb') as sink:
        pickle.dump(data, sink, pickle.HIGHEST_PROTOCOL)

def load_pickle(name):
    """
    Returns the object stored in the pickle file called name.
    Unpickling can execute code, so only open files you created yourself.
    """
    with open(name, mode='rb') as source:
        return pickle.load(source)

In [25]:
save_pickle('oscar_data_cleaned_more.pickle',movie_info_list)

# Attach IMDB/Rotten Tomatoes/Metascore scores

In [19]:
movie_info_list = \
load_pickle('../input/oscar-data-cleaned-more-pickle/oscar_data_cleaned_more.pickle')

In [None]:
#http://www.omdbapi.com/?apikey=[yourkey]&

In [34]:
import os
import urllib

# HTTPS, so that the API key in the query string is not sent in clear text.
base_link = 'https://www.omdbapi.com/?'

def omdb_info(title):
    """
    Gets info from the omdb web-site(special api for movie scores) for a specific movie.
    You need to download your own apikey from base_link for this function.

    The key is read from the OMDB_API_KEY environment variable (KeyError if unset).
    Returns the decoded JSON answer. The HTTP status is deliberately not checked,
    so an error answer from the service is returned to the caller as JSON too.
    """
    # Explicit submodule import: a bare `import urllib` does not by itself
    # guarantee that urllib.parse has been loaded.
    from urllib.parse import urlencode

    parameters = {'apikey':os.environ['OMDB_API_KEY'],'t':title}
    full_link = base_link + urlencode(parameters)
    # The timeout keeps one stalled request from blocking a long batch run.
    return requests.get(full_link, timeout=30).json()

def get_rotten_tomatoes_score(omdb_info):
    """
    Looks through the 'Ratings' list of an OMDb answer and returns the 'Value'
    of the first entry whose 'Source' is 'Rotten Tomatoes'.
    Returns None when there is no such entry or no 'Ratings' key at all.
    """
    tomato_values = (
        entry['Value']
        for entry in omdb_info.get('Ratings',[])
        if entry['Source'] == 'Rotten Tomatoes'
    )
    return next(tomato_values, None)
            

info = omdb_info('A Beautiful Mind')
info
print(get_rotten_tomatoes_score(info))

In [29]:
# Ask OMDb about every movie and store the three scores on the movie itself.
for index, movie in enumerate(movie_info_list):
    if not index % 50:   #progress indicator
        print(index)

    title = movie['title']
    omdb = omdb_info(title)

    # dict.get already yields None for a missing key
    movie['IMDB'] = omdb.get('imdbRating')
    movie['Metascore'] = omdb.get('Metascore')
    movie['Rotten Tomatoes'] = get_rotten_tomatoes_score(omdb)

In [46]:
#As I'm not a subscriber of the Patreon, I don't get ratings after making 1000 requests a day
movie_info_list[-100]

In [47]:
save_pickle('oscar_data_final.pickle',movie_info_list)

# Save data as json and csv

In [48]:
movie_info_list = load_pickle('../input/oscar-data-final/oscar_data_final.pickle')

In [49]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [50]:
movie_info_copy[20]

In [51]:
# json cannot store datetime objects, so each one is replaced by its text form
# (e.g. 'July 04, 1999'); missing dates are stored as None.
for movie in movie_info_copy:
    date = movie['Release date (datetime)']
    movie['Release date (datetime)'] = date.strftime('%B %d, %Y') if date else None

In [55]:
save_data('oscar_data_final.json',movie_info_copy)

In [57]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

In [58]:
df.head()

In [62]:
df.to_csv('oscar_data_final.csv')

In [65]:
df.info()