# Movies Scraper

In [67]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
import random

# Load the variables declared in the local .env file into the process environment
load_dotenv()

# OMDB API key; None when the OMDB_SECRET variable is not set
OMDB_SECRET = os.getenv("OMDB_SECRET")

In [2]:
# Read the IMDB top 250 list, then discard its "Unnamed: 0" column
top_movies = pd.read_csv("../data/top-250-movie-ratings.csv")
top_movies = top_movies.drop(columns = ["Unnamed: 0"])

### Acquire Data from the OMDB API

In [3]:
# Get the list of movie information as dataframes from the IMDB top 250.
# Each successful lookup contributes one single-row dataframe; failures are
# reported on stdout and skipped so one bad title does not abort the run.
movie_info_dfs = []

for index, row in top_movies.iterrows():

    movie_title = row['Title'] # Get movie title

    try:
        # Query the OMDB API by title. HTTPS keeps the API key out of clear
        # text, and the timeout stops a stalled connection from hanging the loop.
        response = requests.get(
            'https://www.omdbapi.com/',
            params = {'apikey': OMDB_SECRET, 't': movie_title},
            timeout = 10,
        )
        movie_info = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not valid JSON. Only these are
        # caught, so KeyboardInterrupt can still stop the cell.
        print(f"{index}. Error requesting '{movie_title}'.")
        continue

    if isinstance(movie_info, dict) and movie_info.get('Response') == 'True':
        # Convert to a dataframe and add to the list
        movie_info_dfs.append(pd.json_normalize(movie_info))
    else:
        print(f"{index}. Information for '{movie_title}' not found.")

66. Information for 'Oldeuboi' not found.
143. Error requesting 'Casino'.
182. Information for 'Relatos salvajes' not found.
205. Information for 'The Wages of Fear' not found.
213. Information for 'The Passion of Joan of Arc' not found.
243. Information for 'The Nights of Cabiria' not found.


### Preprocess the Data

In [55]:
# Concatenate all dataframes into one. Every per-movie frame has the single
# index label 0, so ignore_index gives each movie a unique 0..n-1 row label
# instead of leaving all rows labelled 0.
df = pd.concat(movie_info_dfs, ignore_index = True)

# Get rotten tomatoes rating
def get_rotten_tomatoes_rating(x):
    """Return the Rotten Tomatoes 'Value' from a list of rating dicts.

    Parameters
    ----------
    x : list of dict
        Rating entries, each expected to carry 'Source' and 'Value' keys.

    Returns
    -------
    The 'Value' of the first entry whose 'Source' is 'Rotten Tomatoes', or
    None when there is no such entry or `x` is not a list/tuple.
    """
    if not isinstance(x, (list, tuple)):
        return None

    # Match on the source name instead of a fixed position: when a movie has
    # no Rotten Tomatoes entry, a positional lookup returns another source.
    for rating in x:
        if isinstance(rating, dict) and rating.get('Source') == 'Rotten Tomatoes':
            return rating.get('Value')
    return None
# Extract the Rotten Tomatoes score from each movie's ratings list
df['rotten_tomatoes'] = df['Ratings'].apply(get_rotten_tomatoes_rating)

# Remove columns we won't be needing. A column only exists when at least one
# API response contained that key, so absent ones are ignored rather than
# raising a KeyError.
df = df.drop(
    columns = ['Response', 'totalSeasons', 'Website', 'DVD', 'imdbVotes', 'Production', 'Year', 'Ratings'],
    errors = 'ignore',
)

# Replace placeholder strings ('N/A', 'None', empty) with None
for placeholder in ('N/A', 'None', ''):
    df[df == placeholder] = None

# Rename columns: lower-case everything first, then apply the output names
df = df.rename(columns = str.lower).rename(columns = {
    'metascore': 'metacritic',
    'boxoffice': 'box_office',
    'imdbrating': 'imdb_rating',
    'writer': 'writers',
    'genre': 'genres',
    'director': 'directors',
    'imdbid': 'id',
})

# Correct formattings and namings
# Runtime: drop the literal ' min' suffix and store as a number
df['runtime'] = df['runtime'].str.replace(' min', '', regex = False).astype(float)
df['imdb_rating'] = df['imdb_rating'].astype(float)
# Strip the trailing '%' instead of keeping the first two characters, so that
# '100%' becomes 100.0 (not 10.0) and '9%' becomes 9.0. Values that are still
# not numeric become NaN.
df['rotten_tomatoes'] = pd.to_numeric(
    df['rotten_tomatoes'].str.rstrip('%'), errors = 'coerce'
).astype(float)
# Box office: remove the dollar sign and thousands separators
df['box_office'] = df['box_office'].str.replace(r'[$,]', '', regex = True).astype(float)
df['released'] = pd.to_datetime(df['released'])
df['metacritic'] = df['metacritic'].astype(float)
# Keep only the first comma-separated writer; the isinstance check covers
# both None and NaN as the missing-value marker
df['writers'] = df['writers'].apply(lambda x: x.split(',')[0] if isinstance(x, str) else None)

### Movies

In [56]:
# Renumber the rows 0..n-1, discarding the previous index
df = df.reset_index(drop = True)

# Columns written to the movies table, in output order
movie_cols = [
    'id', 'title', 'type', 'poster', 'writers',
    'runtime', 'released', 'rated', 'country', 'language',
    'imdb_rating', 'metacritic', 'rotten_tomatoes', 'box_office', 'plot',
]
movies_table = df[movie_cols]
movies_table.to_csv("../data/csv/movies.csv", index = False)

### People

In [226]:
people = []
people_id = 0
_seen_people = set()  # names already registered, so each person gets exactly one row/id

# Seed the generator so the random placeholder attributes are reproducible
random.seed(0)

def get_people(names):
    """Register each person in a ', '-separated string of names.

    Appends one tuple (id, full_name, is_married, email, nationality,
    birth_date) to the module-level `people` list for every name not seen
    before, and advances the global `people_id` counter. is_married and
    nationality are random placeholders; email and birth_date are None.

    Names already registered are skipped, so a person credited on several
    movies, or as both actor and director, is stored once.
    """
    global people_id

    for name in names.split(', '):
        if name in _seen_people:
            continue
        _seen_people.add(name)

        people.append(( people_id, name, random.choice([0, 1]), None, random.choice(['American', 'Canadian']), None ))
        people_id += 1

# Register every credited actor first, then every credited director,
# skipping movies where the column is missing
for credit_column in ('actors', 'directors'):
    for credited_names in df[credit_column].dropna():
        get_people(credited_names)

# Turn the collected tuples into the people table and write it out
people_columns = ['id', 'full_name', 'is_married', 'email', 'nationality', 'birth_date']
people = pd.DataFrame(people, columns = people_columns)

people.to_csv('../data/csv/people.csv', index = False)

### Actors

In [227]:
# Build the casts table: one row per (actor, movie) credit.
# Map each name to the id of its first row in `people`, built once instead of
# scanning the whole table for every credit.
first_people = people.drop_duplicates('full_name')
person_id_by_name = dict(zip(first_people['full_name'], first_people['id']))

casts = []

for index, row in df.iterrows():
    actor_names = row['actors']

    # A missing value may be None or NaN; only strings can be split
    if not isinstance(actor_names, str):
        continue

    for name in actor_names.split(', '):
        # Skip names absent from `people` rather than failing on an empty lookup
        if name not in person_id_by_name:
            continue

        casts.append((
            person_id_by_name[name],
            row['id'],
            name,  # NOTE(review): 'role' is filled with the actor's own name - confirm intended
            None,
        ))

casts = pd.DataFrame(casts, columns = ['person_id', 'movie_id', 'role', 'contract'])
casts.to_csv('../data/csv/casts.csv', index = False)

### Directors

In [228]:
# Build the directors table: one row per (director, movie) credit.
# Map each name to the id of its first row in `people`, built once instead of
# scanning the whole table for every credit.
first_people = people.drop_duplicates('full_name')
person_id_by_name = dict(zip(first_people['full_name'], first_people['id']))

directors = []

for index, row in df.iterrows():
    director_names = row['directors']

    # A missing value may be None or NaN; only strings can be split
    if not isinstance(director_names, str):
        continue

    for name in director_names.split(', '):
        # Skip names absent from `people` rather than failing on an empty lookup
        if name not in person_id_by_name:
            continue

        directors.append((
            person_id_by_name[name],
            row['id'],
            None,
        ))

directors = pd.DataFrame(directors, columns = ['person_id', 'movie_id', 'contract'])
directors.to_csv('../data/csv/directors.csv', index = False)

### Genres

In [244]:
# Collect the distinct genre names across all movies. Sorting makes the ids
# assigned below identical on every run (set iteration order is not stable
# between interpreter sessions), and dropna skips movies with no genre.
genre_names = sorted({
    genre.strip()
    for genre_list in df['genres'].dropna()
    for genre in genre_list.split(',')
    if genre.strip()
})

genres = pd.DataFrame({'genre': genre_names})
genres['id'] = genres.index
genres.to_csv('../data/csv/genres.csv', index = False)

### Movie Genres

In [266]:
# Build the movie <-> genre link table.
# Genre name -> genre id, built once instead of filtering `genres` per lookup
genre_id_by_name = dict(zip(genres['genre'], genres['id']))

movies_genres = []

for index, row in df.iterrows():
    genre_list = row['genres']

    # A missing value may be None or NaN; only strings can be split
    if not isinstance(genre_list, str):
        continue

    for genre in genre_list.split(','):
        genre = genre.strip()

        if genre in genre_id_by_name:
            movies_genres.append((
                row['id'],
                genre_id_by_name[genre]
            ))

movies_genres = pd.DataFrame(movies_genres, columns = ['movie_id', 'genre_id'])
movies_genres.to_csv('../data/csv/movies_genres.csv', index = False)

### Awards

In [268]:
# Display the 'awards' column of the movie dataframe
df['awards']

0      Nominated for 7 Oscars. 21 wins & 43 nominatio...
1           Won 3 Oscars. 31 wins & 30 nominations total
2           Won 6 Oscars. 17 wins & 20 nominations total
3         Won 2 Oscars. 159 wins & 163 nominations total
4      Nominated for 3 Oscars. 17 wins & 13 nominatio...
                             ...                        
239         Won 2 Oscars. 16 wins & 17 nominations total
240    Nominated for 3 Oscars. 19 wins & 29 nominatio...
241                                                1 win
242                             17 wins & 45 nominations
243                             20 wins & 15 nominations
Name: awards, Length: 244, dtype: object