# Movies Scraper

In [1]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv

# take environment variables from .env
load_dotenv()

# Get OMDB secret
OMDB_SECRET = os.environ.get("OMDB_SECRET")

In [2]:
# Get the list of movies from the IMDB top 250
top_movies = pd.read_csv("../data/top-250-movie-ratings.csv").drop(columns = ["Unnamed: 0"])

### Aquire Data from the OMDB API

In [3]:
# Get the list of movie information as dataframes from the IMDB top 250
movie_info_dfs = []

for index, row in top_movies.iterrows():
    
    movie_title = row['Title'] # Get movie title
    try:

        # Get the movie information using the OMDB API
        movie_info = requests.get('http://www.omdbapi.com/', params = {
            'apikey': OMDB_SECRET,
            't': movie_title
        }).json()

        if movie_info['Response'] == 'True':
            # Convert to dataframe
            movie_info_df = pd.json_normalize(movie_info)

            # Add to list
            movie_info_dfs.append(movie_info_df)
        else:
            print(f"{index}. Information for '{movie_title}' not found.")
    except:
        print(f"{index}. Error requesting '{movie_title}'.")

66. Information for 'Oldeuboi' not found.
143. Error requesting 'Casino'.
182. Information for 'Relatos salvajes' not found.
205. Information for 'The Wages of Fear' not found.
213. Information for 'The Passion of Joan of Arc' not found.
243. Information for 'The Nights of Cabiria' not found.


### Preprocess the Data

In [14]:
# Concatenate all dataframes into one
df = pd.concat(movie_info_dfs)

# Get rotten tomatoes rating
def get_rotten_tomatoes_rating(x):
    try:
        return x[1]['Value']
    except:
        return None
df['rotten_tomatoes'] = df['Ratings'].apply(get_rotten_tomatoes_rating)

# Remove columns we won't be needing
df = df.drop(columns = [
    'Response', 'totalSeasons', 'Website', 'DVD', 'imdbVotes', 'imdbID', 'Production', 'Year', 'Ratings'
])

# Replace string N/A with None
df[df == 'N/A'] = None
df[df == 'None'] = None
df[df == ''] = None

# Rename columns
df.columns = [x.lower() for x in df.columns]
df.rename(columns = {
    'metascore': 'metacritic',
    'boxoffice': 'box_office',
    'imdbrating': 'imdb_rating',
    'writer': 'writers',
    'genre': 'genres',
    'director': 'directors',
    
}, inplace = True)

# Correct formattings and namings
df['runtime'] = df['runtime'].str.replace(' min', '').astype(float)
df['imdb_rating'] = df['imdb_rating'].astype(float)
df['rotten_tomatoes'] = df['rotten_tomatoes'].apply(lambda x: x[:2] if x is not None else None).astype(float)
df['box_office'] = df['box_office'].str.replace(r'[$,]', '').astype(float)
df['released'] = pd.to_datetime(df['released'])
df['metacritic'] = df['metacritic'].astype(float)
df['writers'] = df['writers'].apply(lambda x: x.split(',')[0] if x is not None else None)

  df['box_office'] = df['box_office'].str.replace(r'[$,]', '').astype(float)


In [16]:
movie_cols = ['title', 'type', 'poster', 'writers', 'runtime', 'released', 'rated', 'country', 'language', 'imdb_rating', 'metacritic', 'rotten_tomatoes', 'box_office', 'plot']
df[movie_cols].to_csv("../data/csv/movie_info.csv", index = False)