# Movies Scraper

In [54]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from random import randrange
import string
from datetime import datetime, timedelta
import random
import hashlib

# take environment variables from .env
load_dotenv()

# Get OMDB secret
OMDB_SECRET = os.environ.get("OMDB_SECRET")

In [3]:
# Get the list of movies from the IMDB top 250
top_movies = pd.read_csv("../data/csv/top-250-movie-ratings.csv").drop(columns = ["Unnamed: 0"])

In [20]:
def random_date(start, end):
    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    random_second = randrange(int_delta)
    return (start + timedelta(seconds = random_second)).strftime('%m/%d/%Y')

d1 = datetime.strptime('1/1/1970', '%m/%d/%Y')
d2 = datetime.strptime('1/1/2010', '%m/%d/%Y')

### Aquire Data from the OMDB API

In [5]:
# Get the list of movie information as dataframes from the IMDB top 250
movie_info_dfs = []

for index, row in top_movies.iterrows():
    
    movie_title = row['Title'] # Get movie title
    try:

        # Get the movie information using the OMDB API
        movie_info = requests.get('http://www.omdbapi.com/', params = {
            'apikey': OMDB_SECRET,
            't': movie_title
        }).json()

        if movie_info['Response'] == 'True':
            # Convert to dataframe
            movie_info_df = pd.json_normalize(movie_info)

            # Add to list
            movie_info_dfs.append(movie_info_df)
        else:
            print(f"{index}. Information for '{movie_title}' not found.")
    except:
        print(f"{index}. Error requesting '{movie_title}'.")

66. Information for 'Oldeuboi' not found.
135. Information for 'Babam ve Oglum' not found.
143. Error requesting 'Casino'.
182. Information for 'Relatos salvajes' not found.
243. Information for 'The Nights of Cabiria' not found.


### Preprocess the Data

In [39]:
# Concatenate all dataframes into one
df = pd.concat(movie_info_dfs)

# Get rotten tomatoes rating
def get_rotten_tomatoes_rating(x):
    try:
        return x[1]['Value']
    except:
        return None
df['rotten_tomatoes'] = df['Ratings'].apply(get_rotten_tomatoes_rating)

# Remove columns we won't be needing
df = df.drop(columns = [
    'Response', 'totalSeasons', 'Website', 'DVD', 'imdbVotes', 'Production', 'Year', 'Ratings'
])

# Replace string N/A with None
df[df == 'N/A'] = None
df[df == 'None'] = None
df[df == ''] = None

# Rename columns
df.columns = [x.lower() for x in df.columns]
df.rename(columns = {
    'metascore': 'metacritic',
    'boxoffice': 'box_office',
    'imdbrating': 'imdb_rating',
    'writer': 'writers',
    'genre': 'genres',
    'director': 'directors',
    'imdbid': 'id'
    
}, inplace = True)

# Correct formattings and namings
df['runtime'] = df['runtime'].str.replace(' min', '').astype(float)
df['imdb_rating'] = df['imdb_rating'].astype(float)
df['rotten_tomatoes'] = df['rotten_tomatoes'].apply(lambda x: x[:2] if x is not None else None).astype(float)
df['box_office'] = df['box_office'].str.replace(r'[$,]', '', regex = True).astype(float)
df['released'] = pd.to_datetime(df['released'])
df['metacritic'] = df['metacritic'].astype(float)
df['writers'] = df['writers'].apply(lambda x: x.split(',')[0] if x is not None else None)

### Movies

In [115]:
df.reset_index(inplace = True, drop = True)

df['price'] = df['id'].apply(lambda x: random.randint(1, 50))
df['id'] = df.index

df.rename(columns = {
    'writers': 'writer'
}, inplace = True)

movie_cols = ['id', 'title', 'type', 'poster', 'writer', 'runtime', 'released', 'rated', 'country', 'language', 'imdb_rating', 'metacritic', 'rotten_tomatoes', 'box_office', 'price', 'plot']
movies = df[movie_cols]
movies.to_csv("../data/csv/movies.csv", index = False)
movies.head()

Unnamed: 0,id,title,type,poster,writer,runtime,released,rated,country,language,imdb_rating,metacritic,rotten_tomatoes,box_office,price,plot
0,0,The Shawshank Redemption,movie,https://m.media-amazon.com/images/M/MV5BMDFkYT...,Stephen King,142.0,1994-10-14,R,United States,English,9.3,80.0,91.0,28699976.0,30,Two imprisoned men bond over a number of years...
1,1,The Godfather,movie,https://m.media-amazon.com/images/M/MV5BM2MyNj...,Mario Puzo,175.0,1972-03-24,R,United States,"English, Italian, Latin",9.2,100.0,97.0,134966411.0,26,The Godfather follows Vito Corleone Don of the...
2,2,The Godfather: Part II,movie,https://m.media-amazon.com/images/M/MV5BMWMwMG...,Francis Ford Coppola,202.0,1974-12-18,R,United States,"English, Italian, Spanish, Latin, Sicilian",9.0,90.0,96.0,47834595.0,24,The early life and career of Vito Corleone in ...
3,3,The Dark Knight,movie,https://m.media-amazon.com/images/M/MV5BMTMxNT...,Jonathan Nolan,152.0,2008-07-18,PG-13,"United States, United Kingdom","English, Mandarin",9.0,84.0,94.0,534858444.0,45,When the menace known as the Joker wreaks havo...
4,4,12 Angry Men,movie,https://m.media-amazon.com/images/M/MV5BMWU4N2...,Reginald Rose,96.0,1957-04-10,Approved,United States,English,9.0,96.0,10.0,,40,The jury in a New York City murder trial is fr...


### People

In [41]:
people = []
people_id = 0

def get_people(names):
    global people_id

    for name in names.split(', '):
        people.append(( people_id, name, random.choice([0, 1]), None, random.choice(['American', 'Canadian']), None ))
        people_id += 1

_ = df.loc[~df['actors'].isna(), 'actors'].apply(get_people)
_ = df.loc[~df['directors'].isna(), 'directors'].apply(get_people)
people = pd.DataFrame(people, columns = ['id', 'full_name', 'is_married', 'email', 'nationality', 'birth_date'])

people['first_name'] = people['full_name'].apply(lambda x: x.split(' ')[0])
people['last_name'] = people['full_name'].apply(lambda x: x.split(' ')[1] if len(x.split()) == 2 else None)
people['email'] = people['id'].apply(lambda x: ''.join(random.choice(string.ascii_letters) for _ in range(random.randint(5, 10))) + '@gmail.com')
people['birth_date'] = people['id'].apply(lambda x: random_date(d1, d2))

people[['id', 'first_name', 'last_name', 'is_married', 'email', 'nationality', 'birth_date']].to_csv('../data/csv/people.csv', index = False)
people.head()

Unnamed: 0,id,full_name,is_married,email,nationality,birth_date,first_name,last_name
0,0,Tim Robbins,0,ZZJTJj@gmail.com,American,09/12/1985,Tim,Robbins
1,1,Morgan Freeman,1,eIlicrufbD@gmail.com,Canadian,05/13/1996,Morgan,Freeman
2,2,Bob Gunton,0,KXrQcLziQp@gmail.com,American,12/23/2008,Bob,Gunton
3,3,Marlon Brando,0,GAkeqwdJA@gmail.com,American,10/22/2006,Marlon,Brando
4,4,Al Pacino,0,JLydNSq@gmail.com,American,04/22/2008,Al,Pacino


### Casts

In [91]:
casts = []

for index, row in df.iterrows():
    if row['actors'] is not None:
        for name in  row['actors'].split(', '):
            person = people.loc[people['full_name'] == name, 'id']

            if person is not None:
                casts.append((
                    person.values[0],
                    row['id'],
                    name,
                    None,
                ))

casts = pd.DataFrame(casts, columns = ['people_id', 'movie_id', 'role', 'contract'])
casts.to_csv('../data/csv/casts.csv', index = False)
casts.head()

Unnamed: 0,people_id,movie_id,role,contract
0,0,0,Tim Robbins,
1,1,0,Morgan Freeman,
2,2,0,Bob Gunton,
3,3,1,Marlon Brando,
4,4,1,Al Pacino,


### Directors

In [92]:
directors = []

for index, row in df.iterrows():
    if row['directors'] is not None:
        for name in  row['directors'].split(', '):
            person = people.loc[people['full_name'] == name, 'id']

            if person is not None:
                directors.append((
                    person.values[0],
                    row['id'],
                    None,
                ))

directors = pd.DataFrame(directors, columns = ['people_id', 'movie_id', 'contract'])
directors.to_csv('../data/csv/directors.csv', index = False)
directors.head()

Unnamed: 0,people_id,movie_id,contract
0,956,200,
1,957,201,
2,884,202,
3,959,203,
4,960,204,


### Users

In [108]:
users = []

for index, person in people.iterrows():
    if person['id'] not in directors['people_id'] and person['id'] not in casts['people_id']:

        username = ''.join(random.choice(string.ascii_letters) for _ in range(random.randint(5, 12)))
        password = ''.join(random.choice(string.ascii_letters) for _ in range(random.randint(5, 10)))
        hashed_password = hashlib.sha1(password.encode('utf-8') + str(os.urandom(16)).encode('utf-8')).hexdigest()

        users.append((
            person['id'],
            username,
            hashed_password,
            random.randint(50, 100),
        ))

users = pd.DataFrame(users, columns = ['id', 'username', 'password', 'credit'])
users.to_csv('../data/csv/users.csv', index = False)
users.head()

Unnamed: 0,id,username,password,credit
0,600,yYCdSLOHbCi,480e738959634518ebbdaad07f8d2b878a294a3e,85
1,601,AKdFVftSw,ef3304c55c0b8a41c6cec3aaef09b84326c52c54,64
2,602,gCjibMZ,63ae543e7d9e5a04f13bc021cd70a818aa6b7bd3,57
3,603,VqJmrFYHmP,d12fa151fc765f08237621d6719135c69e9c5a1d,54
4,604,Nxoif,38b5a77fb0f9889e1426e30a1e58bc97ef351880,98


### Genres

In [111]:
genres = pd.DataFrame(
    set(', '.join(df['genres'].values).split(", ")),
    columns = ['genre']
)
genres['id'] = genres.index
genres[['id', 'genre']].to_csv('../data/csv/genres.csv', index = False)
genres

Unnamed: 0,genre,id
0,Animation,0
1,Sci-Fi,1
2,Mystery,2
3,Sport,3
4,Family,4
5,Film-Noir,5
6,Game-Show,6
7,Western,7
8,Romance,8
9,Music,9


### Movie Genres

In [112]:
movies_genres = []

for index, row in df.iterrows():
    for genre in row['genres'].split(", "):

        if genre in genres['genre'].values:
            movies_genres.append((
                row.id,
                genres.loc[genres['genre'] == genre, 'id'].values[0]
            ))        

movies_genres = pd.DataFrame(movies_genres, columns = ['movie_id', 'genre_id'])
movies_genres.to_csv('../data/csv/movies_genres.csv', index = False)
movies_genres

Unnamed: 0,movie_id,genre_id
0,0,20
1,1,19
2,1,20
3,2,19
4,2,20
...,...,...
613,243,16
614,243,19
615,244,16
616,244,20


### users_movies

In [135]:
n = 3
k = 50

user_ids = users.sample(k)['id'].values
movie_ids = movies.sample(k)['id'].values

users_movies = []

for i in range(n):
    for j in range(k):
        if (user_ids[j], movie_ids[j]) not in users_movies:
            users_movies.append((
                user_ids[j],
                movie_ids[j],
            ))

users_movies = pd.DataFrame(users_movies, columns = ['user_id', 'movie_id'])
users_movies.to_csv('../data/csv/users_movies.csv', index = False)
users_movies.head()

Unnamed: 0,user_id,movie_id
0,753,146
1,990,7
2,664,177
3,858,78
4,987,205


### Awards

In [190]:
# Get awards list
awards_list = []
for index, row in df.iterrows():
    if row['awards']:
        awards_list.extend(row['awards'].split('. '))
awards_list = list(set(awards_list))

awards = []
for index, cast in casts.iterrows():

    # 50% Chance to get award
    if random.random() > 0.7:
        awards.append((
            index,
            awards_list[index % len(awards_list)],  # title
            cast['people_id'],
            cast['movie_id'],
            random_date(d1, d2),    # issued_date
        ))

awards = pd.DataFrame(awards, columns = ['id', 'title', 'people_id', 'movie_id', 'issued_date'])
awards.to_csv('../data/csv/awards.csv', index = False)
awards.head()

Unnamed: 0,id,title,people_id,movie_id,issued_date
0,7,107 wins & 140 nominations total,7,2,08/08/1987
1,12,19 wins & 15 nominations total,12,4,10/11/1994
2,13,Nominated for 1 BAFTA Film Award5 wins & 2 nom...,13,4,12/29/2009
3,14,8 wins & 7 nominations total,14,4,11/10/1987
4,16,16 wins & 12 nominations total,16,5,06/09/2008


### purchases

In [214]:
k = 100

purchase_users = users.sample(100)['id'].values
purchase_movies = movies.sample(100)['id'].values

purchases = []
for i in range(k):
    purchases.append((
        purchase_movies[i],
        purchase_users[i],        
        random_date(d1, d2)
    ))

purchases = pd.DataFrame(purchases, columns = ['movie_id', 'user_id', 'purchase_date'])
purchases.to_csv('../data/csv/purchases.csv', index = False)
purchases.head()

Unnamed: 0,movie_id,user_id,purchase_date
0,59,775,07/19/1983
1,52,617,12/24/1993
2,29,624,10/09/2006
3,30,679,10/06/1999
4,11,782,09/18/1997


### rents

In [215]:
k = 100

rent_users = users.sample(100)['id'].values
rent_movies = movies.sample(100)['id'].values

rents = []
for i in range(k):
    rents.append((
        rent_movies[i],
        rent_users[i],        
        random_date(d1, d2),
        random_date(d1, d2)
    ))

rents = pd.DataFrame(rents, columns = ['movie_id', 'user_id', 'rented_date', 'returned_date'])
rents.to_csv('../data/csv/rents.csv', index = False)
rents.head()

Unnamed: 0,movie_id,user_id,rented_date,returned_date
0,63,735,11/22/1981,03/23/1991
1,68,757,11/25/1992,05/21/2006
2,4,761,01/26/1970,02/12/1972
3,222,900,12/19/1985,07/07/2000
4,115,692,07/22/1996,08/23/1998
