## Our Project
#### Team Members: John Clark, Nicole Fejfar, Jason O'Day, Marianne Pagerit
#### Instructions to Graders:
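- In PostgreSQL, create a database called 'movies_db'.
- Create the tables in PostgreSQL using the 'schema.sql' file.
- Run the Jupyter Notebook code below.
- Note: the notebook imports `username` and `password` from a local 'config.py' file, so create that file with your PostgreSQL credentials before running the code.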


In [None]:
import pandas as pd
from sqlalchemy import create_engine
import requests

In [None]:
from config import username
from config import password

## The Numbers Website Scrape
Source: https://www.the-numbers.com/box-office-records/domestic/all-movies/cumulative/all-time-inflation-adjusted

In [None]:
# Page on The Numbers that lists the box office totals we want to collect
# for Hollywood movies.
url = (
    "https://www.the-numbers.com"
    "/box-office-records/domestic/all-movies/cumulative/all-time-inflation-adjusted"
)

# Browser-like request headers: mimicking a web browser lets us avoid a 403 error.
header = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"
    ),
    "X-Requested-With": "XMLHttpRequest",
}

In [None]:
# Imported here so this cell runs on its own; StringIO hands the HTML to pandas as a
# file-like object (passing the raw HTML string directly is deprecated in recent pandas).
from io import StringIO

# Request the first results page with the browser-like headers.
# The timeout stops the cell from hanging indefinitely if the site never responds.
r = requests.get(url, headers=header, timeout=30)

# Stop with a clear HTTP error (e.g. 403) instead of trying to parse an error page.
r.raise_for_status()

# Parse the HTML tables on the page into a list of DataFrames.
tables = pd.read_html(StringIO(r.text))

# Keep the first table as the box office DataFrame.
movie_numbers_df = tables[0]

# Preview the first rows.
movie_numbers_df.head()

In [None]:
# Imported here so this cell runs on its own; StringIO hands the HTML to pandas as a
# file-like object (passing the raw HTML string directly is deprecated in recent pandas).
from io import StringIO

# The first page is already in movie_numbers_df. Every further page of 100 movies is
# reached by appending a counter to the URL: /101, /201, ..., /3901.
page_frames = [movie_numbers_df]
for count in range(101, 4001, 100):
    url_string = f'{url}/{count}'
    req = requests.get(url_string, headers=header, timeout=30)
    # Stop with a clear HTTP error instead of parsing an error page.
    req.raise_for_status()
    tables = pd.read_html(StringIO(req.text))
    page_frames.append(tables[0])

# Combine all pages in one step. DataFrame.append no longer exists in pandas 2.0+, and
# appending inside the loop re-copied the whole DataFrame on every iteration.
# ignore_index gives the combined table one continuous row index.
movie_numbers_df = pd.concat(page_frames, ignore_index=True)

In [None]:
# Switch the scraped column headers to lowercase snake_case names.
box_office_column_names = {
    'Rank': 'rank',
    'Released': 'released',
    'Movie': 'movie',
    'Total Box Office': 'total_box_office',
}
movie_numbers_df = movie_numbers_df.rename(columns=box_office_column_names)

In [None]:
# Display the (rows, columns) dimensions of the combined box office DataFrame.
movie_numbers_df.shape

## Streaming Movie Platform CSV
Source: https://www.kaggle.com/ruchi798/movies-on-netflix-prime-video-hulu-and-disney?select=MoviesOnStreamingPlatforms_updated.csv

In [None]:
# Name of the streaming platforms CSV file (path is relative to the working directory).
stream_file = 'MoviesOnStreamingPlatforms_updated.csv'

# Read the CSV into a DataFrame.
stream_df = pd.read_csv(stream_file)

# Preview a single row.
stream_df.head(1)

In [None]:
# One mapping drives both steps: its keys are the CSV columns we keep (in this order),
# its values are the new lowercase column names.
stream_column_names = {
    'ID': 'id',
    'Title': 'title',
    'Year': 'released',
    'Age': 'suggested_viewing_age',
    'IMDb': 'imdb',
    'Rotten Tomatoes': 'rotten_tomatoes',
    'Netflix': 'netflix',
    'Hulu': 'hulu',
    'Prime Video': 'prime_video',
    'Disney+': 'disney+',
    'Directors': 'directors',
    'Genres': 'genres',
    'Language': 'language',
    'Runtime': 'runtime',
}

# Drop every column that is not listed above, working on a copy of the selection.
stream_cols = list(stream_column_names)
stream_df = stream_df[stream_cols].copy()

# Apply the new column names.
stream_df = stream_df.rename(columns=stream_column_names)

In [None]:
# Make the platform flags readable: 1 becomes 'Yes' and 0 becomes 'No'
# in each of the four streaming platform columns.
for platform in ('netflix', 'hulu', 'prime_video', 'disney+'):
    stream_df[platform] = stream_df[platform].replace(1, 'Yes').replace(0, 'No')

In [None]:
# Preview the first rows of the cleaned streaming DataFrame.
stream_df.head()

### Create database connection & load DataFrames into database

In [None]:
# Imported here so this cell runs on its own.
from urllib.parse import quote

# Build the connection string from the credentials imported from config.py.
# The old string contained the literal words "username:password", so the imported
# values were never used. Percent-encoding keeps characters such as '@', ':' or '/'
# inside a credential from being read as part of the URL structure.
connection_string = (
    f"{quote(username, safe='')}:{quote(password, safe='')}@localhost:5432/movies_db"
)
engine = create_engine(f'postgresql://{connection_string}')

In [None]:
# Write the box office DataFrame to the 'box_office' table, leaving out the DataFrame index.
# NOTE(review): if_exists='replace' drops and recreates an existing table, so a table
# built beforehand from schema.sql would be replaced - confirm that this is intended.
movie_numbers_df.to_sql(
    name='box_office',
    con=engine,
    if_exists='replace',
    index=False,
)

In [None]:
# Write the streaming DataFrame to the 'streaming' table, leaving out the DataFrame index.
# NOTE(review): if_exists='replace' drops and recreates an existing table, so a table
# built beforehand from schema.sql would be replaced - confirm that this is intended.
stream_df.to_sql(
    name='streaming',
    con=engine,
    if_exists='replace',
    index=False,
)