In [9]:
import re
import time
from datetime import date, datetime, timedelta

import mysql.connector
import requests
from bs4 import BeautifulSoup as BS
from mysql.connector import errorcode

import config

# Browser-like User-Agent header (desktop Chrome on macOS); presumably meant to be
# sent with the scraper's HTTP requests -- TODO confirm where it is used.
headers = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
# Base URL of the scraped site; the scraping functions append relative paths to it.
numbers_root = 'https://www.the-numbers.com'


In [2]:
def connect_to_AWS():
    """Open a MySQL connection using the credentials stored in ``config``.

    No database is selected by this call.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
    """
    credentials = {
        'host': config.host,
        'user': config.user,
        'passwd': config.password,
    }
    return mysql.connector.connect(**credentials)

def close_connections():
    """Close the module-level ``cursor`` and then the ``conn`` connection.

    The connection is closed even if closing the cursor raises, so a cursor
    error cannot leave the connection open.
    """
    try:
        cursor.close()
    finally:
        conn.close()
    
# Open the shared connection and cursor; the table helpers in this notebook
# read `cursor` and `conn` as module-level globals.
conn = connect_to_AWS()
cursor = conn.cursor()
# Name of the database this notebook selects (and creates if it is missing).
db_name = 'Movies'

In [42]:
# Teardown: closes the shared cursor and connection. The cells that follow still use
# `cursor` and `conn`, so run this only after all database work is finished.
close_connections()

In [3]:
## Function to create new DB
def create_database(cursor, database_name):
    """Create ``database_name`` with a default ``utf8`` character set.

    Args:
        cursor: Open MySQL cursor used to execute the statement.
        database_name: Name of the database to create. It is interpolated
            directly into the SQL text (identifiers cannot be bound as query
            parameters), so it must come from trusted code, not user input.

    Raises:
        SystemExit: With status 1 when MySQL rejects the statement; the error
            is printed first.
    """
    try:
        cursor.execute(
            "CREATE DATABASE {} DEFAULT CHARACTER SET 'utf8'".format(database_name))
    except mysql.connector.Error as err:
        print("Failed creating database: {}".format(err))
        # Raise SystemExit explicitly instead of calling the interactive
        # ``exit`` helper: that helper is meant for the interpreter prompt and
        # is not guaranteed to stop execution in every front end, which would
        # let the caller carry on as if the database had been created.
        raise SystemExit(1)

## Try to use the database we are creating.
## If it cannot be selected because it does not exist yet, create it and switch to it.
## Any other MySQL error is printed and stops execution.
try:
    cursor.execute("USE {}".format(db_name))
except mysql.connector.Error as err:
    if err.errno == errorcode.ER_BAD_DB_ERROR:
        # Only the "unknown database" error means the database is missing, so
        # the message is printed here rather than for every failure.
        print("Database {} does not exists.".format(db_name))
        create_database(cursor, db_name)
        print("Database {} created successfully.".format(db_name))
        conn.database = db_name
    else:
        print(err)
        # Explicit SystemExit rather than the interactive ``exit`` helper, so
        # execution reliably stops here.
        raise SystemExit(1)
        
# Create every table described by a {table name: CREATE TABLE statement} dictionary.
def create_table(dict_of_tables):
    """Run each CREATE TABLE statement and print a one-line status per table.

    Args:
        dict_of_tables: Mapping of table name to the SQL statement that
            creates that table.

    For each table the line ``Creating table <name>: `` is printed, followed by
    ``OK``, ``already exists.`` or the MySQL error message. Errors are reported
    but not re-raised, so the remaining tables are still attempted. Uses the
    module-level ``cursor``.
    """
    for table_name, table_query in dict_of_tables.items():
        print("Creating table {}: ".format(table_name), end='')
        try:
            cursor.execute(table_query)
        except mysql.connector.Error as err:
            already_there = err.errno == errorcode.ER_TABLE_EXISTS_ERROR
            print("already exists." if already_there else err.msg)
            continue
        print("OK")
        
def drop_tables():
    """Drop the notebook's tables if they exist, then commit.

    ``IF EXISTS`` makes the call safe on a fresh database or when only some of
    the tables have been created, so the notebook can be re-run from the top.
    Uses the module-level ``cursor`` and ``conn``.
    """
    # Same order as before: the junction table goes first (presumably because
    # it references the other two -- confirm against its definition).
    for table_name in ('mg_junction', 'movies', 'genres'):
        cursor.execute("DROP TABLE IF EXISTS {}".format(table_name))
    conn.commit()

In [4]:
# Maps table name -> CREATE TABLE statement; passed to create_table().
TABLES = {}
# The money columns are BIGINT: a signed MySQL INT tops out at 2,147,483,647,
# which the largest worldwide box-office totals exceed. CREATE TABLE does not
# alter a table that already exists, so an existing `movies` table has to be
# dropped and recreated to pick up this definition.
TABLES['movies'] = """CREATE TABLE movies (
        movieId INT NOT NULL AUTO_INCREMENT,
        movieTitle varchar(255),
        movieDomGross BIGINT,
        movieIntGross BIGINT,
        movieBudget BIGINT,
        movieRunTime REAL,
        movieAgeRating varchar(10),
        movieReleaseDate DATE,
        movieProdMethod varchar(30),
        movieGenre varchar(20),
        movieType varchar(30),
        PRIMARY KEY (movieId),
        UNIQUE KEY(movieTitle)
    ) ENGINE=InnoDB"""

In [4]:
# Destructive: drops the existing tables (and the data in them) and commits.
drop_tables()

In [5]:
# Create the tables defined in TABLES; a table that already exists is reported
# and left untouched.
create_table(TABLES)
# cursor.execute("""ALTER TABLE `Movies`.`movies` AUTO_INCREMENT = 101;""")
conn.commit()

Creating table movies: already exists.


In [6]:
def get_soup(url):
    """Fetch ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The parsed page, or ``None`` when the page could not be fetched (a
        non-200 status or a ``requests`` error). The failure is printed, not
        raised, so callers must check for ``None``.

    Raises:
        KeyboardInterrupt: Re-raised after printing a message, so interrupting
            the kernel actually stops a scraping loop.

    Every non-interrupted call sleeps for one second before returning, to space
    out successive requests.
    """
    # Bound up front so every failure path below returns None instead of
    # hitting an unbound local at the final return.
    soup = None
    try:
        # Send the notebook's browser-like User-Agent with the request.
        page = requests.get(url, headers=headers, timeout=5)
        if page.status_code != 200:
            print(page.status_code)
        else:
            soup = BS(page.content, 'html.parser')
    except requests.ConnectionError as e:
        print("OOPS!! Connection Error. Make sure you are connected to Internet. Technical Details given below.\n")
        print(str(e))
    except requests.Timeout as e:
        print("OOPS!! Timeout Error")
        print(str(e))
    except requests.RequestException as e:
        print("OOPS!! General Error")
        print(str(e))
    except KeyboardInterrupt:
        print("Someone closed the program")
        # Swallowing the interrupt would leave the caller's loop running.
        raise

    time.sleep(1)
    return soup

In [59]:
def str_to_int(string):
    """Convert a formatted dollar amount such as ``'$856,576,358'`` to an int.

    Thousands separators, a leading or trailing ``$``, and surrounding
    whitespace (including non-breaking spaces from scraped HTML) are ignored.

    Raises:
        ValueError: If what remains is not an integer (e.g. ``'$20 million'``).
    """
    # Strip whitespace before the '$': strip('$') alone leaves the sign in
    # place whenever any whitespace surrounds it.
    cleaned = string.replace(',', '').strip().strip('$')
    return int(cleaned)
    
def str_to_date(string):
    """Parse a date written like ``'Mar 8, 2019'`` into a ``datetime.date``.

    Raises:
        ValueError: If ``string`` does not match the ``"%b %d, %Y"`` format.
    """
    parsed = datetime.strptime(string, "%b %d, %Y")
    return parsed.date()

def str_to_time(string):
    """Convert a runtime such as ``'162 minutes'`` to an integer minute count.

    Accepts an integer optionally followed by ``min``, ``mins``, ``minute`` or
    ``minutes`` (any letter case, optional trailing period), with optional
    surrounding whitespace.

    Raises:
        ValueError: If ``string`` is not in that form.
    """
    # str.strip('minutes') removes any of the characters m/i/n/u/t/e/s from
    # both ends rather than the word itself, so the unit is matched explicitly.
    match = re.fullmatch(
        r'\s*([+-]?\d+)\s*(?:min(?:ute)?s?\.?)?\s*', string, flags=re.IGNORECASE)
    if match is None:
        raise ValueError("not a runtime in minutes: {!r}".format(string))
    return int(match.group(1))
    
def get_from_pages(page_start,page_end):
    """Walk budget listing pages ``page_start``..``page_end`` (1-based, inclusive).

    For each listing page the rows are fetched with ``get_movie_rows`` and the
    detail page of the first two rows is fetched with ``get_movie_info``.

    NOTE(review): the fetched details are discarded and nothing is returned;
    the two-row limit looks like a debugging aid -- confirm before relying on
    this function for a full scrape.
    """
    listing_url = numbers_root + "/movie/budgets/all"
    for page_index in range(page_start - 1, page_end):
        # Page suffixes are '/001', '/101', '/201', ... (presumably the rank of
        # the first row on each page -- confirm against the site).
        page_suffix = '/{}01'.format(page_index)
        rows = get_movie_rows(listing_url + page_suffix)
        for row in rows[:2]:
            # row[1] is the movie's relative URL.
            get_movie_info(row[1])

def get_movie_rows(page_url):
    """Scrape one budget listing page into a list of per-movie tuples.

    Args:
        page_url: Absolute URL of the listing page.

    Returns:
        A list of ``(release_date, movie_href, amount_1, amount_2, amount_3)``
        tuples, one per table row: a ``datetime.date``, the movie's relative
        URL, and the 2nd-4th ``td.data`` cells of the row as ints (presumably
        budget, domestic gross and worldwide gross -- confirm against the page
        layout). An empty list is returned when the page could not be fetched.

    Raises:
        ValueError: If a date or amount cell cannot be parsed.
    """
    soup = get_soup(page_url)
    if soup is None:
        return []

    movie_fins = soup.select('tr td.data')
    movie_exts = soup.select('tr td b a')
    movie_release = soup.select('tr td a')

    # The indexing below takes four td.data cells and two links per row. Size
    # the loop from what was actually found instead of assuming 100 rows, so a
    # short (e.g. final) page does not raise IndexError.
    row_count = min(len(movie_exts), len(movie_fins) // 4, len(movie_release) // 2)

    movie_tuples = []
    for i in range(row_count):
        fin_start = i * 4
        movie_tuples.append((str_to_date(movie_release[i * 2].text),
                             movie_exts[i]['href'],
                             str_to_int(movie_fins[fin_start + 1].text),
                             str_to_int(movie_fins[fin_start + 2].text),
                             str_to_int(movie_fins[fin_start + 3].text)))
    return movie_tuples

## Get runtime, MPAA Rating, Production method, genre, and creative type
def get_movie_info(movie_ext):
    """Scrape a movie's detail page for its classification fields.

    Args:
        movie_ext: The movie's URL path relative to ``numbers_root``.

    Returns:
        A ``(runtime, rating, prod_method, genre, c_type)`` tuple. Each field
        is the link text found on the page, or ``None`` when no matching link
        exists or the page could not be fetched. ``runtime`` is not extracted
        yet and is always ``None``.
    """
    runtime = rating = prod_method = genre = c_type = None

    soup = get_soup(numbers_root + movie_ext)
    if soup is None:
        return (runtime, rating, prod_method, genre, c_type)

    for link in soup.select('tr td a'):
        # Stop scanning once every link-derived field has been found.
        if rating and prod_method and genre and c_type:
            break

        # Anchors without an href (e.g. named anchors) must not raise KeyError.
        href = link.get('href', '')

        if href.startswith('/market/mpaa-rating/'):
            rating = link.text
        elif href.startswith('/market/genre/'):
            genre = link.text
        elif href.startswith('/market/production-method/'):
            prod_method = link.text
        elif href.startswith('/market/creative-type/'):
            c_type = link.text

    return (runtime, rating, prod_method, genre, c_type)

In [60]:
# Smoke test: process only the first listing page.
# NOTE(review): the saved output below includes "found all" lines that nothing in the
# code shown here prints -- it appears to come from an earlier version; re-run to refresh.
get_from_pages(1,1)

PG-13
Action
Animation/Live Action
Science Fiction
found all
PG-13
Adventure
Live Action
Fantasy
found all


In [106]:
# Rebuild the schema: drop the existing tables, then recreate those defined in TABLES.
# NOTE(review): the saved output lists genres and mg_junction, which the TABLES
# dictionary visible in this notebook does not define -- presumably from an earlier
# run; confirm.
drop_tables()
create_table(TABLES)

Creating table movies: OK
Creating table genres: OK
Creating table mg_junction: OK


In [17]:
# Quick manual checks of the string-parsing helpers; expected values are in the
# output below.
# print(string_to_int('$20 million'))
print(str_to_int('$856,576,358'))
print(str_to_date('Mar 8, 2019'))
print(str_to_time('162 minutes'))

856576358
2019-03-08
162
