In [None]:
import gzip
import os
import re
import shutil
import sys
from getpass import getpass
from urllib.request import urlretrieve

import pandas as pd
from mysql.connector import Error, connect, errorcode

In [None]:
# Configure the MySQL connection

# The username and password are requested interactively when this cell runs,
# so no credentials are stored in the notebook.
CONFIG = dict(
    host="localhost",
    user=input("Enter username: "),
    password=getpass("Enter password: "),
)

# Name of the MySQL database used by the cells below
DB_NAME = "my_imdb"


# Download links of the datasets to fetch (one entry per table)
URLS = [
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
]


# CREATE TABLE statements, keyed by table name.
# Insertion order matters: title_basics must be created before title_ratings
# because title_ratings declares a foreign key that references it.
CREATES = {}
CREATES["title_basics"] = """
    CREATE TABLE title_basics (
        tconst varchar(20) NOT NULL,
        titleType varchar(20),
        primaryTitle varchar(1000),
        originalTitle varchar(1000),
        isAdult bool,
        startYear smallint,
        endYear smallint,
        runtimeMinutes int,
        genres varchar(255),
        PRIMARY KEY (tconst)
    )
    """

CREATES["title_ratings"] = """
    CREATE TABLE title_ratings (
        tconst varchar(20) NOT NULL,
        averageRating decimal(3, 1),
        numVotes int,
        PRIMARY KEY (tconst),
        FOREIGN KEY (tconst)
            REFERENCES title_basics(tconst)
    )
    """

# Parameterised INSERT statements, keyed by table name.
# Each statement carries one %s placeholder per column of its table.
INSERTS = {
    "title_basics": """
    INSERT INTO title_basics
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """,
    "title_ratings": """
    INSERT INTO title_ratings
    VALUES (%s, %s, %s)
    """,
}


In [None]:
# Function to prompt whether to skip overwriting an existing file
def check_exist(filename):
    """Ask whether an already-existing file should be left untouched.

    Parameters
    ----------
    filename : str
        Path of the file that is about to be written.

    Returns
    -------
    bool
        True when the file exists and the user answered "y" (skip the write).
        False when the file does not exist, or when the user answered "n"
        (go ahead and overwrite).
    """
    if not os.path.exists(filename):
        # Nothing to overwrite. Return an explicit False instead of None.
        return False

    prompt = f"{filename} already exist. Do you want to skip overwriting it (y/n)? "
    while True:
        confirm = input(prompt).strip().lower()
        if confirm == "y":
            return True
        if confirm == "n":
            return False
        # Ask again instead of calling sys.exit(): exiting on a typo aborts
        # the running cell, while the message itself invites a retry.
        print("Invalid input, please try again.")


In [None]:
# Create a Filename class to contain the attributes of the different filenames
class Filename:
    """Plain attribute holder for the names associated with one dataset.

    The constructor stores its six arguments unchanged as the attributes
    ``name``, ``url``, ``zip``, ``tsv``, ``csv`` and ``small``.
    """

    def __init__(self, name, url, zip, tsv, csv, small):
        # Dataset identity and its download link
        self.name, self.url = name, url
        # Downloaded archive and the extracted tab-separated file
        self.zip, self.tsv = zip, tsv
        # Converted csv file and its reduced-size variant
        self.csv, self.small = csv, small

In [None]:
# Build a dictionary of Filename objects from the download links
filenames = {}
for url in URLS:
    # Last path component of the link, e.g. "title.basics.tsv.gz".
    # Named zip_name rather than zip so the built-in zip() is not shadowed
    # for the rest of the kernel session.
    zip_name = url.rsplit("/", 1)[-1]

    # Expect "<part1>.<part2>.tsv.gz"; every dot is escaped so that only a
    # literal ".tsv.gz" suffix is accepted.
    tsv_matches = re.search(r"^(.+)\.(.+)\.tsv\.gz$", zip_name)
    if tsv_matches is None:
        # Fail with a readable message instead of a TypeError on None[...]
        raise ValueError(f"Unexpected dataset file name in URL: {url}")

    # "title.basics" -> "title_basics"; also used as the table name later on
    name = f"{tsv_matches[1]}_{tsv_matches[2]}"
    tsv = name + os.extsep + "tsv"
    csv = name + os.extsep + "csv"
    small = name + "_s" + os.extsep + "csv"
    filenames[name] = Filename(name, url, zip_name, tsv, csv, small)

In [None]:
# Download the dataset files from the IMDb website
for filename in filenames.values():
    # Keep an existing archive when the user asks to skip overwriting it
    if check_exist(filename.zip):
        continue
    try:
        local_filename, header = urlretrieve(filename.url, filename.zip)
        print(f"Downloaded '{local_filename}' to {os.getcwd()}")
    except OSError as e:
        # urlretrieve reports network/HTTP/file problems as URLError and its
        # subclasses, which derive from OSError. The MySQL Error class that
        # was caught here before never matches. Report filename.zip because
        # local_filename is not assigned when the download fails.
        print(f"Download '{filename.zip}' fail: {e}")


In [None]:
# Unzip the archives into renamed tsv files
for filename in filenames.values():
    # Keep an existing tsv when the user asks to skip overwriting it
    if check_exist(filename.tsv):
        continue
    try:
        with gzip.open(filename.zip, "rb") as f_in, open(filename.tsv, "wb") as f_out:
            # Stream in chunks instead of f_in.read(), which loads the whole
            # decompressed dataset into memory at once.
            shutil.copyfileobj(f_in, f_out)
        print(f"Extracted {filename.tsv} to {os.getcwd()}")
    except (OSError, EOFError) as e:
        # gzip signals missing/invalid archives with OSError and truncated
        # ones with EOFError. The MySQL Error class never matches here.
        print(f"Extract {filename.zip} Failed: {e}")

In [None]:
# Data cleaning and conversion of the tsv files to csv
for filename in filenames.values():
    # Ask before doing any work: parsing and repairing the full tsv is the
    # expensive part and is pointless when the existing csv is kept.
    if check_exist(filename.csv):
        continue

    imdb_table = pd.read_table(filename.tsv, sep="\t")

    # Data cleaning for title_basics
    if filename.name == "title_basics":

        # Locate all rows whose primaryTitle still contains a tab, i.e. two
        # fields that were read as one. "== True" maps missing titles (NaN
        # from str.contains) to False.
        title_issue_df = imdb_table[imdb_table["primaryTitle"].str.contains(r".+\t.+") == True]

        # If rows with primaryTitle issues exist
        if title_issue_df.shape[0] > 0:
            rows_fixed = 0
            for index, row in title_issue_df.iterrows():
                values = row.values.flatten().tolist()

                # Split the merged string back into two columns; everything
                # after it moves one position to the right again.
                clean_titles = values[2].split("\t")
                values[2] = clean_titles[0]
                values.insert(3, clean_titles[1])

                # Remove the now superfluous NaN value at the end
                values.pop()

                # Replace the row in the table
                imdb_table[imdb_table["tconst"] == values[0]] = values
                rows_fixed += 1
            print(f"Fixed {rows_fixed} row")

    # Export csv
    try:
        imdb_table.to_csv(filename.csv, index=False)
        print(f"Converted {filename.tsv} to {filename.csv}")
    except Exception as e:
        # A bare "except:" would also swallow KeyboardInterrupt and hide the
        # cause; keep the best-effort behaviour but say what went wrong.
        print(f"Convert {filename.tsv} failed: {e}")

In [None]:
# Optional
# Reduce the number of rows in each file and convert to csv
for filename in filenames.values():
    # From here on the csv attribute points at the reduced file, whether it
    # is rewritten below or an existing copy is kept.
    filename.csv = filename.small

    # Ask before parsing the full tsv; the read is wasted work when skipping.
    if check_exist(filename.small):
        continue

    imdb_table = pd.read_table(filename.tsv, sep="\t")
    # Plain string comparison on tconst keeps ids up to "tt0000100"
    imdb_table_s = imdb_table[imdb_table["tconst"] <= "tt0000100"]
    try:
        imdb_table_s.to_csv(filename.small, index=False)
        print(f"Converted {filename.tsv} to {filename.small}")
    except Exception as e:
        # A bare "except:" would also swallow KeyboardInterrupt and hide the
        # cause; keep the best-effort behaviour but say what went wrong.
        print(f"Convert {filename.tsv} failed: {e}")


In [None]:
# Read the csv files and convert missing values to None for MySQL
imdb_df = {}
for filename in filenames.values():
    temp_df = pd.read_csv(filename.csv, index_col=False)
    if filename.name == "title_basics":

        # Turn the \N placeholder into a missing value. The pattern is
        # anchored so only cells that are exactly \N match; unanchored, any
        # cell merely containing "\N" would be wiped. NaN is used as the
        # replacement because an explicit value=None is interpreted
        # differently across pandas versions.
        temp_df = temp_df.replace(r"^\\N$", float("nan"), regex=True)

        # Replace NaN with None. Casting to object first guarantees that
        # None is stored as None rather than being coerced back to NaN in
        # numeric columns.
        temp_df = temp_df.astype(object).where(pd.notnull(temp_df), None)

    imdb_df[filename.name] = temp_df


In [None]:
# Connect to the database, creating it first when it does not exist
cnx = connect(**CONFIG)
cursor = cnx.cursor()

try:
    # Select the database
    cursor.execute(f"USE {DB_NAME}")
    print(f"Connected to {DB_NAME} database")
except Error as e:
    if e.errno != errorcode.ER_BAD_DB_ERROR:
        # Any other failure leaves no database selected, so every later cell
        # would fail as well. Report the cause and stop here.
        print(f"Failed to connect to {DB_NAME} database: {e}")
        raise

    print(f"Database {DB_NAME} does not exists.")

    # Create the database if it does not already exist, then select it
    cursor.execute(f"CREATE DATABASE {DB_NAME}")
    print(f"{DB_NAME} database created.")
    cnx.database = DB_NAME


In [None]:
# Create every table listed in CREATES, in the order the dictionary defines
for table, create_query in CREATES.items():
    try:
        cursor.execute(create_query)
    except Error as e:
        # An existing table is reported differently from any other failure
        already_there = e.errno == errorcode.ER_TABLE_EXISTS_ERROR
        print(f"{table} already exist" if already_there else f"Fail to create {table}")
    else:
        print(f"Created table {table}")


In [None]:
# Insert the rows of each DataFrame into the table of the same name
for table, insert_query in INSERTS.items():
    insert_rows = 0
    skipped_rows = 0
    try:
        for index, row in imdb_df[table].iterrows():
            insert_data = tuple(row)
            try:
                cursor.execute(insert_query, insert_data)
            except Error as e:
                if e.errno == errorcode.ER_DUP_ENTRY:
                    # Row is already present: skip it and keep going. With the
                    # handler around the whole loop, the first duplicate
                    # silently ended the load of the entire table.
                    skipped_rows += 1
                    continue
                # Any other error stops this table, as before
                print(f"Insert failed at row {index}: {insert_data} ({e})")
                break
            insert_rows += 1
    finally:
        # One commit per table instead of one per row. Running it in
        # "finally" keeps the rows inserted so far even if the loop is
        # interrupted.
        cnx.commit()
    if skipped_rows:
        print(f"Skipped {skipped_rows} duplicate rows in {table}")
    print(f"Inserted {insert_rows} rows to {table}")

In [None]:
# Close the MySQL connection
# Always run this block when finished with the program
# The parentheses matter: a bare "cursor.close" only looks the method up
# and never calls it, leaving the cursor and connection open.
cursor.close()
cnx.close()