##### data conditioning notebook

* Student name: Jamel Dargan
* Student pace: online full-time
* Scheduled project review date/time: 
* Instructor name: Rafael Carrasco
* Blog post URL: DRAFT https://medium.com/@jameld.pro/data-baby-e6b493f3fa5a


# Data Conditioning

## Import necessary libraries

In [1]:
import os

import numpy as np
import pandas as pd

from glob import glob

In [2]:
# Project root that contains `zippedData/`. The default is the original
# author's local checkout; set the MOVIES_PROJECT_DIR environment variable
# to run the notebook on another machine without editing this cell.
project_dir = os.environ.get(
    "MOVIES_PROJECT_DIR",
    "c:/users/jd/flatiron/project01/dsc-mod-1-project-v2-1-online-ds-ft-120919/",
)

# set the current working directory (later cells use relative paths)
os.chdir(project_dir)

# print the current working directory
print(os.getcwd(), '\n')


c:\users\jd\flatiron\project01\dsc-mod-1-project-v2-1-online-ds-ft-120919 



## Inspect the file list

In [3]:
# get the list of gzipped data files using glob
# glob's result order depends on the file system, so sort it to keep the
# order in which dataframes and tables are created reproducible
csv_files = sorted(glob("zippedData/*gz"))

csv_files

['zippedData\\bom.movie_gross.csv.gz',
 'zippedData\\imdb.name.basics.csv.gz',
 'zippedData\\imdb.title.akas.csv.gz',
 'zippedData\\imdb.title.basics.csv.gz',
 'zippedData\\imdb.title.crew.csv.gz',
 'zippedData\\imdb.title.principals.csv.gz',
 'zippedData\\imdb.title.ratings.csv.gz',
 'zippedData\\rt.movie_info.tsv.gz',
 'zippedData\\rt.reviews.tsv.gz',
 'zippedData\\tmdb.movies.csv.gz',
 'zippedData\\tn.movie_budgets.csv.gz']

<h3 align='center'><font color='coral'>LOAD FILES AS DATAFRAMES</font></h3>

In [4]:
# Build a dictionary that maps each cleaned csv/tsv file name
# to the dataframe loaded from that file.
csv_files_dict = {}

for filename in csv_files:
    base_name = os.path.basename(filename)

    # clean the file name: drop the extensions, then turn the remaining
    # "-" and "." characters into "_" so the key is usable as a table name
    filename_cleaned = (
        base_name
        .replace(".csv", "")
        .replace(".tsv", "")
        .replace(".gz", "")
        .replace("-", "_")
        .replace(".", "_")
    )

    if ".tsv" in base_name:  # eg., `rt.movie_info.tsv`
        try:
            filename_df = pd.read_csv(
                filename, compression='gzip', sep="\t", index_col=0)
        except UnicodeDecodeError:  # i.e., `rt.reviews.tsv`
            # Only a decoding failure triggers the latin-1 retry; any other
            # error (missing file, parse error, interrupt) is left to surface.
            filename_df = pd.read_csv(
                filename, compression='gzip', sep="\t", encoding="latin-1", index_col=0)
    elif ".csv" in base_name:
        filename_df = pd.read_csv(filename, compression='gzip', header=0, index_col=0)
    else:
        # The glob pattern can match files that are neither csv nor tsv.
        # Skip them rather than storing the previous iteration's dataframe
        # under this file's name.
        print(f"skipping {filename}: not a .csv or .tsv file")
        continue

    csv_files_dict[filename_cleaned] = filename_df  # use cleaned file names

# view the created dictionary's keys (cleaned file names)
display(csv_files_dict.keys())

dict_keys(['bom_movie_gross', 'imdb_name_basics', 'imdb_title_akas', 'imdb_title_basics', 'imdb_title_crew', 'imdb_title_principals', 'imdb_title_ratings', 'rt_movie_info', 'rt_reviews', 'tmdb_movies', 'tn_movie_budgets'])

In [5]:
# Spot-check one of the loaded tables: look it up in `csv_files_dict`
# by its cleaned file name and keep a handle on it as `current_df`.
current_df = csv_files_dict['imdb_title_basics']

# A zero-row slice renders only the header, i.e. the column names.
display(current_df.head(0))


Unnamed: 0_level_0,primary_title,original_title,start_year,runtime_minutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


<h3 align='center'><font color='coral'>LOAD DATAFRAMES TO SQLITE DATABASE</font></h3>

## Load files into a sqlite database.

In [6]:
import sqlite3

In [7]:
# Open a connection to the sqlite file in the current directory
# (sqlite creates the file if it does not exist yet).
db_file = "movies_db.sqlite"
conn = sqlite3.connect(db_file)

#### One-off example: convert a single dataframe stored in the dictionary into a sqlite table

In [8]:
# this converts a dataframe to a sqlite table, with the name 'tmdb_movies'
# enabling sql to create the schema

#csv_files_dict['tmdb_movies'].to_sql("tmdb_movies", conn)

### Create a function to automate this conversion

In [9]:
# Write a function that converts a nested dataframe to a sqlite table
def create_sql_table_from_df(df, name, conn):
    """Store a dataframe in the database as the table `name`.

    The write uses ``if_exists='replace'``, so a table that already has
    this name is overwritten rather than causing an error. Any exception
    raised during the write is caught and printed instead of propagating,
    so a loop over many dataframes keeps going after a failure.

    Parameters
    ----------
    df : object with a ``to_sql`` method (a pandas DataFrame in this notebook)
        The data to store.
    name : str
        Name of the table to create or replace.
    conn : database connection
        Passed straight through to ``df.to_sql``.

    Returns
    -------
    None
        The outcome is reported only through printed messages.
    """
    try:
        df.to_sql(name, conn, if_exists='replace')
        print(f"Created table {name}")
    except Exception as err:
        # report the failure and the reason, one per line, without raising
        print(f"could not make table {name}", err, sep="\n")

In [10]:
# Push every loaded dataframe into the database,
# using its dictionary key as the table name.

for table_name in csv_files_dict:
    create_sql_table_from_df(csv_files_dict[table_name], table_name, conn)

Created table bom_movie_gross
Created table imdb_name_basics
Created table imdb_title_akas
Created table imdb_title_basics
Created table imdb_title_crew
Created table imdb_title_principals
Created table imdb_title_ratings
Created table rt_movie_info
Created table rt_reviews
Created table tmdb_movies
Created table tn_movie_budgets


In [11]:
# list the name of every table currently stored in the sql database
table_query = "select name from sqlite_master where type='table';"
conn.execute(table_query).fetchall()

[('clean_bom_tbl',),
 ('bom_movie_gross',),
 ('imdb_name_basics',),
 ('imdb_title_akas',),
 ('imdb_title_basics',),
 ('imdb_title_crew',),
 ('imdb_title_principals',),
 ('imdb_title_ratings',),
 ('rt_movie_info',),
 ('rt_reviews',),
 ('tmdb_movies',),
 ('tn_movie_budgets',)]

In [12]:
# smoke-test the database: fetch the first two rows of one table
cur = conn.cursor()
sample_rows = cur.execute('''SELECT * FROM tn_movie_budgets LIMIT 2;''').fetchall()
print(sample_rows)

[(1, 'Dec 18, 2009', 'Avatar', '$425,000,000', '$760,507,625', '$2,776,345,279'), (2, 'May 20, 2011', 'Pirates of the Caribbean: On Stranger Tides', '$410,600,000', '$241,063,875', '$1,045,663,875')]


In [13]:
# test query a sql table and wrap the results as a pandas dataframe
cur.execute('''SELECT * FROM rt_movie_info LIMIT 2;''')

# Each entry of `cur.description` describes one result column; its first
# field is the column name. Passing the names to the constructor, rather
# than assigning `df.columns` afterwards, also works when a query returns
# no rows (the assignment would fail with a length mismatch).
column_names = [x[0] for x in cur.description]
df = pd.DataFrame(cur.fetchall(), columns=column_names)
df.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One


## Add sqlite DB to a .gitignore file (file is too large to push to github)

In [14]:
# Make sure .gitignore excludes the unzipped data and sqlite files
# (files larger than 100 MB cannot be pushed to github from your computer).
#
# Only patterns that are not already listed are appended, so entries that
# already exist in .gitignore are preserved and re-running this cell is harmless.
gitignore_path = "./.gitignore"
ignore_patterns = [
    "*.sqlite",
    "zippedData/",
    "zippedData/*.csv",
    "zippedData/*.gz",
]

# read whatever is already in the file (if there is one)
existing_text = ""
if os.path.exists(gitignore_path):
    with open(gitignore_path) as f:
        existing_text = f.read()

already_listed = {line.strip() for line in existing_text.splitlines()}
missing_patterns = [p for p in ignore_patterns if p not in already_listed]

if missing_patterns:
    with open(gitignore_path, "a") as f:
        # start on a fresh line if the existing file lacks a trailing newline
        if existing_text and not existing_text.endswith("\n"):
            f.write("\n")
        for pattern in missing_patterns:
            f.write(pattern + "\n")  # one pattern per line

In [15]:
# release the database handles: the cursor first, then the connection
for handle in (cur, conn):
    handle.close()

### All tables are now in the sqlite database `movies_db.sqlite` and accessible via pandas as dataframes.

### Continue to the Clean and Transform notebook