##### movies-clean-transform notebook
***

<h1>Clean and Transform</h1>

### The purpose of this notebook is to:
    1. View data types
    2. Convert data types where needed
    3. Deal with missing values
    4. Inspect table relationships
    5. Potentially:
        a. Create derived values where advantageous
        b. Make initial joins
        c. Drop irrelevant tables

***

#### import required libraries

In [1]:
import os # for setting the current directory

import numpy as np
import pandas as pd

import sqlite3

import pandasql

In [2]:
# set the current working directory
os.chdir("c:/users/jd/flatiron/project01/dsc-mod-1-project-v2-1-online-ds-ft-120919/")

# print the current working directory
print(os.getcwd())

c:\users\jd\flatiron\project01\dsc-mod-1-project-v2-1-online-ds-ft-120919


#### connect to and preview sqlite database

In [3]:
# connect to sql movies_db data source and instantiate a cursor
conn = sqlite3.connect("movies_db.sqlite")
cur = conn.cursor()

#### import helper functions

In [4]:
from importlib import reload

import helper_functions as hf

In [5]:
# `helper_functions` includes `get_table_list(conn)`, 
# `load_table(conn, table_name)` and `convert_dollars_to_int(df, col)
# note: `get_table_list(conn)` result the same as the above `table_list`
reload(hf)

<module 'helper_functions' from 'c:\\users\\jd\\flatiron\\project01\\dsc-mod-1-project-v2-1-online-ds-ft-120919\\helper_functions.py'>

In [6]:
# function to preview all tables from sqlite_master
# or a sub - set entered as a list
def preview_tables(conn, tables='all'):
    all_tables = hf.get_table_list(conn)
    if tables=='all':
        final_table_names = all_tables
        
    elif type(tables) == list:
        final_table_names = [t for t in all_tables if t in tables]
    
    for table_name in final_table_names:
        print(f"Showing Table: {table_name}")
        query = f"select * from {table_name};"
        df = pd.read_sql(query, conn)
        display(df.head(2))
        display(df.info())
        print("-"*100)
    print("finished")
    return None

<h3 align='center'><font color='coral'>INSPECT TABLES AND DATA TYPES</font></h3>

In [7]:
preview_tables(conn)

Showing Table: clean_bom_tbl


Unnamed: 0,index,title,studio,domestic_gross,foreign_gross,year,Total_gross
0,0,Toy Story 3,BV,415000000.0,652000000.0,2010,1067000000.0
1,1,Alice in Wonderland (2010),BV,334200000.0,691300000.0,2010,1025500000.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 7 columns):
index             3387 non-null int64
title             3387 non-null object
studio            3382 non-null object
domestic_gross    3359 non-null float64
foreign_gross     2037 non-null float64
year              3387 non-null int64
Total_gross       2009 non-null float64
dtypes: float64(3), int64(2), object(2)
memory usage: 185.4+ KB


None

----------------------------------------------------------------------------------------------------
Showing Table: bom_movie_gross


Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
title             3387 non-null object
studio            3382 non-null object
domestic_gross    3359 non-null float64
foreign_gross     2037 non-null object
year              3387 non-null int64
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


None

----------------------------------------------------------------------------------------------------
Showing Table: imdb_name_basics


Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606648 entries, 0 to 606647
Data columns (total 6 columns):
nconst                606648 non-null object
primary_name          606648 non-null object
birth_year            82736 non-null float64
death_year            6783 non-null float64
primary_profession    555308 non-null object
known_for_titles      576444 non-null object
dtypes: float64(2), object(4)
memory usage: 27.8+ MB


None

----------------------------------------------------------------------------------------------------
Showing Table: imdb_title_akas


Unnamed: 0,title_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331703 entries, 0 to 331702
Data columns (total 8 columns):
title_id             331703 non-null object
ordering             331703 non-null int64
title                331703 non-null object
region               278410 non-null object
language             41715 non-null object
types                168447 non-null object
attributes           14925 non-null object
is_original_title    331678 non-null float64
dtypes: float64(1), int64(1), object(6)
memory usage: 20.2+ MB


None

----------------------------------------------------------------------------------------------------
Showing Table: imdb_title_basics


Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
tconst             146144 non-null object
primary_title      146144 non-null object
original_title     146123 non-null object
start_year         146144 non-null int64
runtime_minutes    114405 non-null float64
genres             140736 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


None

----------------------------------------------------------------------------------------------------
Showing Table: imdb_title_crew


Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 3 columns):
tconst       146144 non-null object
directors    140417 non-null object
writers      110261 non-null object
dtypes: object(3)
memory usage: 3.3+ MB


None

----------------------------------------------------------------------------------------------------
Showing Table: imdb_title_principals


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028186 entries, 0 to 1028185
Data columns (total 6 columns):
tconst        1028186 non-null object
ordering      1028186 non-null int64
nconst        1028186 non-null object
category      1028186 non-null object
job           177684 non-null object
characters    393360 non-null object
dtypes: int64(1), object(5)
memory usage: 47.1+ MB


None

----------------------------------------------------------------------------------------------------
Showing Table: imdb_title_ratings


Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73856 entries, 0 to 73855
Data columns (total 3 columns):
tconst           73856 non-null object
averagerating    73856 non-null float64
numvotes         73856 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.7+ MB


None

----------------------------------------------------------------------------------------------------
Showing Table: rt_movie_info


Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
id              1560 non-null int64
synopsis        1498 non-null object
rating          1557 non-null object
genre           1552 non-null object
director        1361 non-null object
writer          1111 non-null object
theater_date    1201 non-null object
dvd_date        1201 non-null object
currency        340 non-null object
box_office      340 non-null object
runtime         1530 non-null object
studio          494 non-null object
dtypes: int64(1), object(11)
memory usage: 146.4+ KB


None

----------------------------------------------------------------------------------------------------
Showing Table: rt_reviews


Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54432 entries, 0 to 54431
Data columns (total 8 columns):
id            54432 non-null int64
review        48869 non-null object
rating        40915 non-null object
fresh         54432 non-null object
critic        51710 non-null object
top_critic    54432 non-null int64
publisher     54123 non-null object
date          54432 non-null object
dtypes: int64(2), object(6)
memory usage: 3.3+ MB


None

----------------------------------------------------------------------------------------------------
Showing Table: tmdb_movies


Unnamed: 0,index,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26517 entries, 0 to 26516
Data columns (total 10 columns):
index                26517 non-null int64
genre_ids            26517 non-null object
id                   26517 non-null int64
original_language    26517 non-null object
original_title       26517 non-null object
popularity           26517 non-null float64
release_date         26517 non-null object
title                26517 non-null object
vote_average         26517 non-null float64
vote_count           26517 non-null int64
dtypes: float64(2), int64(3), object(5)
memory usage: 2.0+ MB


None

----------------------------------------------------------------------------------------------------
Showing Table: tn_movie_budgets


Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
id                   5782 non-null int64
release_date         5782 non-null object
movie                5782 non-null object
production_budget    5782 non-null object
domestic_gross       5782 non-null object
worldwide_gross      5782 non-null object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


None

----------------------------------------------------------------------------------------------------
finished


#### Preview suggests columns of interest (based on our questions) regarding box office performance for titles, studios, and genres
   
    >  bom_movie_gross: ['title', 'studio', 'domestic_gross', 'foreign_gross', 'year']
    >  imdb_title_basics: ['primary_title', 'start_year', 'genres']
    >  rt_movie_info: ['genre', 'theater_date', 'currency', 'box_office']
    >  tmdb_movies: ['genre_ids', 'original_title', 'release_date']
    >  tn_movie_budgets: ['release_date', 'movie', 'production_budget', 'domestic_gross', 'worldwide_gross']

#### Cleaning Notes:
* bom_movie_gross | 'domestic_gross' is a REAL number, while 'foreign_gross' is TEXT; table has no foreign keys (unless title names happen to match exactly)
* imdb_title_basics | 'genres' values are "," separated
* rt_movie_info | 'box_office' is TEXT; 'genre' values are "|" separated
* tmdb_movies | 'genre_ids' are TEXT
* tn_movie_budgets | 'production_budget', 'domestic_gross', and 'worldwide_gross' all each TEXT
* all "*date*" values are text, while "*year*" values are of type INTEGER

#### We will review and clean these tables. in turn, below.

<h2 align='center'><font color='chocolate'>SEQUENTIALLY REVIEW, CLEAN, AND TRANSFORM</font></h2>

### bom_movie_gross

In [8]:
preview_tables(conn, ['bom_movie_gross'])

Showing Table: bom_movie_gross


Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
title             3387 non-null object
studio            3382 non-null object
domestic_gross    3359 non-null float64
foreign_gross     2037 non-null object
year              3387 non-null int64
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


None

----------------------------------------------------------------------------------------------------
finished


#### Change `foreign_gross` data type to match `domestic_gross`

### Connect `bom_movie_gross`

In [9]:
# connect to the table and view in pandas for cleaning
cur.execute('''SELECT *
                    FROM bom_movie_gross
                    ;''')

clean_bom_df = pd.DataFrame(cur.fetchall())
clean_bom_df.columns = [x[0] for x in cur.description]
display(clean_bom_df.dtypes)
clean_bom_df.head(3)

title              object
studio             object
domestic_gross    float64
foreign_gross      object
year                int64
dtype: object

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010


### Convert  `clean_bom_df`

In [10]:
# convert `foreign_gross` to remove commas and to match `domestic_gross` data type
clean_bom_df['foreign_gross'] = clean_bom_df['foreign_gross'].str.replace(',', '').astype(float)

# add a `Total_gross` column for EDA
clean_bom_df['Total_gross'] = clean_bom_df['domestic_gross'] + clean_bom_df['foreign_gross']

# and view a sample
clean_bom_df.loc[clean_bom_df.title == 'Alice in Wonderland (2010)', :]

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,Total_gross
1,Alice in Wonderland (2010),BV,334200000.0,691300000.0,2010,1025500000.0


### Add `clean_bom_df` to sqlite

In [11]:
# add the new df as a table to the sqlite database
clean_bom_df.to_sql('clean_bom_tbl', conn, if_exists='replace')

# view names of all tables in the sql database to verify operation
conn.execute("select name from sqlite_master where type='table';").fetchall()

[('bom_movie_gross',),
 ('imdb_name_basics',),
 ('imdb_title_akas',),
 ('imdb_title_basics',),
 ('imdb_title_crew',),
 ('imdb_title_principals',),
 ('imdb_title_ratings',),
 ('rt_movie_info',),
 ('rt_reviews',),
 ('tmdb_movies',),
 ('tn_movie_budgets',),
 ('clean_bom_tbl',)]

### imdb_title_basics

In [12]:
preview_tables(conn, ['imdb_title_basics'])

Showing Table: imdb_title_basics


Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
tconst             146144 non-null object
primary_title      146144 non-null object
original_title     146123 non-null object
start_year         146144 non-null int64
runtime_minutes    114405 non-null float64
genres             140736 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


None

----------------------------------------------------------------------------------------------------
finished


### Connect `imdb_title_basics`

In [14]:
# connect to the table and view in pandas for cleaning
cur.execute('''SELECT *
                    FROM imdb_title_basics
                    ;''')

clean_imdb_title_df = pd.DataFrame(cur.fetchall())
clean_imdb_title_df.columns = [x[0] for x in cur.description]
display(clean_imdb_title_df.dtypes)
clean_imdb_title_df.head(3)

tconst              object
primary_title       object
original_title      object
start_year           int64
runtime_minutes    float64
genres              object
dtype: object

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama


### Extract and expand `clean_imdb_title_df` genres

In [17]:
clean_imdb_title_df.genres.unique()

array(['Action,Crime,Drama', 'Biography,Drama', 'Drama', ...,
       'Music,Musical,Reality-TV', 'Animation,Crime',
       'Adventure,History,War'], dtype=object)

In [18]:
# add the genres to a list
# initialize and empty list
genre_list = []

# convert the `genres` series arrays to nested lists
for row in clean_imdb_title_df.genres:
    genre_list.append(str(row).split(","))

print("nested:\n", genre_list[:5])
print("\nfirst sublist item:", genre_list[1][0])

# flatten the list
flat_genres = []
for sublist in genre_list:
    for item in sublist:
        flat_genres.append(item)

# finally, convert the flat list to a unique list
# we can re - assign our nested list
genre_list = list(dict.fromkeys(flat_genres))
print("\n", len(genre_list), "items in new `genre_list`:\n", genre_list)

nested:
 [['Action', 'Crime', 'Drama'], ['Biography', 'Drama'], ['Drama'], ['Comedy', 'Drama'], ['Comedy', 'Drama', 'Fantasy']]

first sublist item: Biography

 28 items in new `genre_list`:
 ['Action', 'Crime', 'Drama', 'Biography', 'Comedy', 'Fantasy', 'Horror', 'Thriller', 'Adventure', 'Animation', 'Documentary', 'History', 'None', 'Mystery', 'Sci-Fi', 'Romance', 'Family', 'War', 'Music', 'Sport', 'Western', 'Musical', 'Adult', 'News', 'Talk-Show', 'Reality-TV', 'Game-Show', 'Short']


In [None]:
df =
list_column =
new_column =



def expand_list(df, list_column, new_column): 
    lens_of_lists = df[list_column].apply(len)
    origin_rows = range(df.shape[0])
    destination_rows = np.repeat(origin_rows, lens_of_lists)
    non_list_cols = (
      [idx for idx, col in enumerate(df.columns)
       if col != list_column]
    )
    expanded_df = df.iloc[destination_rows, non_list_cols].copy()
    expanded_df[new_column] = (
      [item for items in df[list_column] for item in items]
      )
    expanded_df.reset_index(inplace=True, drop=True)
    return expanded_df

#usage
expanded_dataframe = expand_list(old_dataframe,"Item List", "Item")

### Add `imdb_title_basics` to sqlite

In [11]:
# add the new df as a table to the sqlite database
xxxxxxxxxxxxxxxxxxxxxxxx_df.to_sql('clean_bom_tbl', conn, if_exists='replace')

# view names of all tables in the sql database to verify operation
# conn.execute("select name from sqlite_master where type='table';").fetchall()

[('bom_movie_gross',),
 ('imdb_name_basics',),
 ('imdb_title_akas',),
 ('imdb_title_basics',),
 ('imdb_title_crew',),
 ('imdb_title_principals',),
 ('imdb_title_ratings',),
 ('rt_movie_info',),
 ('rt_reviews',),
 ('tmdb_movies',),
 ('tn_movie_budgets',),
 ('clean_bom_tbl',)]

### `rt_movie_info`

### `tmdb_movies`

### `tn_movie_budgets`

<h2 align='left'><font color='chocolate'>START QUESTIONS-----------------------------------</font></h2>
<h2 align='center'><font color='chocolate'>Top Gross</font></h2>

<h3 align='center'><font color='coral'>by title</font></h3>

<h3>Question(s)</h3>

* What are the recent top grossing movies?

### From initial loading and structuring of data, we know we need to clean a few fields.
### Let's start with `bom_movie_gross`

In [None]:
bom_df_clean.sort_values(by=['domestic_gross'], ascending=False
                        ).head(5)

### Convert the data-types of `year` and of `foreign_gross`

In [None]:
# we can convert year to datetime
bom_df_clean['year'] = pd.to_datetime(bom_df_clean['year'], format='%Y')


In [None]:
# can we make a new `Year` column with just the year
bom_df_clean['Year'] = bom_df_clean['year'].dt.year
bom_df_clean.head(1)

In [None]:
# what years are available in this data set?
# note: we only want the year, not the full datetime
print("\n`Year` stored as type:", bom_df_clean['Year'].dtype)
print(bom_df_clean.Year.unique(), "\n")

In [None]:
# lets ad a column computing total_gross
bom_df_clean['Total_gross'] = bom_df_clean['domestic_gross'] + bom_df_clean['foreign_gross']

# and view a sample
bom_df_clean.loc[bom_df_clean.title == 'Avengers: Infinity War', :]

### Find the highest grossing movies.

In [None]:
# what were the top 10 highest - gross movies
# 2018
bom_2018_df = bom_df_clean.loc[bom_df_clean.Year == 2018]
display(bom_2018_df.sort_values(by=['Total_gross'], ascending=False).head(10))

#2016 - 2018
bom_2016_18_df = bom_df_clean.loc[bom_df_clean.Year.between (2016, 2018)]
display(bom_2016_18_df.sort_values(by=['Total_gross'], ascending=False).head(10))

#### Even before visual EDA the data is beginning to tell a story re. action/ adventure, sequels, franchises, remakes, and animation.

#### What can we learn from visualizing these dataframes?

In [None]:
# import visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# viewing the three most recent years from the dataset
sns.set_style("whitegrid")
sns.boxplot(x="Year", y="Total_gross", data=bom_2016_18_df)

#### Top grossing movies are and have been far above the norm.

### Now let's take a look at `tn_movie_budgets`

#### We see from our previews that budget and gross fields have dollar signs, commas, and a text data type. Let's clean that up.

In [None]:
cur.execute('''SELECT *
                    FROM tn_movie_budgets
                    LIMIT 5;''')

tn_df = pd.DataFrame(cur.fetchall())
tn_df.columns = [x[0] for x in cur.description]
tn_df.dtypes

tn_df

In [None]:
# make a clean df for `tn_df`
cur.execute('''SELECT *
                    FROM tn_movie_budgets
                    ;
                    ''')

tn_df_clean = pd.DataFrame(cur.fetchall())
tn_df_clean.columns = [x[0] for x in cur.description]

In [None]:
# note: sorting will have no effect until text values are converted
# tn_df_clean.sort_values(by=['worldwide_gross'], ascending=False).head(10)

In [None]:
# verify correct column selection to clean
tn_df_clean.columns[3:]

#### We need large integer data types for these fields. Symbols and separators must be removed to do this. Method chaining keeps the operation efficient.

In [None]:
# `convert_dollars_to_int(df, col)` accepts a dataframe and a column
# encountered errors
# we can quickly pass each of the necessary args via a loop
for col in tn_df_clean.columns[3:]:
    tn_df_clean[col] = tn_df_clean[col].str.replace("$", "").str.replace(",", "").astype('int64')

tn_df_clean.dtypes

In [None]:
# view the top - ten in order of `worldwide_gross`
display(tn_df_clean.head(10).sort_values(by=['worldwide_gross'], ascending=False))

#### That's a bit different than the `bom...` table. The top earner far oupaces that from the previous dataframe. A data dictionary could help us to differentiate earnings sources reflected in the data (eg., strictly box office vs all media) as well as providing  clearer understanding for when data were collected.

### The `tn...` dataframe also enables us to compute net earnings figures.

In [None]:
tn_df_clean['Worldwide_net'] = tn_df_clean['worldwide_gross'] - tn_df_clean['production_budget']
display(tn_df_clean.head(10).sort_values(by=['Worldwide_net'], ascending=False))

### What can we infer from a cursory visual analysis of `production_budget`, `worldwide_gross`, and `Worldwide_net`"

In [None]:
fig, ax = plt.subplots(ncols=3, figsize=(13, 5))
sns.distplot(tn_df_clean.production_budget, bins = 65, ax=ax[0])
sns.distplot(tn_df_clean.worldwide_gross, bins = 65, ax=ax[1])
sns.distplot(tn_df_clean.Worldwide_net, bins = 65, ax=ax[2])

plt.show()


In [None]:
fig, ax = plt.subplots(ncols=3, figsize=(13, 5))
sns.boxplot(tn_df_clean.production_budget,  orient='v', ax=ax[0])
sns.boxplot(tn_df_clean.worldwide_gross,  orient='v', ax=ax[1])
sns.boxplot(tn_df_clean.Worldwide_net,  orient='v', ax=ax[2])

plt.subplots_adjust(wspace=.75, hspace=.5);

plt.show()


### It seems a few things are highly funded (far from the median), most are modestly funded, and relatively high net profit is rare thing.

In [None]:
# in numbers for `production_budget`
tn_df_clean.production_budget.describe()

In [None]:
# view a seaborn pairplot matrix with regression lines
sns.pairplot(tn_df_clean, kind="reg")
sns.set(font_scale=.8) # try to prevent overlap of long column names
plt.subplots_adjust(wspace=.02, hspace=.1);
plt.show()

### We see from the visualization that budget is correlated with net and gross figures.

In [None]:
# create a variable for the correlation
corr = tn_df_clean.corr()

# subplots
plt.figure(figsize=(24,9)) # total figure size
plt.subplots_adjust(wspace=.45, hspace=.1); # prevent overlap
sns.set(font_scale=1.55) # adjust for annotation legibility

plt.subplot(1,2,1) # correlation with regression line
sns.regplot(x='production_budget', y='worldwide_gross', data=tn_df_clean)

plt.subplot(1,2,2) # heatmap
sns.heatmap(corr,  cbar=True, linecolor="w", linewidths=1, cmap='Blues', square=True, annot=True, annot_kws={"size": 22})

### For what it is worth, the heatmap suggests that budget has a slightly stronger corellation with `worldwide_gross` than with `domestic_gross`.

### Before we go further, it could be worthwhile to determine whether the `tn_df_clean` dataframe can be joined to the `bom_2016_18_df` dataframe. Their respective `movie` and `title` columns appear promising.

In [None]:
# Can we find a movie by the same name?
display(tn_df_clean.loc[tn_df_clean.movie == 'Dark Phoenix'])
display(bom_2016_18_df.loc[bom_2016_18_df.title == 'Dark Phoenix'])

# rt_genres = pd.merge(rt_df_clean, df1, left_index=True, right_index=True)

In [None]:
print("`tn`", "-"*80)
print("\n", tn_df_clean.info())
print("`bom`", "-"*80)
print("\n", bom_2016_18_df.info())

#### We observe a significant mismatch in the number of entries. A joined table could miss more than 80% of the movies in `tn_df_clean`.

<h3 align='center'><font color='coral'>by studio</font></h3>

In [None]:
# ... the 5 highest grossing studios
print("2018", bom_2018_df.groupby('studio').sum(
).sort_values(by=['Total_gross'], ascending=False).head())


print("-"*80)

print("2016-2018", bom_2016_18_df.groupby('studio').sum(
).sort_values(by=['Total_gross'], ascending=False).head())

#### Looks like we have an issue with `Year` values being summed.

<h3 align='center'><font color='coral'>by genre</font></h3>

## So how about those genres?

### `rt_movie_info` has a genre field, as well as a box office field, so that might make a good transition as we move our investigation toward the next category.

In [None]:
# make a clean df for `rt_movie_info`
cur.execute('''SELECT *
                    FROM rt_movie_info
                    ;
                    ''')

rt_clean_df = pd.DataFrame(cur.fetchall())
rt_clean_df.columns = [x[0] for x in cur.description]

In [None]:
# let's see what shape the dataframe is in
print("rows and columns:", rt_clean_df.shape)

# how many values are missing
print("number of 'box_office' missing values:", rt_clean_df['box_office'].isna().sum())

# we may as well take a deeper look while we're at it
display(rt_clean_df.head(5).sort_values(by=['box_office'], ascending=False))

#### Whoa! That's a rotten number of missing values. And where are the movie titles?
#### Breaking - out those genre categories may be useful for joining tables as we go forward. 

In [None]:
# split `rt_clean_df.genre` values to a new dataframe
df1 = rt_clean_df.genre.str.split("|", expand=True)
df1.columns = "g1 g2 g3 g4 g5 g6 g7".split()
df1

In [None]:
# Create a list of unique genres for future use
genre_list = []

for cols in df1.columns:
    for v in df1[cols]: # check values in all columns
        if v != None: # disregard None values
            if v not in genre_list:
                genre_list.append(v) # add to the list

genre_list

#### Merge the df1 genre columns to  `rt_df_clean` as a new data frame.

In [None]:
# join the two datframes to add genre columns to each row
# these dataframes share the same indices
rt_genres = pd.merge(rt_clean_df, df1, left_index=True, right_index=True)
display(rt_genres.head(1))

### The next is table from imdb. Let's see how useful it is for investigating genre.

In [None]:
# make a clean df for `...basics`
cur.execute('''SELECT *
                    FROM imdb_title_basics
                    ;
                    ''')

imdb_basics_df_clean = pd.DataFrame(cur.fetchall())
imdb_basics_df_clean.columns = [x[0] for x in cur.description]

In [None]:
imdb_basics_df_clean.head(3)

### Is the genre list the same as the previous dataframe.

In [None]:
# create an empty list for this dataframe's genres
imdb_genres = []

a = imdb_basics_df_clean.genres.unique() # returns a numpy array
b = [str(i) for i in a] # returns a list

b = list(filter(None, b)) # remove None values

print(b[0:5])

#### Here we have words, with un-spaced commas inseparable from permanent compound-word arrays.

In [None]:
# let's create a function to test this
def split_list_of_strings(list, separator):
    print("the first element is", list[0]) # show the first item
    for element in b:
        # show the first item after `split`
        element.split(",", 5) # long enough to check for all commas
    print("...after processing, the first element is now", list[0])

split_list_of_strings(b, ",")

In [None]:
element_count = 0
set_b = set(b)

for element in set_b:
    if "Action" in element:
        element_count += 1

print(element_count, "unique elements contain 'Action', out of", len(set_b))
print(set_b)
# b[0:][:10]

#### Let's move on.

***

<h2 align='center'><font color='chocolate'>Top Net</font></h2>

<h3>Question(s)</h3>

* First

<h3 align='center'><font color='coral'>by title</font></h3>

<h3 align='center'><font color='coral'>by studio</font></h3>

<h3 align='center'><font color='coral'>by genre</font></h3>

***

***

<h2 align='center'><font color='chocolate'>Correlation</font></h2>

<h3>Question(s)</h3>

* First

<h3 align='center'><font color='coral'>gross, net</font></h3>

<h3 align='center'><font color='coral'>genre, gross</font></h3>

<h3 align='center'><font color='coral'>genre, net [only if gross, net not correlated</font></h3>

***

<h2 align='center'><font color='chocolate'>Top Studios</font></h2>

<h3>Question(s)</h3>

* First

<h3 align='center'><font color='coral'>gross genre mix</font></h3>

<h3 align='center'><font color='coral'>net genre mix</font></h3>

***

<h2 align='center'><font color='chocolate'>Final Recommendations</font></h2>

In [None]:
# looking at ...
# plt.figure(figsize=(8, 5))
# sns.violinplot(x='year', y='Total_gross', data=bom_2016_18_df)

In [None]:
import matplotlib.pyplot as plt
import matplotlib

matplotlib.style.use('ggplot')


data = [[2000, 2000, 2000, 2001, 2001, 2001, 2002, 2002, 2002],
        ['Jan', 'Feb', 'Mar', 'Jan', 'Feb', 'Mar', 'Jan', 'Feb', 'Mar'],
        [1, 2, 3, 4, 5, 6, 7, 8, 9]]

rows = zip(data[0], data[1], data[2])
headers = ['Year', 'Month', 'Value']
df = pd.DataFrame(rows, columns=headers)

df

In [None]:
pivot_df = df.pivot(index='Year', columns='Month', values='Value')
pivot_df

In [None]:
colors = ["#006D2C", "#31A354","#74C476"]
#Note: .loc[:,['Jan','Feb', 'Mar']] is used here to rearrange the layer ordering
pivot_df.loc[:,['Jan','Feb', 'Mar']].plot.barh(stacked=True, color=colors, figsize=(10,7))

In [None]:
plt.bar()

In [None]:
cur.close()
conn.close()