In [2]:
import pandas as pd
import numpy as np
import sqlite3
import zipfile

## List of Dataframes:

In [3]:
# Dataframes
movie_info_df = pd.read_csv('zippedData/rt.movie_info.tsv.gz', sep='\t', compression='gzip')
print(f'movie_info_df columns: \n{list(movie_info_df.columns)}\n')
movie_gross_df = pd.read_csv('zippedData/bom.movie_gross.csv.gz')
print(f'movie_gross_df columns: \n{list(movie_gross_df.columns)}\n')
movie_budget_df = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')
print(f'movie_budget_df columns: \n{list(movie_budget_df.columns)}\n')
movie_reviews_df = pd.read_csv('zippedData/rt.reviews.tsv.gz', sep='\t', compression='gzip', encoding='latin-1')
print(f'movie_reviews_df columns: \n{list(movie_reviews_df.columns)}\n')
tmbd_movie_df = pd.read_csv('zippedData/tmdb.movies.csv.gz')
print(f'tmbd_movie_df columns: \n{list(tmbd_movie_df.columns)}\n')

movie_info_df columns: 
['id', 'synopsis', 'rating', 'genre', 'director', 'writer', 'theater_date', 'dvd_date', 'currency', 'box_office', 'runtime', 'studio']

movie_gross_df columns: 
['title', 'studio', 'domestic_gross', 'foreign_gross', 'year']

movie_budget_df columns: 
['id', 'release_date', 'movie', 'production_budget', 'domestic_gross', 'worldwide_gross']

movie_reviews_df columns: 
['id', 'review', 'rating', 'fresh', 'critic', 'top_critic', 'publisher', 'date']

tmbd_movie_df columns: 
['Unnamed: 0', 'genre_ids', 'id', 'original_language', 'original_title', 'popularity', 'release_date', 'title', 'vote_average', 'vote_count']



In [31]:
movie_gross_df['domestic_gross'].max()


936700000.0

---

## SQL DATABASE

* Unzipping the '**im.db.zip**' file 
* Connecting to database using '**conn**'
* Printing all of the **table names** within the database


**Important note**: movie_basics & movie_ratings are the most relevant per instructions

In [4]:
# Unzip the database file
with zipfile.ZipFile('zippedData/im.db.zip', 'r') as zip_ref:
    zip_ref.extractall('zippedData')

# Connect to the unzipped SQLite database
conn = sqlite3.connect('zippedData/im.db')

# Run test query
q = """
SELECT tbl_name AS table_name, sql
FROM sqlite_master 
WHERE type='table'
ORDER BY name;
"""
pd.read_sql(q, conn)

Unnamed: 0,table_name,sql
0,directors,"CREATE TABLE ""directors"" (\n""movie_id"" TEXT,\n..."
1,known_for,"CREATE TABLE ""known_for"" (\n""person_id"" TEXT,\..."
2,movie_akas,"CREATE TABLE ""movie_akas"" (\n""movie_id"" TEXT,\..."
3,movie_basics,"CREATE TABLE ""movie_basics"" (\n""movie_id"" TEXT..."
4,movie_ratings,"CREATE TABLE ""movie_ratings"" (\n""movie_id"" TEX..."
5,persons,"CREATE TABLE ""persons"" (\n""person_id"" TEXT,\n ..."
6,principals,"CREATE TABLE ""principals"" (\n""movie_id"" TEXT,\..."
7,writers,"CREATE TABLE ""writers"" (\n""movie_id"" TEXT,\n ..."


In [9]:
# Movie Basics
q = '''
SELECT *
FROM movie_basics
LIMIT 3
;
'''
pd.read_sql(q, conn)

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama


In [10]:
# Moving Ratings
q = '''
SELECT *
FROM movie_ratings
LIMIT 3
;
'''
pd.read_sql(q, conn)

Unnamed: 0,movie_id,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20


In [6]:
# Directors
q = '''
SELECT *
FROM directors
LIMIT 3
;
'''
pd.read_sql(q, conn)

Unnamed: 0,movie_id,person_id
0,tt0285252,nm0899854
1,tt0462036,nm1940585
2,tt0835418,nm0151540


In [7]:
# Known For
q = '''
SELECT *
FROM known_for
LIMIT 3
;
'''
pd.read_sql(q, conn)

Unnamed: 0,person_id,movie_id
0,nm0061671,tt0837562
1,nm0061671,tt2398241
2,nm0061671,tt0844471


In [8]:
# Movie AKAs
q = '''
SELECT *
FROM movie_akas
LIMIT 3
;
'''
pd.read_sql(q, conn)

Unnamed: 0,movie_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0


In [11]:
# Persons
q = '''
SELECT *
FROM persons
LIMIT 3
;
'''
pd.read_sql(q, conn)

Unnamed: 0,person_id,primary_name,birth_year,death_year,primary_profession
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer"


In [12]:
# Pricipals
q = '''
SELECT *
FROM principals
LIMIT 3
;
'''
pd.read_sql(q, conn)

Unnamed: 0,movie_id,ordering,person_id,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,


In [13]:
# Writers
q = '''
SELECT *
FROM writers
LIMIT 3
;
'''
pd.read_sql(q, conn)

Unnamed: 0,movie_id,person_id
0,tt0285252,nm0899854
1,tt0438973,nm0175726
2,tt0438973,nm1802864


---

In [24]:
# joined file
q = '''
SELECT *
FROM movie_basics mb
JOIN directors dr USING(movie_id)
JOIN principals pr USING(movie_id)
JOIN persons p USING(person_id)
JOIN movie_ratings mr USING(movie_id)
WHERE death_year IS NULL
GROUP BY movie_id
LIMIT 20
;
'''
pd.read_sql(q, conn)

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres,person_id,ordering,person_id.1,category,job,characters,primary_name,birth_year,death_year,primary_profession,averagerating,numvotes
0,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy",nm0765384,1,nm0016013,actor,,,Valeria Sarmiento,1948.0,,"editor,director,writer",6.5,119
1,tt0112502,Bigfoot,Bigfoot,2017,,"Horror,Thriller",nm6883878,1,nm6883878,director,,,Mc Jones,,,"actor,director",4.1,32
2,tt0137204,Joe Finds Grace,Joe Finds Grace,2017,83.0,"Adventure,Animation,Comedy",nm0365480,1,nm0365480,actor,,"[""Joseph Briteman""]",Anthony Harrison,1961.0,,"actor,writer,producer",8.1,263
3,tt0146592,Pál Adrienn,Pál Adrienn,2010,136.0,Drama,nm1030585,1,nm3140529,actress,,"[""Piroska""]",Ágnes Kocsis,1971.0,,"director,writer,producer",6.8,451
4,tt0159369,Cooper and Hemingway: The True Gen,Cooper and Hemingway: The True Gen,2013,180.0,Documentary,nm0611850,1,nm0001832,actor,,"[""Narrator""]",John Mulholland,,,"director,writer,producer",7.6,53
5,tt0162942,Children of the Green Dragon,A zöld sárkány gyermekei,2010,89.0,Drama,nm1207262,1,nm0753588,actor,,"[""Máté János""]",Bence Miklauzic,1970.0,,"director,writer,assistant_director",6.9,120
6,tt0176694,The Tragedy of Man,Az ember tragédiája,2011,160.0,"Animation,Drama,History",nm0417757,1,nm2103019,archive_footage,,,Marcell Jankovics,1941.0,,"writer,director,animation_department",7.8,584
7,tt0192528,Heaven & Hell,Reverse Heaven,2018,104.0,Drama,nm0667001,1,nm0528638,actress,,"[""Sergeant Coolahan""]",Stuart Paul,,,"writer,director,actor",4.0,72
8,tt0247643,Los pájaros se van con la muerte,Los pájaros se van con la muerte,2011,110.0,"Drama,Mystery",nm0881867,1,nm0815280,actress,,"[""Madre""]",Thaelman Urgelles,1948.0,,"writer,producer,director",4.0,12
9,tt0249516,Foodfight!,Foodfight!,2012,91.0,"Action,Animation,Comedy",nm0440415,1,nm0240381,actress,,"[""Sunshine Goodness""]",Lawrence Kasanoff,1959.0,,"producer,writer,director",1.9,8248
