In [111]:
import pandas as pd
import numpy as np
import sqlite3
import zipfile

## List of Dataframes:

In [118]:
# Load every flat-file source into its own DataFrame and echo its column names.

# rt.movie_info: tab-separated; the columns listed in movie_info_drop are discarded on load.
movie_info_drop = ['box_office', 'currency', 'dvd_date']
movie_info_df = pd.read_csv(
    'zippedData/rt.movie_info.tsv.gz',
    sep='\t',
    compression='gzip',
).drop(columns=movie_info_drop)
print(f'movie_info_df columns: \n{list(movie_info_df.columns)}\n')

# bom.movie_gross: comma-separated, read with pandas defaults.
movie_gross_df = pd.read_csv('zippedData/bom.movie_gross.csv.gz')
print(f'movie_gross_df columns: \n{list(movie_gross_df.columns)}\n')

# tn.movie_budgets: comma-separated, read with pandas defaults.
movie_budget_df = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')
print(f'movie_budget_df columns: \n{list(movie_budget_df.columns)}\n')

# rt.reviews: tab-separated and decoded as latin-1.
movie_reviews_df = pd.read_csv(
    'zippedData/rt.reviews.tsv.gz',
    sep='\t',
    compression='gzip',
    encoding='latin-1',
)
print(f'movie_reviews_df columns: \n{list(movie_reviews_df.columns)}\n')

# tmdb.movies: comma-separated, read with pandas defaults.
tmbd_movie_df = pd.read_csv('zippedData/tmdb.movies.csv.gz')
print(f'tmbd_movie_df columns: \n{list(tmbd_movie_df.columns)}\n')

movie_info_df columns: 
['id', 'synopsis', 'rating', 'genre', 'director', 'writer', 'theater_date', 'runtime', 'studio']

movie_gross_df columns: 
['title', 'studio', 'domestic_gross', 'foreign_gross', 'year']

movie_budget_df columns: 
['id', 'release_date', 'movie', 'production_budget', 'domestic_gross', 'worldwide_gross']

movie_reviews_df columns: 
['id', 'review', 'rating', 'fresh', 'critic', 'top_critic', 'publisher', 'date']

tmbd_movie_df columns: 
['Unnamed: 0', 'genre_ids', 'id', 'original_language', 'original_title', 'popularity', 'release_date', 'title', 'vote_average', 'vote_count']



In [145]:
# Show every tmbd_movie_df row whose original_title contains the substring "Toy".
# na=False makes a missing title count as "no match", so the boolean mask never
# holds NaN (a mask containing NaN cannot be used for row selection).
tmbd_movie_df[tmbd_movie_df['original_title'].str.contains("Toy", na=False)]


Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
7,7,"[16, 10751, 35]",10193,en,Toy Story 3,24.445,2010-06-17,Toy Story 3,7.7,8340
10,10,"[16, 35, 10751]",863,en,Toy Story 2,22.698,1999-11-24,Toy Story 2,7.5,7553
726,726,"[27, 14]",56242,en,Demonic Toys: Personal Demons,2.475,2010-02-26,Demonic Toys: Personal Demons,3.8,14
2473,2473,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
2477,2477,"[16, 35, 10751]",863,en,Toy Story 2,22.698,1999-11-24,Toy Story 2,7.5,7553
3007,3007,"[35, 10749]",77665,en,Boy Toy,4.836,2011-08-31,Boy Toy,4.5,10
3508,3508,[99],54358,en,Resurrect Dead: The Mystery of the Toynbee Tiles,1.879,2011-08-01,Resurrect Dead: The Mystery of the Toynbee Tiles,6.5,27
10089,10089,[],175939,en,Live Action Toy Story,0.6,2013-01-08,Live Action Toy Story,9.0,1
10250,10250,"[12, 16, 35, 10751, 14, 27, 10770]",213121,en,Toy Story of Terror!,0.6,2014-08-19,Toy Story of Terror!,7.1,413


---

## SQL DATABASE

* Unzipping the '**im.db.zip**' file 
* Connecting to database using '**conn**'
* Printing all of the **table names** within the database


**Important note**: per the instructions, `movie_basics` and `movie_ratings` are the most relevant tables.

In [119]:
# Extract the zipped SQLite database next to the other data files.
with zipfile.ZipFile('zippedData/im.db.zip', mode='r') as db_archive:
    db_archive.extractall(path='zippedData')

# Open the extracted database; `conn` is the connection handle used by the query cells.
conn = sqlite3.connect('zippedData/im.db')

# Sanity check: list each table recorded in sqlite_master together with its CREATE statement.
q = """
SELECT tbl_name AS table_name, sql
FROM sqlite_master 
WHERE type='table'
ORDER BY name;
"""
pd.read_sql(sql=q, con=conn)

Unnamed: 0,table_name,sql
0,directors,"CREATE TABLE ""directors"" (\n""movie_id"" TEXT,\n..."
1,known_for,"CREATE TABLE ""known_for"" (\n""person_id"" TEXT,\..."
2,movie_akas,"CREATE TABLE ""movie_akas"" (\n""movie_id"" TEXT,\..."
3,movie_basics,"CREATE TABLE ""movie_basics"" (\n""movie_id"" TEXT..."
4,movie_ratings,"CREATE TABLE ""movie_ratings"" (\n""movie_id"" TEX..."
5,persons,"CREATE TABLE ""persons"" (\n""person_id"" TEXT,\n ..."
6,principals,"CREATE TABLE ""principals"" (\n""movie_id"" TEXT,\..."
7,writers,"CREATE TABLE ""writers"" (\n""movie_id"" TEXT,\n ..."


In [142]:
# Movie Basics: preview up to three rows whose original_title matches the pattern Toy%.
# The pattern is written as a single-quoted SQL string literal; double quotes denote
# identifiers in SQL and are only accepted as strings by SQLite through a legacy fallback.
q = '''
SELECT *
FROM movie_basics
WHERE original_title LIKE 'Toy%'
LIMIT 3
;
'''
pd.read_sql(q, conn)

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0435761,Toy Story 3,Toy Story 3,2010,103.0,"Adventure,Animation,Comedy"
1,tt1608255,Toyland,Toyland,2010,68.0,"Biography,Documentary,Family"
2,tt1754780,Toy Masters,Toy Masters,2012,97.0,"Biography,Comedy,Documentary"


In [121]:
# Movie Ratings: preview the first three rows of the movie_ratings table.
q = '''
SELECT *
FROM movie_ratings
LIMIT 3
;
'''
pd.read_sql(sql=q, con=conn)

Unnamed: 0,movie_id,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20


In [122]:
# Directors: preview the first three rows of the directors table.
q = '''
SELECT *
FROM directors
LIMIT 3
;
'''
pd.read_sql(sql=q, con=conn)

Unnamed: 0,movie_id,person_id
0,tt0285252,nm0899854
1,tt0462036,nm1940585
2,tt0835418,nm0151540


In [123]:
# Known For: preview the first three rows of the known_for table.
q = '''
SELECT *
FROM known_for
LIMIT 3
;
'''
pd.read_sql(sql=q, con=conn)

Unnamed: 0,person_id,movie_id
0,nm0061671,tt0837562
1,nm0061671,tt2398241
2,nm0061671,tt0844471


In [124]:
# Movie AKAs: preview the first three rows of the movie_akas table.
q = '''
SELECT *
FROM movie_akas
LIMIT 3
;
'''
pd.read_sql(sql=q, con=conn)

Unnamed: 0,movie_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0


In [125]:
# Persons: preview the first three rows of the persons table.
q = '''
SELECT *
FROM persons
LIMIT 3
;
'''
pd.read_sql(sql=q, con=conn)

Unnamed: 0,person_id,primary_name,birth_year,death_year,primary_profession
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer"


In [126]:
# Principals: preview the first three rows of the principals table.
q = '''
SELECT *
FROM principals
LIMIT 3
;
'''
pd.read_sql(sql=q, con=conn)

Unnamed: 0,movie_id,ordering,person_id,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,


In [127]:
# Writers: preview the first three rows of the writers table.
q = '''
SELECT *
FROM writers
LIMIT 3
;
'''
pd.read_sql(sql=q, con=conn)

Unnamed: 0,movie_id,person_id
0,tt0285252,nm0899854
1,tt0438973,nm0175726
2,tt0438973,nm1802864


---

In [153]:
# Joined view: movie_basics combined with directors, principals, persons and movie_ratings.
# Only rows where death_year IS NULL are kept; the result is grouped by movie_id and
# ordered by start_year.
# NOTE(review): the query selects * while grouping by movie_id alone, so the remaining
# columns are not aggregated; confirm which director/principal row per movie is intended.
# NOTE(review): directors and principals each contribute a person_id column (the result
# shows both person_id and person_id.1); verify which one the persons join should match.
q = '''
SELECT *
FROM movie_basics mb
JOIN directors dr USING(movie_id)
JOIN principals pr USING(movie_id)
JOIN persons p USING(person_id)
JOIN movie_ratings mr USING(movie_id)
WHERE death_year IS NULL
GROUP BY movie_id
ORDER BY start_year
;
'''
pd.read_sql(sql=q, con=conn)

Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres,person_id,ordering,person_id.1,category,job,characters,primary_name,birth_year,death_year,primary_profession,averagerating,numvotes
0,tt0146592,Pál Adrienn,Pál Adrienn,2010,136.0,Drama,nm1030585,1,nm3140529,actress,,"[""Piroska""]",Ágnes Kocsis,1971.0,,"director,writer,producer",6.8,451
1,tt0162942,Children of the Green Dragon,A zöld sárkány gyermekei,2010,89.0,Drama,nm1207262,1,nm0753588,actor,,"[""Máté János""]",Bence Miklauzic,1970.0,,"director,writer,assistant_director",6.9,120
2,tt0312305,Quantum Quest: A Cassini Space Odyssey,Quantum Quest: A Cassini Space Odyssey,2010,45.0,"Adventure,Animation,Sci-Fi",nm0820800,1,nm1517976,actor,,"[""Dave""]",Daniel St. Pierre,1961.0,,"animation_department,art_department,director",5.1,287
3,tt0326592,The Overnight,The Overnight,2010,88.0,,nm1208371,1,nm1211335,actor,,"[""Ted Bell""]",Jed I. Goodman,,,"camera_department,writer,producer",7.5,24
4,tt0326965,In My Sleep,In My Sleep,2010,104.0,"Drama,Mystery,Thriller",nm1075006,1,nm0934618,actor,,"[""Marcus""]",Allen Wolf,1970.0,,"miscellaneous,producer,director",5.5,1889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72532,tt9913056,Swarm Season,Swarm Season,2019,86.0,Documentary,nm1502645,1,nm1502645,director,,,Sarah Christman,,,"miscellaneous,director,editor",6.2,5
72533,tt9913084,Diabolik sono io,Diabolik sono io,2019,75.0,Documentary,nm0812850,1,nm0769233,actor,,"[""Zarcone""]",Giancarlo Soldi,1954.0,,"director,writer,producer",6.2,6
72534,tt9914286,Sokagin Çocuklari,Sokagin Çocuklari,2019,98.0,"Drama,Family",nm4394529,1,nm4394529,actor,,,Ahmet Faik Akinci,,,"director,writer",8.7,136
72535,tt9914942,La vida sense la Sara Amat,La vida sense la Sara Amat,2019,,,nm1716653,1,nm1290838,actor,,,Laura Jou,,,"miscellaneous,actress,director",6.6,5


# Filtering