In [754]:
import pandas as pd
import numpy as np
import sqlite3
import zipfile

## List of Dataframes:

In [755]:
# Dataframes
movie_info_df = pd.read_csv('zippedData/rt.movie_info.tsv.gz', sep='\t', compression='gzip')
movie_info_drop = ['box_office', 'currency', 'dvd_date', 'studio']
movie_info_df = movie_info_df.drop(columns=movie_info_drop)
print(f'movie_info_df columns: \n{list(movie_info_df.columns)}\n')


movie_gross_df = pd.read_csv('zippedData/bom.movie_gross.csv.gz')
print(f'movie_gross_df columns: \n{list(movie_gross_df.columns)}\n')

movie_budget_df = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')
print(f'movie_budget_df columns: \n{list(movie_budget_df.columns)}\n')

movie_reviews_df = pd.read_csv('zippedData/rt.reviews.tsv.gz', sep='\t', compression='gzip', encoding='latin-1')
movie_review_drop = ['rating']
movie_reviews_df = movie_reviews_df.drop(columns=movie_review_drop) # Dropping rating: poor formatting & lacking 50% of data.
print(f'movie_reviews_df columns: \n{list(movie_reviews_df.columns)}\n')


tmbd_movie_df = pd.read_csv('zippedData/tmdb.movies.csv.gz')
print(f'tmbd_movie_df columns: \n{list(tmbd_movie_df.columns)}\n')

movie_info_df columns: 
['id', 'synopsis', 'rating', 'genre', 'director', 'writer', 'theater_date', 'runtime']

movie_gross_df columns: 
['title', 'studio', 'domestic_gross', 'foreign_gross', 'year']

movie_budget_df columns: 
['id', 'release_date', 'movie', 'production_budget', 'domestic_gross', 'worldwide_gross']

movie_reviews_df columns: 
['id', 'review', 'fresh', 'critic', 'top_critic', 'publisher', 'date']

tmbd_movie_df columns: 
['Unnamed: 0', 'genre_ids', 'id', 'original_language', 'original_title', 'popularity', 'release_date', 'title', 'vote_average', 'vote_count']



In [803]:
combined_df = combined_df.merge(movie_budget_df[['movie', 'production_budget']], 
                                left_on='title', 
                                right_on='movie', 
                                how='inner')

# if you want to drop the 'movie' column after the merge, you can do so
combined_df = combined_df.drop('movie', axis=1)
combined_df.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1497 entries, 0 to 1496
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           1497 non-null   int64  
 1   genre_ids            1497 non-null   object 
 2   id                   1497 non-null   int64  
 3   original_language    1497 non-null   object 
 4   original_title       1497 non-null   object 
 5   popularity           1497 non-null   float64
 6   release_date         1497 non-null   object 
 7   title                1497 non-null   object 
 8   vote_average         1497 non-null   float64
 9   vote_count           1497 non-null   int64  
 10  studio               1496 non-null   object 
 11  domestic_gross       1495 non-null   float64
 12  foreign_gross        1278 non-null   object 
 13  year                 1497 non-null   int64  
 14  production_budget_x  1497 non-null   object 
 15  production_budget_y  1497 non-null   o

In [788]:
combined_df = tmbd_movie_df.merge(movie_gross_df, on='title', how='inner')
combined_df.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,studio,domestic_gross,foreign_gross,year
0,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,P/DW,217600000.0,277300000,2010
1,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,Par.,312400000.0,311500000,2010
2,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,WB,292600000.0,535700000,2010
3,7,"[16, 10751, 35]",10193,en,Toy Story 3,24.445,2010-06-17,Toy Story 3,7.7,8340,BV,415000000.0,652000000,2010
4,8,"[16, 10751, 35]",20352,en,Despicable Me,23.673,2010-07-09,Despicable Me,7.2,10057,Uni.,251500000.0,291600000,2010


In [800]:
tmbd_movie_df.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


### movie_info_df: cleaned

In [761]:
movie_info_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 197 entries, 1 to 1542
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   id                  197 non-null    int64         
 1   synopsis            197 non-null    object        
 2   rating              197 non-null    object        
 3   genre               197 non-null    object        
 4   director            197 non-null    object        
 5   writer              197 non-null    object        
 6   theater_date        197 non-null    datetime64[ns]
 7   runtime_in_minutes  197 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 13.9+ KB


In [760]:
movie_info_df.isna().sum().sort_values(ascending=False)

runtime_in_minutes    0
theater_date          0
writer                0
director              0
genre                 0
rating                0
synopsis              0
id                    0
dtype: int64

Cleaning & Filtering for **MOVIE_INFO_DF**:

In [759]:
# Dropping 8 null rows with missing genre. Most info is missing with them:
movie_info_df = movie_info_df.dropna(subset=['genre'])

# filling all empty values with 'unknown' for ['synopsis', 'runtime', 'director', 'theater_date', 'writer']:
columns_to_fill = ['synopsis', 'runtime', 'director', 'theater_date', 'writer']
movie_info_df[columns_to_fill] = movie_info_df[columns_to_fill].fillna('unknown')

# Filtering date to 2010 and forward
movie_info_df['theater_date'] = pd.to_datetime(movie_info_df['theater_date'], format='%b %d, %Y', errors='coerce')
movie_info_df = movie_info_df[movie_info_df['theater_date'] >= 'Jan 1, 2010']



# changing runtime to int + removing 'minutes' + changing name of the column to reflect:
movie_info_df.loc[movie_info_df['runtime'] != 'unknown', 'runtime'] = movie_info_df.loc[movie_info_df['runtime'] != 'unknown', 'runtime'].str.replace(' minutes', '').astype(int)
movie_info_df = movie_info_df.rename(columns={'runtime': 'runtime_in_minutes'})

# drop all 26 unknown runtimes:
movie_info_df = movie_info_df[movie_info_df['runtime_in_minutes'] != 'unknown']


In [762]:
movie_info_df.head(3)

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,runtime_in_minutes
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,2012-08-17,108
8,14,"""Love Ranch"" is a bittersweet love story that ...",R,Drama,Taylor Hackford,Mark Jacobson,2010-06-30,117
15,22,Two-time Academy Award Winner Kevin Spacey giv...,R,Comedy|Drama|Mystery and Suspense,George Hickenlooper,Norman Snider,2010-12-17,108


**MOVIE_INFO_DF** Analysis:

In [763]:
# 81% of the movie market is rated PG-13, R and NR:
movie_info_df['rating'].value_counts()

R        89
PG-13    55
NR       30
PG       22
G         1
Name: rating, dtype: int64

In [764]:
# average movie length is just over 100 minutes
movie_info_df['runtime_in_minutes'].mean()

106.5228426395939

In [765]:
# Top Directors
movie_info_df['director'].value_counts().head(20)

unknown                      17
Clint Eastwood                2
Christian Ditter              2
Seth MacFarlane               2
Clay Kaytis|Fergal Reilly     1
Max Joseph                    1
Antoine Fuqua                 1
Vikramaditya Motwane          1
Kelly Fremon Craig            1
Daniel Lee                    1
Paul McGuigan                 1
Anton Corbijn                 1
David Fincher                 1
Tim Story                     1
Sam Mendes                    1
Patrick Lussier               1
Daniel Alfredson              1
Cameron Crowe                 1
Stuart Beattie                1
James Marsh                   1
Name: director, dtype: int64

In [732]:
# Top Writers
movie_info_df['writer'].value_counts().head(20)

unknown                                                                34
Seth MacFarlane|Alec Sulkin|Wellesley Wild                              2
Maggie Carey                                                            1
Stuart Beattie                                                          1
Eric Kripke                                                             1
Prasoon Joshi                                                           1
Isaac Aptaker|Elizabeth Berger                                          1
Norman Snider                                                           1
Peter Morgan                                                            1
Desiree Akhavan|Cecilia Frugiuele                                       1
Jan Sardi                                                               1
Aline Brosh McKenna                                                     1
Peter Byck|Eric Driscoll|Matt Weinhold|Karen Weigert                    1
Paolo Sorrentino|Umberto Contarello   

In [733]:
# Top Genres seem to be a mix of Drama & Comedy
movie_info_df['genre'].value_counts().head(20)

Drama                                                     41
Comedy                                                    18
Comedy|Drama                                              16
Drama|Mystery and Suspense                                10
Comedy|Romance                                             9
Drama|Romance                                              6
Action and Adventure|Mystery and Suspense                  6
Mystery and Suspense                                       5
Horror                                                     5
Action and Adventure|Science Fiction and Fantasy           4
Documentary|Special Interest                               3
Comedy|Drama|Romance                                       3
Animation|Comedy|Kids and Family                           3
Comedy|Drama|Mystery and Suspense                          3
Art House and International|Drama|Special Interest         2
Art House and International|Drama|Romance                  2
Art House and Internatio

### movie_reviews_df: cleaned

In [771]:
movie_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4713 entries, 24 to 54360
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          4713 non-null   int64         
 1   review      4713 non-null   object        
 2   fresh       4713 non-null   object        
 3   critic      4713 non-null   object        
 4   top_critic  4713 non-null   int64         
 5   publisher   4713 non-null   object        
 6   date        4713 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 294.6+ KB


In [770]:
movie_reviews_df.isna().sum().sort_values(ascending=False)

date          0
publisher     0
top_critic    0
critic        0
fresh         0
review        0
id            0
dtype: int64

**MOVIE_REVIEWS_DF** Data cleaning:

In [769]:
# Filtering to only top_critics == 1
movie_reviews_df = movie_reviews_df[movie_reviews_df['top_critic'] == 1]
# filling null 'publisher', 'critic', 'review' values with 'Unknown':
fill_columns = ['publisher', 'critic', 'review']
movie_reviews_df[fill_columns] = movie_reviews_df[fill_columns].fillna('unknown')
# Filtering date to 2010 and forward
movie_reviews_df['date'] = pd.to_datetime(movie_reviews_df['date'])
movie_reviews_df = movie_reviews_df[movie_reviews_df['date'] >= '2010-01-01']





In [776]:
movie_reviews_df.head(20)

Unnamed: 0,id,review,fresh,critic,top_critic,publisher,date
24,3,"The rapid dialogue is dry and mannered, like a...",rotten,Joe Williams,1,St. Louis Post-Dispatch,2012-09-06
47,3,It feels like each and every moment bursts for...,rotten,Stephanie Merry,1,Washington Post,2012-08-24
50,3,"There's not really a movie there, nothing that...",rotten,Mick LaSalle,1,San Francisco Chronicle,2012-08-23
54,3,The film is all too faithful to its un-cinemat...,rotten,Colin Covert,1,Minneapolis Star Tribune,2012-08-23
55,3,"Cosmopolis,"" because of its allegiance to the ...",rotten,Moira MacDonald,1,Seattle Times,2012-08-23
58,3,Poor Pattinson does the best he can. He's not ...,rotten,Mark Feeney,1,Boston Globe,2012-08-23
60,3,The story seems to cleave into cerebral disqui...,fresh,J. R. Jones,1,Chicago Reader,2012-08-23
62,3,The movie isn't for everyone. But if it grabs ...,fresh,Rene Rodriguez,1,Miami Herald,2012-08-23
64,3,A flawlessly directed film about enigmatic peo...,rotten,Roger Ebert,1,Chicago Sun-Times,2012-08-23
77,3,"Despite the constrictions, Cronenberg keeps th...",fresh,David Denby,1,New Yorker,2012-08-17


**MOVIE_REVIEW_DF** Analysis:

Top_Critics reviews = 4,713

In [773]:
movie_reviews_df['top_critic'].sum()

4713

Names of the top 20 critics:

In [774]:
movie_reviews_df['critic'].value_counts().head(20)

Owen Gleiberman       127
unknown                88
Stephen Whitty         74
Michael Phillips       73
Colin Covert           73
Peter Travers          69
Rafer Guzman           68
Joe Morgenstern        64
James Berardinelli     61
Mick LaSalle           60
Peter Howell           60
Bill Goodykoontz       59
Peter Rainer           55
Claudia Puig           55
Lisa Schwarzbaum       55
Ty Burr                54
Richard Roeper         51
Ann Hornaday           47
Steven Rea             47
Rex Reed               47
Name: critic, dtype: int64

Fresh vs. Rotten; 

- fresh = 60%
- Rotten = 40%

In [775]:
movie_reviews_df['fresh'].value_counts()

fresh     2818
rotten    1895
Name: fresh, dtype: int64

---

# SQL DATABASE

* Unzipping the '**im.db.zip**' file 
* Connecting to database using '**conn**'
* Printing all of the **table names** within the database


**Important note**: movie_basics & movie_ratings are the most relevant per instructions

In [None]:
# Unzip the database file
with zipfile.ZipFile('zippedData/im.db.zip', 'r') as zip_ref:
    zip_ref.extractall('zippedData')

# Connect to the unzipped SQLite database
conn = sqlite3.connect('zippedData/im.db')

# Run test query
q = """
SELECT tbl_name AS table_name, sql
FROM sqlite_master 
WHERE type='table'
ORDER BY name;
"""
pd.read_sql(q, conn)

In [None]:
# Movie Basics
q = '''
SELECT *
FROM movie_basics
WHERE original_title LIKE "Toy%"
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Moving Ratings
q = '''
SELECT *
FROM movie_ratings
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Directors
q = '''
SELECT *
FROM directors
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Known For
q = '''
SELECT *
FROM known_for
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Movie AKAs
q = '''
SELECT *
FROM movie_akas
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Persons
q = '''
SELECT *
FROM persons
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Pricipals
q = '''
SELECT *
FROM principals
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Writers
q = '''
SELECT *
FROM writers
LIMIT 3
;
'''
pd.read_sql(q, conn)

---

In [None]:
# joined file
q = '''
SELECT *
FROM movie_basics mb
JOIN directors dr USING(movie_id)
JOIN principals pr USING(movie_id)
JOIN persons p USING(person_id)
JOIN movie_ratings mr USING(movie_id)
WHERE death_year IS NULL
GROUP BY movie_id
ORDER BY start_year
;
'''
pd.read_sql(q, conn)

# Filtering