In [45]:
import pandas as pd
import numpy as np
import sqlite3
import zipfile

## List of Dataframes:

In [46]:
# Dataframes: read every compressed source file and print the columns that were loaded.

# Movie info (tab-separated). The columns named in movie_info_drop are removed right after loading.
movie_info_drop = ['box_office', 'currency', 'dvd_date', 'studio']
movie_info_df = pd.read_csv(
    'zippedData/rt.movie_info.tsv.gz',
    sep='\t',
    compression='gzip',
).drop(columns=movie_info_drop)
print(f'movie_info_df columns: \n{list(movie_info_df.columns)}\n')

# Movie gross figures (comma-separated).
movie_gross_df = pd.read_csv('zippedData/bom.movie_gross.csv.gz')
print(f'movie_gross_df columns: \n{list(movie_gross_df.columns)}\n')

# Movie budgets (comma-separated).
movie_budget_df = pd.read_csv('zippedData/tn.movie_budgets.csv.gz')
print(f'movie_budget_df columns: \n{list(movie_budget_df.columns)}\n')

# Reviews (tab-separated, latin-1 encoded).
# 'rating' is dropped: per the original author it is poorly formatted and missing for about 50% of rows.
movie_review_drop = ['rating']
movie_reviews_df = pd.read_csv(
    'zippedData/rt.reviews.tsv.gz',
    sep='\t',
    compression='gzip',
    encoding='latin-1',
).drop(columns=movie_review_drop)
print(f'movie_reviews_df columns: \n{list(movie_reviews_df.columns)}\n')

# TMDB movies (comma-separated).
tmbd_movie_df = pd.read_csv('zippedData/tmdb.movies.csv.gz')
print(f'tmbd_movie_df columns: \n{list(tmbd_movie_df.columns)}\n')

movie_info_df columns: 
['id', 'synopsis', 'rating', 'genre', 'director', 'writer', 'theater_date', 'runtime']

movie_gross_df columns: 
['title', 'studio', 'domestic_gross', 'foreign_gross', 'year']

movie_budget_df columns: 
['id', 'release_date', 'movie', 'production_budget', 'domestic_gross', 'worldwide_gross']

movie_reviews_df columns: 
['id', 'review', 'fresh', 'critic', 'top_critic', 'publisher', 'date']

tmbd_movie_df columns: 
['Unnamed: 0', 'genre_ids', 'id', 'original_language', 'original_title', 'popularity', 'release_date', 'title', 'vote_average', 'vote_count']



**Combined Dataframe**:

In [98]:
# Rename the budget table's 'movie' column to 'title' so all three tables share the join key.
movie_budget_df = movie_budget_df.rename(columns={'movie': 'title'})

# Left-join movie_gross_df onto tmbd_movie_df by title (every tmbd_movie_df row is kept).
# NOTE(review): titles are presumably not unique in these tables, in which case a
# title-only join duplicates rows - confirm and de-duplicate if needed.
main_df = pd.merge(tmbd_movie_df, movie_gross_df, on='title', how='left')

# Keep only the budget columns the combined frame needs. Merging this trimmed frame
# (rather than the full budget table) avoids the id / release_date / domestic_gross
# name collisions that would otherwise produce *_x / *_y suffixed columns.
main_budget_df = movie_budget_df[['title', 'production_budget', 'worldwide_gross']]

# Left-join the budget figures onto the combined frame by title.
main_df = pd.merge(main_df, main_budget_df, on='title', how='left')

# Drop the columns that are not used downstream.
column_to_drop = ['Unnamed: 0', 'year']
main_df = main_df.drop(columns=column_to_drop)




Cleaning & filtering **MAIN_DF**:

In [91]:
# Keep English-language titles released on or after 2010-01-01.
# release_date is compared as text here; the sample rows use YYYY-MM-DD, which sorts
# chronologically as a string - TODO confirm every row follows that format.
main_df = main_df.loc[
    lambda frame: (frame['original_language'] == 'en')
    & (frame['release_date'] >= '2010-01-01')
]







In [99]:
# Preview the first 20 rows of the combined frame.
main_df.head(20)


Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,studio,domestic_gross,foreign_gross,production_budget,worldwide_gross
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,,,,,
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,P/DW,217600000.0,277300000.0,"$165,000,000","$494,870,992"
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,Par.,312400000.0,311500000.0,"$170,000,000","$621,156,389"
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,,,,"$30,000,000","$364,545,516"
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,WB,292600000.0,535700000.0,"$160,000,000","$835,524,642"
5,"[12, 14, 10751]",32657,en,Percy Jackson & the Olympians: The Lightning T...,26.691,2010-02-11,Percy Jackson & the Olympians: The Lightning T...,6.1,4229,,,,"$95,000,000","$223,050,874"
6,"[28, 12, 14, 878]",19995,en,Avatar,26.526,2009-12-18,Avatar,7.4,18676,,,,"$425,000,000","$2,776,345,279"
7,"[16, 10751, 35]",10193,en,Toy Story 3,24.445,2010-06-17,Toy Story 3,7.7,8340,BV,415000000.0,652000000.0,"$200,000,000","$1,068,879,522"
8,"[16, 10751, 35]",20352,en,Despicable Me,23.673,2010-07-09,Despicable Me,7.2,10057,Uni.,251500000.0,291600000.0,"$69,000,000","$543,464,573"
9,"[16, 28, 35, 10751, 878]",38055,en,Megamind,22.855,2010-11-04,Megamind,6.8,3635,P/DW,148400000.0,173500000.0,"$130,000,000","$321,887,208"


In [101]:
# Show column dtypes and non-null counts for the combined frame.
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26606 entries, 0 to 26605
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   genre_ids          26606 non-null  object 
 1   id                 26606 non-null  int64  
 2   original_language  26606 non-null  object 
 3   original_title     26606 non-null  object 
 4   popularity         26606 non-null  float64
 5   release_date       26606 non-null  object 
 6   title              26606 non-null  object 
 7   vote_average       26606 non-null  float64
 8   vote_count         26606 non-null  int64  
 9   studio             2719 non-null   object 
 10  domestic_gross     2699 non-null   float64
 11  foreign_gross      1736 non-null   object 
 12  production_budget  2385 non-null   object 
 13  worldwide_gross    2385 non-null   object 
dtypes: float64(3), int64(2), object(9)
memory usage: 3.0+ MB


In [106]:
# Count missing values per column, most-missing first.
main_df.isna().sum().sort_values(ascending=False)


foreign_gross        24870
worldwide_gross      24221
production_budget    24221
domestic_gross       23907
studio               23887
vote_count               0
vote_average             0
title                    0
release_date             0
popularity               0
original_title           0
original_language        0
id                       0
genre_ids                0
dtype: int64

### movie_info_df: cleaned

In [None]:
# Show column dtypes and non-null counts for movie_info_df.
movie_info_df.info()

In [None]:
# Count missing values per column in movie_info_df, most-missing first.
movie_info_df.isna().sum().sort_values(ascending=False)

Cleaning & Filtering for **MOVIE_INFO_DF**:

In [None]:
# Drop the rows with no genre (8 rows per the original author's note; most other fields
# are missing for them too). .copy() yields an independent frame so the column
# assignments below do not operate on a slice of the previous frame.
movie_info_df = movie_info_df.dropna(subset=['genre']).copy()

# Fill missing free-text fields with 'unknown'. theater_date and runtime are left out on
# purpose: both are parsed below and rows where they are missing are dropped anyway, so
# an 'unknown' placeholder would only force those columns to hold mixed types.
columns_to_fill = ['synopsis', 'director', 'writer']
movie_info_df[columns_to_fill] = movie_info_df[columns_to_fill].fillna('unknown')

# Parse theater_date (formatted like 'Jan 1, 2010'); missing or unparseable values become NaT.
movie_info_df['theater_date'] = pd.to_datetime(movie_info_df['theater_date'], format='%b %d, %Y', errors='coerce')

# Keep releases from 2010 onward. NaT never satisfies the comparison, so rows without a
# usable date are removed here as well.
movie_info_df = movie_info_df[movie_info_df['theater_date'] >= pd.Timestamp('2010-01-01')].copy()

# Strip the ' minutes' suffix and convert runtime to a number. Missing runtimes stay NaN;
# any other non-numeric value raises, as the original int conversion did.
movie_info_df['runtime'] = pd.to_numeric(
    movie_info_df['runtime'].str.replace(' minutes', '', regex=False)
)

# Drop rows with no runtime (26 rows per the original author's note), store the value as
# an integer and rename the column to say what unit it holds.
movie_info_df = (
    movie_info_df
    .dropna(subset=['runtime'])
    .astype({'runtime': int})
    .rename(columns={'runtime': 'runtime_in_minutes'})
)


In [None]:
# Preview the first three rows of the cleaned movie_info_df.
movie_info_df.head(3)

**MOVIE_INFO_DF** Analysis:

In [None]:
# Number of titles per rating.
# Original author's finding: PG-13, R and NR together make up 81% of the titles
# (figure not re-derived here; this cell shows raw counts, not percentages).
movie_info_df['rating'].value_counts()

In [None]:
# Mean runtime in minutes.
# Original author's finding: the average movie length is just over 100 minutes.
movie_info_df['runtime_in_minutes'].mean()

In [None]:
# Top directors: the 20 most frequent values in the director column.
movie_info_df['director'].value_counts().head(20)

In [None]:
# Top writers: the 20 most frequent values in the writer column.
movie_info_df['writer'].value_counts().head(20)

In [None]:
# Top genres: the 20 most frequent values in the genre column.
# Original author's finding: the top entries seem to be a mix of Drama and Comedy.
movie_info_df['genre'].value_counts().head(20)

### movie_reviews_df: cleaned

In [None]:
# Show column dtypes and non-null counts for movie_reviews_df.
movie_reviews_df.info()

In [None]:
# Count missing values per column in movie_reviews_df, most-missing first.
movie_reviews_df.isna().sum().sort_values(ascending=False)

**MOVIE_REVIEWS_DF** Data cleaning:

In [None]:
# Keep only reviews written by top critics (top_critic == 1). .copy() yields an
# independent frame so the column assignments below do not operate on a slice of the
# unfiltered frame (which is what triggers pandas' SettingWithCopyWarning).
movie_reviews_df = movie_reviews_df[movie_reviews_df['top_critic'] == 1].copy()

# Fill missing 'publisher', 'critic' and 'review' values with 'unknown'.
fill_columns = ['publisher', 'critic', 'review']
movie_reviews_df[fill_columns] = movie_reviews_df[fill_columns].fillna('unknown')

# Parse the review date, then keep reviews dated 2010-01-01 or later.
# NOTE(review): no explicit format is passed, so pandas infers it - confirm the date
# strings in the source file are all in one format.
movie_reviews_df['date'] = pd.to_datetime(movie_reviews_df['date'])
movie_reviews_df = movie_reviews_df[movie_reviews_df['date'] >= pd.Timestamp('2010-01-01')]





In [None]:
# Preview the first 20 rows of the cleaned movie_reviews_df.
movie_reviews_df.head(20)

**MOVIE_REVIEWS_DF** Analysis:

Top_Critics reviews = 4,713

In [None]:
# Sum of the top_critic flag. Once the cleaning cell has kept only top_critic == 1 rows,
# this equals the number of remaining reviews.
movie_reviews_df['top_critic'].sum()

Names of the top 20 critics:

In [None]:
# The 20 critic names with the most reviews (includes the 'unknown' placeholder if present).
movie_reviews_df['critic'].value_counts().head(20)

Fresh vs. Rotten:

- Fresh = 60%
- Rotten = 40%

In [None]:
# Number of reviews per value of the fresh column.
movie_reviews_df['fresh'].value_counts()

---

# SQL DATABASE

* Unzipping the '**im.db.zip**' file 
* Connecting to database using '**conn**'
* Printing all of the **table names** within the database


**Important note**: movie_basics & movie_ratings are the most relevant per instructions

In [None]:
# Extract the contents of the zipped database archive into the zippedData folder.
with zipfile.ZipFile('zippedData/im.db.zip', mode='r') as archive:
    archive.extractall(path='zippedData')

# Open a connection to the extracted SQLite database file.
conn = sqlite3.connect('zippedData/im.db')

# List every table recorded in sqlite_master together with its sql column, ordered by name.
q = """
SELECT tbl_name AS table_name, sql
FROM sqlite_master 
WHERE type='table'
ORDER BY name;
"""
pd.read_sql(sql=q, con=conn)

In [None]:
# Movie Basics: preview up to three movie_basics rows whose original_title starts with "Toy".
# The pattern is written as a single-quoted SQL string literal: in SQL, double quotes
# denote identifiers, and SQLite only treats them as strings as a legacy fallback.
q = '''
SELECT *
FROM movie_basics
WHERE original_title LIKE 'Toy%'
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Movie Ratings: preview up to three rows of the movie_ratings table.
q = '''
SELECT *
FROM movie_ratings
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Directors: preview up to three rows of the directors table.
q = '''
SELECT *
FROM directors
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Known For: preview up to three rows of the known_for table.
q = '''
SELECT *
FROM known_for
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Movie AKAs: preview up to three rows of the movie_akas table.
q = '''
SELECT *
FROM movie_akas
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Persons: preview up to three rows of the persons table.
q = '''
SELECT *
FROM persons
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Principals: preview up to three rows of the principals table.
q = '''
SELECT *
FROM principals
LIMIT 3
;
'''
pd.read_sql(q, conn)

In [None]:
# Writers: preview up to three rows of the writers table.
q = '''
SELECT *
FROM writers
LIMIT 3
;
'''
pd.read_sql(q, conn)

---

In [None]:
# Joined view: movie_basics joined to directors, principals, persons and movie_ratings,
# restricted to rows where death_year IS NULL, grouped to one row per movie_id and
# ordered by start_year.
# NOTE(review): `SELECT *` with `GROUP BY movie_id` and no aggregate functions makes
# SQLite take the non-grouped column values from an arbitrary row of each group, so
# which director / principal / person is shown for a movie is not well defined -
# confirm this is intended, or select explicit columns with aggregates.
# NOTE(review): person_id presumably exists in both directors and principals, so
# `USING(person_id)` may not join persons against the table you expect - verify.
# NOTE(review): there is no LIMIT, so the whole joined result is loaded into memory.
q = '''
SELECT *
FROM movie_basics mb
JOIN directors dr USING(movie_id)
JOIN principals pr USING(movie_id)
JOIN persons p USING(person_id)
JOIN movie_ratings mr USING(movie_id)
WHERE death_year IS NULL
GROUP BY movie_id
ORDER BY start_year
;
'''
pd.read_sql(q, conn)

# Filtering