In [1]:
# importing packages
import numpy as np
import pandas as pd
import sqlite3
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# importing zipped csv data
gross = pd.read_csv('Data/bom.movie_gross.csv.gz', compression = 'gzip')
info = pd.read_csv('Data/rt.movie_info.tsv.gz', compression = 'gzip', sep='\t')
reviews = pd.read_csv('Data/rt.reviews.tsv.gz', compression = 'gzip', sep='\t', encoding='unicode-escape')
tmdb = pd.read_csv('Data/tmdb.movies.csv.gz', compression = 'gzip')
budgets = pd.read_csv('Data/tn.movie_budgets.csv.gz', compression = 'gzip')

In [3]:
# importing database file
conn = sqlite3.connect('Data/im.db')
persons = pd.read_sql("""SELECT * FROM persons""", conn)
principals = pd.read_sql("""SELECT * FROM principals""", conn)
known_for = pd.read_sql("""SELECT * FROM known_for""", conn)
directors = pd.read_sql("""SELECT * FROM directors""", conn)
writers = known_for = pd.read_sql("""SELECT * FROM writers""", conn)
movie_basics = pd.read_sql("""SELECT * FROM movie_basics""", conn)
movie_ratings = pd.read_sql("""SELECT * FROM movie_ratings""", conn)
movie_akas = pd.read_sql("""SELECT * FROM movie_akas""", conn)

## 2. Cleaning

#### Gross

In [4]:
# making foreign gross a float
gross['foreign_gross'] = gross['foreign_gross'].replace('[,]', '', regex=True).astype('float64')
gross['foreign_gross'].fillna(0, inplace=True)

#### Info

In [5]:
# making dates datetime
info['theater_date'] = pd.to_datetime(info['theater_date'])
info['dvd_date'] = pd.to_datetime(info['dvd_date'])
# dropping currency, it is not needed
info.drop('currency',axis=1,inplace=True)

# making box_office a float
info['box_office'] = info['box_office'].str.replace(',', '')
info['box_office'] = info['box_office'].astype('float64')

# making runtaime a flaot
info['runtime'] = info['runtime'].str.split(' ').str[0]
info['runtime'] = info['runtime'].astype('float64')

#### Reviews

In [6]:
# changing fresh_rotten into a boolean
reviews['fresh'].replace('fresh', 1, inplace=True)
reviews['fresh'].replace('rotten', 0, inplace=True)
reviews['fresh'] = reviews['fresh'].astype(bool)

In [7]:
# making the date datetime
reviews['date'] = pd.to_datetime(reviews['date'])

In [8]:
# cleaning reviews with '/' and '.', pt. 1
# to do: make a function
reviews.loc[reviews['rating'].str.contains('/5')==True, ['rating_out_of_5']] = True
reviews.loc[reviews['rating'].str.contains('/5')==True, ['rating']] = reviews['rating'].str.split('/').str[0]
reviews.loc[reviews['rating'].str.contains('/6')==True, ['rating_out_of_6']] = True
reviews.loc[reviews['rating'].str.contains('/6')==True, ['rating']] = reviews['rating'].str.split('/').str[0]
reviews.loc[reviews['rating'].str.contains('/4')==True, ['rating_out_of_4']] = True
reviews.loc[reviews['rating'].str.contains('/4')==True, ['rating']] = reviews['rating'].str.split('/').str[0]
reviews.loc[reviews['rating'].str.contains('/10')==True, ['rating_out_of_10']] = True
reviews.loc[reviews['rating'].str.contains('/10')==True, ['rating']] = reviews['rating'].str.split('/').str[0]
reviews.loc[reviews['rating'].str.contains('.')==True, ['rating_out_of_10']] = True

In [9]:
# manually replacing odd values
# todo: make a dictionary
reviews['rating'].replace('A+', 1, inplace=True)
reviews['rating'].replace('A', 1, inplace=True)
reviews['rating'].replace('A-', .95, inplace=True)
reviews['rating'].replace('B+', .85, inplace=True)
#
reviews['rating'].replace('B', .8, inplace=True)
reviews['rating'].replace('B-', .75, inplace=True)
reviews['rating'].replace('C+', .65, inplace=True)
reviews['rating'].replace('C', .6, inplace=True)
reviews['rating'].replace('C-', .55, inplace=True)
reviews['rating'].replace('D+', .45, inplace=True)
reviews['rating'].replace('D', .4, inplace=True)
reviews['rating'].replace('D-', .35, inplace=True)
reviews['rating'].replace('F+', .25, inplace=True)
reviews['rating'].replace('F', .2, inplace=True)
reviews['rating'].replace('F-', .15, inplace=True)
reviews['rating'].replace(['2.1/2','R','N','T','1-5','3/2'], 'NaN', inplace=True)
reviews['rating'].replace('3 1/2', .7, inplace=True)
reviews['rating'].replace('6/8', .75, inplace=True)
reviews['rating'].replace('2/2', 1, inplace=True)
reviews['rating'].replace('1/2', .5, inplace=True)

In [10]:
# cleaning reviews with '/' and '.', pt. 2
reviews.loc[reviews['rating_out_of_4']==True, ['rating']] = reviews['rating'].astype('float64') / 4
reviews.loc[reviews['rating_out_of_5']==True, ['rating']] = reviews['rating'].astype('float64') / 5
reviews.loc[reviews['rating_out_of_6']==True, ['rating']] = reviews['rating'].astype('float64') / 6
reviews.loc[reviews['rating_out_of_10']==True, ['rating']] = reviews['rating'].astype('float64') / 10
reviews.drop(['rating_out_of_4','rating_out_of_5','rating_out_of_6','rating_out_of_10'],axis=1,inplace=True)

In [11]:
# making rating a float
reviews['rating'] = reviews['rating'].astype('float64')

#### tmdb

In [12]:
# getting rid of unnecessary column
tmdb.drop('Unnamed: 0', axis=1, inplace=True)

In [13]:
# making release date datetime
tmdb['release_date'] = pd.to_datetime(tmdb['release_date'])

In [14]:
#same units for rating 
tmdb['vote_average'] = tmdb['vote_average']/10

#### Budgets

In [15]:
# making release_date datetime
budgets['release_date'] = pd.to_datetime(budgets['release_date'])

In [16]:
# removing '$' and ',' from money variables
budgets['production_budget'] = budgets['production_budget'].replace('[$,]', '', regex=True).astype(int)
budgets['domestic_gross'] = budgets['domestic_gross'].replace('[$,]', '', regex=True).astype(int)
budgets['worldwide_gross'] = budgets['worldwide_gross'].replace('[$,]', '', regex=True).astype('int64')

#### Movie Basics

In [17]:
#making the id an int
movie_basics['movie_id'] = movie_basics['movie_id'].str.split('tt').str[1]
movie_basics['movie_id'] = movie_basics['movie_id'].astype(int)

#### Movie Ratings

In [18]:
#making the id an int
movie_ratings['movie_id'] = movie_ratings['movie_id'].str.split('tt').str[1]
movie_ratings['movie_id'] = movie_ratings['movie_id'].astype(int)

## 3. Making Useful Dataframes

#### Movie Genres

In [19]:
# Creating a list of all genres
#info
info_genres = list(info['genre'].str.split('|'))
info_genres = [x for x in info_genres if str(x) != 'nan']
info_genres = [item for sublist in info_genres for item in sublist]
info_genres = list(set(info_genres))

#movie_basics
movie_basics_genres = list(movie_basics['genres'].str.split(','))
movie_basics_genres = [x for x in movie_basics_genres if str(x) != 'None']
movie_basics_genres = [item for sublist in movie_basics_genres for item in sublist]
movie_basics_genres = list(set(movie_basics_genres))

#concat
genres = list(set(movie_basics_genres + info_genres))

In [20]:
#matching values between the two lists
genres_dict = {
    'Reality-TV':'TV Show',
    'Art House and International':'Art',
    'Romance':'Romance',
    'Comedy':'Comedy',
    'Music':'Music',
    'Special Interest':'Special Interest',
    'Adult':'Adult',
    'Game-Show':'TV Show',
    'Mystery':'Mystery and Suspense',
    'Fantasy':'Science Fiction and Fantasy',
    'Family':'Kids and Family',
    'Crime':'Mystery and Suspense',
    'Sport':'Sports and Fitness',
    'Short':'Shorts',
    'Television':'TV Show',
    'Sports and Fitness':'Sports and Fitness',
    'Mystery and Suspense':'Mystery and Suspense',
    'Action':'Action and Adventure',
    'Drama':'Drama',
    'Horror':'Horror',
    'Sci-Fi':'Science Fiction and Fantasy',
    'Talk-Show':'TV Show',
    'Action and Adventure':'Action and Adventure',
    'Musical':'Music',
    'Documentary':'Documentary',
    'Adventure':'Action and Adventure',
    'Faith and Spirituality':'Faith and Spirituality',
    'Anime and Manga':'Animation',
    'Kids and Family':'Kids and Family',
    'War':'Action and Adventure',
    'News':'TV Show',
    'Classics':'Classic',
    'Animation':'Animation',
    'Thriller':'Mystery and Suspense',
    'Gay and Lesbian':'LGBTQ+',
    'Cult Movies':'Classic',
    'Science Fiction and Fantasy':'Science Fiction and Fantasy',
    'Musical and Performing Arts':'Music',
    'Western':'Western',
    'History':'History',
    'Biography':'Biography',
    28:'Action and Adventure',
    12:'Action and Adventure',
    16:'Animation',
    35:'Comedy',
    80:'Mystery and Suspense',
    99:'Documentary',
    18:'Drama',
    10751:'Kids and Family',
    14:'Science Fiction and Fantasy',
    36:'History',
    27:'Horror',
    10402:'Music',
    9648 :'Mystery',
    10749:'Romance',
    878:'Science Fiction and Fantasy',
    10770:'TV Show',
    53:'Mystery and Suspense',
    10752:'Action and Adventure',
    37:'Western'
    }

In [21]:
movie_genres_info = info[['id','genre']] 
movie_genres_info = movie_genres_info.set_index('id').apply(lambda x: x.str.split('|').explode()).reset_index()
movie_genres_info['genre'] = movie_genres_info['genre'].map(genres_dict)
movie_genres_info.drop_duplicates(inplace = True)
movie_genres_info['table_match'] = 'info'
movie_genres_info.rename(columns= {'id':'movie_id'},inplace = True)
#movie_genres_info

In [22]:
movie_genres_imdb = movie_basics[['movie_id','genres']] 
movie_genres_imdb = movie_genres_imdb.set_index('movie_id').apply(lambda x: x.str.split('|').explode()).reset_index()
movie_genres_imdb['genres'] = movie_genres_imdb['genres'].map(genres_dict)
movie_genres_imdb.drop_duplicates(inplace = True)
movie_genres_imdb['table_match'] = 'movie_basics'
movie_genres_imdb.rename(columns= {'genres':'genre'},inplace = True)
#movie_genres_imdb

In [23]:
import ast
movie_genres_tmdb = tmdb[['id','genre_ids']]
movie_genres_tmdb = movie_genres_tmdb.set_index('id')
movie_genres_tmdb = movie_genres_tmdb['genre_ids'].map(lambda x: ast.literal_eval(x)).explode().to_frame().reset_index()
movie_genres_tmdb['genre_ids'] = movie_genres_tmdb['genre_ids'].map(genres_dict)
movie_genres_tmdb.drop_duplicates(inplace = True)
movie_genres_tmdb['table_match'] = 'tmdb'
movie_genres_tmdb.rename(columns = {'id':'movie_id','genre_ids':'genre'}, inplace = True)
#movie_genres_tmdb

In [24]:
movie_genres = pd.concat([movie_genres_imdb,movie_genres_info,movie_genres_tmdb])
movie_genres

Unnamed: 0,movie_id,genre,table_match
0,63540,,movie_basics
1,66787,,movie_basics
2,69049,Drama,movie_basics
3,69204,,movie_basics
4,100275,,movie_basics
...,...,...,...
47827,381231,Action and Adventure,tmdb
47829,366854,Kids and Family,tmdb
47830,366854,Action and Adventure,tmdb
47832,309885,Mystery and Suspense,tmdb


In [25]:
#removing old genre columns
info.drop('genre', axis=1, inplace=True)
movie_basics.drop('genres', axis=1, inplace=True)
tmdb.drop('genre_ids', axis=1, inplace=True)

## 3. Storing

In [26]:
%store gross
%store info
%store reviews
%store tmdb
%store budgets
%store movie_basics
%store movie_ratings
%store movie_genres

Stored 'gross' (DataFrame)
Stored 'info' (DataFrame)
Stored 'reviews' (DataFrame)
Stored 'tmdb' (DataFrame)
Stored 'budgets' (DataFrame)
Stored 'movie_basics' (DataFrame)
Stored 'movie_ratings' (DataFrame)
Stored 'movie_genres' (DataFrame)
