# Movies Profitability Project

# 1. Preliminary Steps

In [6]:
# imports
import pandas as pd
import numpy as np

In [4]:
# IMDb dataset locations, published at https://datasets.imdbws.com/
# every table follows the same naming scheme: title.<table>.tsv.gz
basics_url, akas_url, ratings_url = (
    f'https://datasets.imdbws.com/title.{table}.tsv.gz'
    for table in ('basics', 'akas', 'ratings')
)

In [5]:
# read in files (tab-separated, gzip-compressed; pandas infers the
# compression from the .gz suffix of the URL)
# nulls are encoded as the literal string \N in these files; declaring it as
# an NA marker makes the parser emit real NaN values right away, so columns
# that are otherwise all-numeric get a numeric dtype instead of holding
# strings
basics = pd.read_csv(basics_url, sep='\t', na_values='\\N',
                     low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', na_values='\\N',
                   low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', na_values='\\N',
                      low_memory=False)

# 2. Cleaning

## 2.1 basics dataframe

In [None]:
# check out df information
# (prints the column names, dtypes, non-null counts and memory usage)
basics.info()

### 2.1.1 Replace \N with np.nan

In [8]:
# missing values arrive as the literal string \N;
# swap every such cell for a real NaN (basics is modified in place)
basics.replace('\\N', np.nan, inplace=True)

### 2.1.2 Drop nulls in 'runtimeMinutes' and 'genres' columns

In [11]:
# remove (in place) every row that has no 'runtimeMinutes' value
basics.dropna(axis=0, subset=['runtimeMinutes'], inplace=True)

# sanity check: the count of remaining nulls should be 0
basics['runtimeMinutes'].isnull().sum()

0

In [15]:
# remove (in place) every row that has no 'genres' value
basics.dropna(axis=0, subset=['genres'], inplace=True)

# sanity check: the count of remaining nulls should be 0
basics['genres'].isnull().sum()

0

### 2.1.3 Keep only 'movie' in 'titleType' column

In [18]:
# restrict the dataframe to rows whose 'titleType' is exactly 'movie'
basics = basics.loc[basics['titleType'].eq('movie')]

# sanity check: 'movie' should be the only remaining category
basics['titleType'].value_counts()

### 2.1.4 Keep only years from 2000-2022 in 'startYear' column

In [34]:
# check min and max year first
# (describe() only reports min/max when the column has a numeric dtype)
basics['startYear'].describe()

count    223440.000000
mean       2013.371411
std           5.852936
min        2000.000000
25%        2009.000000
50%        2014.000000
75%        2018.000000
max        2022.000000
Name: startYear, dtype: float64

In [29]:
# drop rows with a missing 'startYear'
# rebind the result instead of using inplace=True: basics was produced by
# boolean filtering earlier in the notebook, and in-place edits on such a
# frame can trigger pandas' SettingWithCopyWarning
basics = basics.dropna(subset=['startYear'])

# check: the count of remaining nulls should be 0
basics['startYear'].isna().sum()

0

In [36]:
# keep only 'startYear' from 2000 to 2022, including 2000 and 2022
# when the column was read as text, replacing \N with NaN still leaves the
# years as strings, and comparing strings with integers raises a TypeError;
# make the column numeric first (values that cannot be parsed become NaN and
# are excluded by the range filter below)
basics = basics.assign(
    startYear=pd.to_numeric(basics['startYear'], errors='coerce')
)
basics = basics[(basics['startYear'] >= 2000) &
                (basics['startYear'] <= 2022)]

# check: one row count per year, newest year first
basics['startYear'].value_counts().sort_index(ascending=False)

2022    12844
2021    12373
2020    11576
2019    14076
2018    14336
2017    14375
2016    13962
2015    13481
2014    13115
2013    12388
2012    11637
2011    10781
2010    10208
2009     9361
2008     8158
2007     6964
2006     6523
2005     5838
2004     5213
2003     4592
2002     4129
2001     3869
2000     3641
Name: startYear, dtype: int64

### 2.1.5 Exclude documentaries

In [37]:
# check out 'genres' column
# (counts rows per distinct genre string, most frequent first)
basics['genres'].value_counts()

Documentary                    53251
Drama                          36051
Comedy                         13456
Comedy,Drama                    6459
Horror                          5798
                               ...  
Documentary,Sci-Fi,Thriller        1
Comedy,History,Mystery             1
Crime,Documentary,Romance          1
Animation,Biography,Sport          1
Crime,Fantasy,Sci-Fi               1
Name: genres, Length: 1191, dtype: int64

In [38]:
# flag every movie whose genre string mentions 'documentary'
# (case-insensitive match), then keep only the unflagged rows
is_documentary = basics['genres'].str.contains('documentary', case=False)
basics = basics.loc[~is_documentary]

# sanity check: no remaining genre combination should include Documentary
basics['genres'].value_counts()

Drama                        36051
Comedy                       13456
Comedy,Drama                  6459
Horror                        5798
Drama,Romance                 4311
                             ...  
Animation,Biography,Sport        1
Adventure,History,Music          1
Adventure,History,War            1
Action,Animation,Romance         1
Crime,Fantasy,Sci-Fi             1
Name: genres, Length: 969, dtype: int64

### 2.1.6 Keep only US movies (from column in akas)

## 2.2 akas dataframe

In [None]:
# missing values arrive as the literal string \N;
# swap every such cell for a real NaN in both akas and ratings (in place)
akas.replace('\\N', np.nan, inplace=True)
ratings.replace('\\N', np.nan, inplace=True)

## 2.3 ratings dataframe

In [None]:
# check for duplicates?