# Data Cleaning for tmdb.movies.csv

## Import libraries

In [1]:
import pandas as pd
import numpy as np

## Import dataset

In [2]:
# Load the raw TMDB movie listing from the project's data folder
tmdb_movie = pd.read_csv(filepath_or_buffer='data/tmdb.movies.csv')

## Preview general information about the dataset

In [3]:
# Dimensions of the dataset as (rows, columns)
print(tmdb_movie.shape)
# Number of missing (NaN) entries in each column
print(tmdb_movie.isnull().sum())
# Show the first five rows of the dataframe
tmdb_movie.head(n=5)

(26517, 10)
Unnamed: 0           0
genre_ids            0
id                   0
original_language    0
original_title       0
popularity           0
release_date         0
title                0
vote_average         0
vote_count           0
dtype: int64


Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


## Data cleaning process

In [4]:
# Drop the leftover 'Unnamed: 0' column (presumably a row index that was saved
# into the CSV) and use the 'id' column as the dataframe index.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally (`drop('Unnamed: 0', 1)`) raises a TypeError on pandas 2.0+.
# Chaining also avoids `inplace=True`, so the cell is a single reassignment.
tmdb_movie = tmdb_movie.drop(columns='Unnamed: 0').set_index('id')
tmdb_movie.head()

Unnamed: 0_level_0,genre_ids,original_language,original_title,popularity,release_date,title,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12444,"[12, 14, 10751]",en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
10191,"[14, 12, 16, 10751]",en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
10138,"[12, 28, 878]",en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
862,"[16, 35, 10751]",en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
27205,"[28, 878, 12]",en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [5]:
# Parse the 'release_date' strings (object dtype) into pandas datetime values
tmdb_movie['release_date'] = tmdb_movie['release_date'].pipe(pd.to_datetime)
# Confirm the column now has a datetime dtype
print(tmdb_movie.dtypes['release_date'])

datetime64[ns]


In [6]:
# Discard the columns that are not used later in this notebook
tmdb_movie = tmdb_movie.drop(columns=['genre_ids', 'original_language']).copy()

In [7]:
# Keep only recent movies: released from 2000 through 2018, both years inclusive.
# Filtering on the release year keeps films dated exactly 2000-01-01, which the
# previous strict `> '2000-01-01'` comparison silently excluded.
# Relies on 'release_date' already being a datetime column (converted above).
# `.copy()` gives an independent frame so new columns can be added to it later
# without a SettingWithCopyWarning.
movies_from_2000_2018 = tmdb_movie[
    tmdb_movie['release_date'].dt.year.between(2000, 2018)
].copy()
movies_from_2000_2018

Unnamed: 0_level_0,original_title,popularity,release_date,title,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12444,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
10191,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
10138,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
27205,Inception,27.920,2010-07-16,Inception,8.3,22186
32657,Percy Jackson & the Olympians: The Lightning T...,26.691,2010-02-11,Percy Jackson & the Olympians: The Lightning T...,6.1,4229
...,...,...,...,...,...,...
488143,Laboratory Conditions,0.600,2018-10-13,Laboratory Conditions,0.0,1
485975,_EXHIBIT_84xxx_,0.600,2018-05-01,_EXHIBIT_84xxx_,0.0,1
381231,The Last One,0.600,2018-10-01,The Last One,0.0,1
366854,Trailer Made,0.600,2018-06-22,Trailer Made,0.0,1


In [8]:
# Re-check the number of missing values per column in the
# movies_from_2000_2018 dataframe after filtering
movies_from_2000_2018.isna().sum(axis='index')

original_title    0
popularity        0
release_date      0
title             0
vote_average      0
vote_count        0
dtype: int64

In [9]:
# Derive two new columns from 'release_date' via the .dt accessor:
#   'month'       - release month as a number (1-12)
#   'day_of_week' - name of the weekday the movie was released on
movies_from_2000_2018['month'] = movies_from_2000_2018['release_date'].dt.month
movies_from_2000_2018['day_of_week'] = (
    movies_from_2000_2018['release_date'].dt.day_name()
)
movies_from_2000_2018.head()

Unnamed: 0_level_0,original_title,popularity,release_date,title,vote_average,vote_count,month,day_of_week
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12444,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,11,Friday
10191,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,3,Friday
10138,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,5,Friday
27205,Inception,27.92,2010-07-16,Inception,8.3,22186,7,Friday
32657,Percy Jackson & the Olympians: The Lightning T...,26.691,2010-02-11,Percy Jackson & the Olympians: The Lightning T...,6.1,4229,2,Thursday


## Save the cleaned dataframe as a new CSV file

In [10]:
# Write the cleaned dataframe to disk; the 'id' index is kept as the first column
movies_from_2000_2018.to_csv('data/cleaned_tmdb_movie.csv', index=True)