In [1]:
# Mount Google Drive to access stored files
# Useful when working on Google Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Step-by-Step Netflix Data Cleaning with Python Pandas

In this notebook, I will perform data cleaning on the Netflix dataset. The goal is to preprocess the data by handling missing values, removing unnecessary columns, standardizing column names, and converting data types where necessary.

This step is crucial to ensure the dataset is clean and ready for further analysis and visualization.

**Database:**

[Netflix Movies and TV Shows](https://www.kaggle.com/datasets/shivamb/netflix-shows)


# 1. Import the libraries

In [2]:
import pandas as pd

# 2. Load Dataset

In [19]:
# Load the Netflix dataset
data = pd.read_csv('/content/drive/MyDrive/Análise de Dados/DATA CLEANING/Netflix/netflix_titles.csv')

# 3. Preliminary dataset exploration

In [8]:
# Display the first few rows of the dataset for verification
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [13]:
# Show general information about the dataset, such as data types and null values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [12]:
# Display all available columns in the dataset
data.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

# 4. Removing unnecessary columns

In [20]:
# Define the columns to keep in the analysis
columns_to_keep = ['show_id', 'type', 'title','country', 'date_added',
                   'release_year', 'rating', 'duration']
# Define the columns to be removed as they will not be used in the analysis
columns_to_drop = ['director','cast','listed_in', 'description']

In [21]:
# Remove unnecessary columns from the dataset
data.drop(columns=columns_to_drop, inplace=True)

In [22]:
data

Unnamed: 0,show_id,type,title,country,date_added,release_year,rating,duration
0,s1,Movie,Dick Johnson Is Dead,United States,"September 25, 2021",2020,PG-13,90 min
1,s2,TV Show,Blood & Water,South Africa,"September 24, 2021",2021,TV-MA,2 Seasons
2,s3,TV Show,Ganglands,,"September 24, 2021",2021,TV-MA,1 Season
3,s4,TV Show,Jailbirds New Orleans,,"September 24, 2021",2021,TV-MA,1 Season
4,s5,TV Show,Kota Factory,India,"September 24, 2021",2021,TV-MA,2 Seasons
...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,United States,"November 20, 2019",2007,R,158 min
8803,s8804,TV Show,Zombie Dumb,,"July 1, 2019",2018,TV-Y7,2 Seasons
8804,s8805,Movie,Zombieland,United States,"November 1, 2019",2009,R,88 min
8805,s8806,Movie,Zoom,United States,"January 11, 2020",2006,PG,88 min


# 5. Updating column names

In [23]:
# Standardize column names to start with an uppercase letter
new_columns_names = []

for i in data.columns:
  new_columns_names.append(i.capitalize())

In [24]:
# Display the first 3 rows after standardizing column names
data.columns = new_columns_names

In [25]:
data.head(3)

Unnamed: 0,Show_id,Type,Title,Country,Date_added,Release_year,Rating,Duration
0,s1,Movie,Dick Johnson Is Dead,United States,"September 25, 2021",2020,PG-13,90 min
1,s2,TV Show,Blood & Water,South Africa,"September 24, 2021",2021,TV-MA,2 Seasons
2,s3,TV Show,Ganglands,,"September 24, 2021",2021,TV-MA,1 Season


# 6. Cleaning specific columns

In [46]:
# Fill null values in the 'Country' column with 'Unspecified'
data['Country'] = data['Country'].fillna('Unspecified')

In [47]:
# Display dataset information after filling null values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Show_id       8807 non-null   object
 1   Type          8807 non-null   object
 2   Title         8807 non-null   object
 3   Country       8807 non-null   object
 4   Date_added    8797 non-null   object
 5   Release_year  8807 non-null   int64 
 6   Rating        8803 non-null   object
 7   Duration      8804 non-null   object
dtypes: int64(1), object(7)
memory usage: 550.6+ KB


# 7. Dropping NaN values

In [48]:
# Check the number of remaining null values in the dataset
data.isna().sum()

Unnamed: 0,0
Show_id,0
Type,0
Title,0
Country,0
Date_added,10
Release_year,0
Rating,4
Duration,3


In [51]:
# Remove any rows that still contain null values
data.dropna(inplace=True)

In [63]:
# Display dataset information after removing null values
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8790 entries, 0 to 8806
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Show_id       8790 non-null   object        
 1   Type          8790 non-null   object        
 2   Title         8790 non-null   object        
 3   Country       8790 non-null   object        
 4   Date_added    8702 non-null   datetime64[ns]
 5   Release_year  8790 non-null   int64         
 6   Rating        8790 non-null   object        
 7   Duration      8790 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 618.0+ KB


# 8. Explore further data transformations

In [62]:
# Convert the 'Date_added' column to datetime format, handling possible errors
data['Date_added'] = pd.to_datetime(data['Date_added'], format='%B %d, %Y', errors='coerce')

# 9. Export Dataset

In [68]:
# Export the cleaned dataset to a new CSV file
# Make sure the path is correct when running locally
data.to_csv('/content/drive/MyDrive/Análise de Dados/DATA CLEANING/Netflix/Cleaned_data.csv', index=False)