# Data Cleaning

In [1]:
import pandas as pd
import ast

In [4]:
# Load data
netflix_path = "netflix_titles_.csv"

In [5]:
# Read CSV files
netflix_df = pd.read_csv(netflix_path)

# Netflix Titles Data Cleaning

In [6]:
# Copy dataset
netflix_clean = netflix_df.copy()

In [7]:
# Fill missing values with 'Unknown'
text_columns = ["description", "director", "cast", "country", "date_added", "rating", "duration"]
for col in text_columns:
    netflix_clean[col].fillna("Unknown", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  netflix_clean[col].fillna("Unknown", inplace=True)


In [8]:
# Convert 'date_added' to datetime format
netflix_clean["date_added"] = pd.to_datetime(netflix_clean["date_added"], errors='coerce')

In [9]:
# Remove duplicate rows
netflix_clean.drop_duplicates(inplace=True)


# Save Cleaned Data

In [10]:
# Save cleaned datasets to CSV
netflix_clean.to_csv("Cleaned_Netflix_Titles_Data.csv", index=False)


In [11]:
print("Data Cleaning Completed. Cleaned files saved.")

Data Cleaning Completed. Cleaned files saved.
