In [1]:
# 📌 Task 1: Data Cleaning and Preprocessing (Netflix Dataset)

# STEP 1: Import required libraries
import pandas as pd
import numpy as np
import os

# Name of the raw dataset, expected in the current working directory.
file_name = "netflix_titles.csv"

# Text columns whose missing values are replaced by the "Unknown" placeholder.
# date_added is deliberately NOT listed: it is converted to datetime below,
# where a placeholder string would just be coerced back to NaT.
FILL_UNKNOWN_COLUMNS = ("director", "cast", "country", "rating")


def clean_netflix_data(raw_df):
    """Return a cleaned copy of the raw Netflix titles table.

    The steps are applied in this order:

    1. Standardize column names (strip, lower-case, spaces -> underscores).
       This runs first so that every later step can address columns by
       their standardized name.
    2. Drop rows without a ``title``.
    3. Replace missing values in ``FILL_UNKNOWN_COLUMNS`` with ``"Unknown"``.
    4. Drop fully duplicated rows.
    5. Strip and title-case ``country``.
    6. Parse ``date_added`` to datetime (whitespace stripped first; missing
       or unparseable values become ``NaT``) and cast ``release_year`` to int.

    Args:
        raw_df: DataFrame as loaded from the CSV. It is not modified.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: if ``title``, ``country``, ``date_added`` or
            ``release_year`` is absent after header standardization.
        ValueError: if ``release_year`` holds values that cannot be cast to
            int (for example missing values).
    """
    cleaned = raw_df.copy()

    # Standardize the header before touching any column by name.
    cleaned.columns = cleaned.columns.str.strip().str.lower().str.replace(" ", "_")

    # Handle missing values: title is essential, the text columns get a
    # placeholder. fillna with a dict returns a new frame, so no column is
    # assigned on a filtered frame.
    cleaned = cleaned.dropna(subset=["title"])
    cleaned = cleaned.fillna({col: "Unknown" for col in FILL_UNKNOWN_COLUMNS})

    # Remove duplicate records.
    cleaned = cleaned.drop_duplicates()

    # Standardize text values.
    cleaned["country"] = cleaned["country"].str.strip().str.title()

    # Fix data formats. Surrounding whitespace is stripped before parsing so
    # that padded values are not rejected by the parser and coerced to NaT.
    # astype(object) keeps the .str accessor usable even if the column
    # contains no strings at all.
    date_text = cleaned["date_added"].astype(object).str.strip()
    cleaned["date_added"] = pd.to_datetime(date_text, errors="coerce")
    cleaned["release_year"] = cleaned["release_year"].astype(int)

    return cleaned


# The I/O part of the task only runs when this code is executed directly
# (a notebook cell or a script both run as "__main__"); importing it just
# makes clean_netflix_data() available. Names assigned below stay global.
if __name__ == "__main__":
    # STEP 2: Check working directory
    print("📂 Current working directory:", os.getcwd())
    print("📂 Files in this directory:", os.listdir())

    # STEP 3: Verify dataset file exists before loading
    # isfile() also rejects a directory that happens to carry the same name.
    if not os.path.isfile(file_name):
        raise FileNotFoundError(
            f"❌ {file_name} not found! Please make sure it is in the same directory as this notebook.\n"
            f"👉 Current directory: {os.getcwd()}"
        )
    print(f"✅ Found {file_name} in the directory.")
    df = pd.read_csv(file_name)

    # Remember the shape actually loaded so the summary reports real numbers.
    initial_shape = df.shape

    # STEP 4: Explore dataset
    print("\nShape of dataset:", df.shape)
    print("\nColumns:", df.columns)
    print("\nInfo:")
    df.info()  # prints its own report and returns None, so it is not wrapped in print()
    print("\nMissing values per column:")
    print(df.isnull().sum())

    # STEPS 5-8: Handle missing values, remove duplicates, standardize text,
    # fix data formats
    df = clean_netflix_data(df)

    # STEP 9: Final check
    print("\n✅ Cleaned Dataset Info:")
    df.info()
    print("\nSample Data:")
    print(df.head())

    # STEP 10: Save cleaned dataset
    df.to_csv("netflix_titles_cleaned.csv", index=False)
    print("\n💾 Cleaned dataset saved as netflix_titles_cleaned.csv")

    # STEP 11: Summary of changes
    summary = {
        "initial_shape": initial_shape,
        "final_shape": df.shape,
        "missing_values_handled": (
            "Replaced missing values in director, cast, country, rating with 'Unknown'; "
            "missing or unparseable date_added values are kept as NaT"
        ),
        "duplicates_removed": True,
        "column_names_standardized": True,
        "date_format_standardized": True,
        "data_types_fixed": True
    }
    print("\n📊 Summary of Changes:", summary)


📂 Current working directory: C:\Users\MSAF\Desktop
📂 Files in this directory: ['.ipynb_checkpoints', '0B2A7118.JPG', '0B2A7287.JPG', 'Code', 'contacts_to_import.txt', 'course_certificate javaprograming.pdf', 'dbms', 'Degree-Certificate.jpg', 'desktop.ini', 'Doc1.docx', 'DSCF8157.JPG', 'DSCF8162.JPG', 'DSCF8170.JPG', 'engagement list.xlsx', 'HCTAB.pdf', 'i', 'Internet Download Manager.lnk', 'kavya certificates', 'kavya Etg Resume.pdf', 'kavya list.xlsx', 'Kavya.p ETG format 1.pdf', 'Kavya.p ETG format.docx', 'Microsoft Edge.lnk', 'mom list.xlsx', 'my youtube channels.txt', 'Netflix_Data_Cleaning_Task1.ipynb', 'netflix_task1_clean.ipynb', 'netflix_titles.csv', 'netflix_titles_cleaned.csv', 'New Microsoft Word Document (2).docx', 'New Microsoft Word Document.docx', 'New Text Document (2).txt', 'New Text Document.txt', 'POSTER.docx', 'Recent Resume.pdf', 'resume Etg.pdf', 'shankaraiah', 'Shankaraiah - Chrome.lnk', 'signature day pics roll no wise', 'signature day pics roll no wise.zip', 'S