In [3]:
import pandas as pd
import io
import os
import chardet
from google.colab import files

# Install chardet for encoding detection
!pip install -q chardet

# Upload CSV files manually
uploaded = files.upload()

# Output directory for cleaned files
output_dir = "cleaned_datasets"
os.makedirs(output_dir, exist_ok=True)

# Detect encoding of byte data
def detect_encoding(file_bytes):
    result = chardet.detect(file_bytes)
    return result['encoding']

# Cleaning functions
def clean_column_names(df):
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_")
        .str.replace(r"[^\w\s]", "", regex=True)
    )
    return df

def handle_missing_values(df):
    for col in df.columns:
        if df[col].isnull().sum() > 0:
            if df[col].dtype == 'object':
                df[col] = df[col].fillna(df[col].mode()[0])
            else:
                df[col] = df[col].fillna(df[col].median())
    return df

def standardize_text(df):
    text_cols = df.select_dtypes(include='object').columns
    for col in text_cols:
        df[col] = df[col].astype(str).str.strip().str.lower()
    return df

def convert_dates(df):
    for col in df.columns:
        if "date" in col or "day" in col:
            try:
                df[col] = pd.to_datetime(df[col], errors='coerce')
                df[col] = df[col].dt.strftime('%d-%m-%Y')
            except:
                pass
    return df

def fix_data_types(df):
    for col in df.columns:
        if "age" in col:
            try:
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
            except:
                pass
    return df

# Process each uploaded file
for filename, file in uploaded.items():
    print(f"\n🔄 Cleaning: {filename}")
    try:
        # Detect encoding and load file
        file_bytes = file
        encoding = detect_encoding(file_bytes)
        df = pd.read_csv(io.BytesIO(file_bytes), encoding=encoding)

        # Clean the data
        df = df.drop_duplicates()
        df = clean_column_names(df)
        df = handle_missing_values(df)
        df = standardize_text(df)
        df = convert_dates(df)
        df = fix_data_types(df)

        # Save cleaned file locally
        cleaned_filename = f"{os.path.splitext(filename)[0]}_cleaned.csv"
        cleaned_path = os.path.join(output_dir, cleaned_filename)
        df.to_csv(cleaned_path, index=False)

        print(f"✅ Saved cleaned file as: {cleaned_filename}")

    except Exception as e:
        print(f"❌ Error cleaning {filename}: {e}")


for file in os.listdir(output_dir):
    files.download(os.path.join(output_dir, file))


Saving netflix_titles.csv to netflix_titles (2).csv
Saving KaggleV2-May-2016.csv to KaggleV2-May-2016 (2).csv
Saving sales_data_sample.csv to sales_data_sample (2).csv
Saving Mall_Customers.csv to Mall_Customers (2).csv
Saving marketing_campaign.csv to marketing_campaign (2).csv

🔄 Cleaning: netflix_titles (2).csv


  df[col] = pd.to_datetime(df[col], errors='coerce')


✅ Saved cleaned file as: netflix_titles (2)_cleaned.csv

🔄 Cleaning: KaggleV2-May-2016 (2).csv
✅ Saved cleaned file as: KaggleV2-May-2016 (2)_cleaned.csv

🔄 Cleaning: sales_data_sample (2).csv
❌ Error cleaning sales_data_sample (2).csv: 'johab' codec can't decode byte 0x84 in position 5327: illegal multibyte sequence

🔄 Cleaning: Mall_Customers (2).csv
✅ Saved cleaned file as: Mall_Customers (2)_cleaned.csv

🔄 Cleaning: marketing_campaign (2).csv
✅ Saved cleaned file as: marketing_campaign (2)_cleaned.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>