In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
# Read the raw customer records into a DataFrame
df = pd.read_csv("customers-100.csv")

# First look at the data: dimensions, leading rows, schema and summary statistics
print(f"Dataset Shape: {df.shape}")

print("\nFirst 5 Records:")
print(df.head(5))

# DataFrame.info() writes its report straight to stdout, so it is not wrapped in print()
print("\nDataset Information:")
df.info()

# include="all" summarises non-numeric columns alongside numeric ones
print("\nStatistical Summary:")
print(df.describe(include="all"))
# Report exact duplicate rows in the raw data, before any column is touched
print("\nDuplicate rows before cleaning:", df.duplicated().sum())

# Drop irrelevant columns (errors="ignore" tolerates columns that are absent)
drop_cols = ["temp_id", "notes", "comments"]
df = df.drop(columns=drop_cols, errors="ignore")

# Rename columns for clarity
# NOTE(review): the captured output for this dataset shows headers such as
# "Customer Id", which none of these keys match -- confirm whether the headers
# need normalising before the "customer_id" checks can take effect.
df = df.rename(columns={
    "cust_id": "customer_id",
    "reg": "region",
    "dob": "date_of_birth",
})

# Remove duplicates only AFTER the irrelevant columns are gone: rows that
# differed solely in a dropped column (e.g. temp_id) are identical now and
# would otherwise survive and fail the duplicate check at the end of the script.
df = df.drop_duplicates()

# Check unique customer IDs (after the rename, so a "cust_id" column is seen too)
if "customer_id" in df.columns:
    print("Unique Customer IDs:", df["customer_id"].nunique())

# Reorder columns alphabetically
df = df.reindex(sorted(df.columns), axis=1)

print("\nColumns after management:")
print(df.columns)
# Check missing values
print("\nMissing values per column:")
print(df.isna().sum())

# Convert date columns to datetime.
# Done BEFORE imputation and categorical handling so these columns are no longer
# object-typed and are not mode-imputed, lowercased or cast to category later.
date_cols = ["signup_date", "date_of_birth"]
for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce")

# Fix numeric columns stored as strings (thousands separators removed).
# Also done before imputation so the column receives the numeric median
# instead of the most frequent string.
if "annual_income" in df.columns:
    df["annual_income"] = pd.to_numeric(
        df["annual_income"].astype(str).str.replace(",", "", regex=False),
        errors="coerce",
    )

# Drop rows missing the critical customer_id.
# This has to precede imputation: otherwise the missing IDs are filled with the
# most frequent ID and this step can never remove anything.
if "customer_id" in df.columns:
    df = df.dropna(subset=["customer_id"])

# Drop columns with >70% missing data.
# (dropna(thresh=0.7 * len(df)) would instead require 70% NON-missing values,
# i.e. drop every column with more than 30% missing.)
max_missing_fraction = 0.7
too_sparse = df.isna().mean() > max_missing_fraction
df = df.loc[:, ~too_sparse].copy()

# Identify numeric and categorical columns (after the type fixes above)
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
cat_cols = df.select_dtypes(include=["object"]).columns

# Numeric imputation (median).
# Only columns that actually contain gaps are passed to the imputer: it returns
# a float array, which would silently turn complete integer columns into floats,
# and it raises when given zero columns.
num_imputer = SimpleImputer(strategy="median")
num_with_gaps = [col for col in num_cols if df[col].isna().any()]
if num_with_gaps:
    df[num_with_gaps] = num_imputer.fit_transform(df[num_with_gaps])

# Categorical imputation (mode), restricted to columns with gaps for the same reasons
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_with_gaps = [col for col in cat_cols if df[col].isna().any()]
if cat_with_gaps:
    df[cat_with_gaps] = cat_imputer.fit_transform(df[cat_with_gaps])

# Normalize text fields (lowercase, surrounding whitespace removed)
# NOTE(review): this lowercases every text column, identifiers and names
# included -- confirm that is intended for case-sensitive IDs.
for col in cat_cols:
    df[col] = df[col].astype(str).str.lower().str.strip()

# Standardize gender values (text is already lowercase at this point)
if "gender" in df.columns:
    df["gender"] = df["gender"].replace({
        "m": "male",
        "f": "female",
    })

# Standardize region values
if "region" in df.columns:
    df["region"] = df["region"].replace({
        "north": "northern",
        "south": "southern",
        "east": "eastern",
        "west": "western",
    })

# Convert categorical columns to category type.
# Done last so the string normalisation and replacements above run on plain
# strings and the category dtype is not undone by a later astype(str).
for col in cat_cols:
    df[col] = df[col].astype("category")
# Ensure no duplicates remain; fail loudly rather than save a dataset with repeated rows
assert not df.duplicated().any(), "Duplicate rows still exist!"

# Check missing values
print("\nFinal Missing Values:")
print(df.isna().sum())

print("\nFinal Dataset Info:")
df.info()

# Save cleaned customer dataset (index=False keeps the row index out of the file)
output_path = "cleaned_customers_data.csv"
df.to_csv(output_path, index=False)

# The status icons were stored as mis-decoded bytes; restored to the intended emoji
print("\n✅ Customer dataset cleaned successfully!")
print(f"📁 File saved as: {output_path}")


Dataset Shape: (100, 12)

First 5 Records:
   Index      Customer Id First Name Last Name  \
0      1  DD37Cf93aecA6Dc     Sheryl    Baxter   
1      2  1Ef7b82A4CAAD10    Preston    Lozano   
2      3  6F94879bDAfE5a6        Roy     Berry   
3      4  5Cef8BFA16c5e3c      Linda     Olsen   
4      5  053d585Ab6b3159     Joanna    Bender   

                           Company               City  \
0                  Rasmussen Group       East Leonard   
1                      Vega-Gentry  East Jimmychester   
2                    Murillo-Perry      Isabelborough   
3  Dominguez, Mcmillan and Donovan         Bensonview   
4         Martin, Lang and Andrade     West Priscilla   

                      Country                 Phone 1                Phone 2  \
0                       Chile            229.077.5154       397.884.0519x718   
1                    Djibouti              5153435776       686-620-1820x944   
2         Antigua and Barbuda         +1-539-402-0259    (496)978-3969x58