In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.impute import SimpleImputer
# Load people dataset
df = pd.read_csv("people-100.csv")

# Initial checks
print("Dataset Shape:", df.shape)
print("\nFirst 5 Rows:")
print(df.head())

print("\nDataset Info:")
df.info()

print("\nStatistical Summary:")
print(df.describe(include="all"))
# Check duplicate rows
print("\nDuplicate rows:", df.duplicated().sum())

# Remove duplicate rows
df = df.drop_duplicates()

# Check duplicate person IDs if available
if "person_id" in df.columns:
    print("Unique person IDs:", df["person_id"].nunique())
# Drop irrelevant columns
drop_cols = ["temp_id", "notes", "comments"]
df.drop(columns=drop_cols, inplace=True, errors="ignore")

# Rename columns for clarity
df.rename(columns={
    "job": "job_title",
    "dob": "date_of_birth",
    "phone": "phone_number"
}, inplace=True)

# Reorder columns
df = df.reindex(sorted(df.columns), axis=1)

print("\nColumns after management:")
print(df.columns)
# Check missing values
print("\nMissing values per column:")
print(df.isna().sum())

# Drop columns with >70% missing data
threshold = 0.7 * len(df)
df = df.dropna(axis=1, thresh=threshold)
# Identify numeric & categorical columns
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
cat_cols = df.select_dtypes(include=["object"]).columns

# Impute numeric columns with median
num_imputer = SimpleImputer(strategy="median")
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Impute categorical columns with mode
cat_imputer = SimpleImputer(strategy="most_frequent")
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])
# Convert mixed-format date columns to datetime
date_cols = ["birth_date", "date_of_birth", "join_date"]
for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce", dayfirst=True)

# Convert age to numeric if stored as string
if "age" in df.columns:
    df["age"] = pd.to_numeric(df["age"], errors="coerce")
# Normalize job titles
if "job_title" in df.columns:
    df["job_title"] = (
        df["job_title"]
        .str.lower()
        .str.strip()
    )

    # Map inconsistent job titles
    job_map = {
        "s/w engineer": "software engineer",
        "software eng": "software engineer",
        "developer": "software engineer",
        "data sci": "data scientist",
        "data analyst": "data analyst",
        "analyst": "data analyst",
        "hr exec": "hr executive"
    }

    df["job_title"] = df["job_title"].replace(job_map)
def clean_phone(phone):
    if pd.isna(phone):
        return np.nan
    phone = re.sub(r"\D", "", str(phone))  # remove non-digits
    if len(phone) == 10:
        return phone
    else:
        return np.nan

if "phone_number" in df.columns:
    df["phone_number"] = df["phone_number"].apply(clean_phone)
# Normalize all categorical text columns
for col in df.select_dtypes(include="object").columns:
    df[col] = df[col].str.lower().str.strip()
# Ensure no duplicate rows
assert df.duplicated().sum() == 0, "Duplicates still exist!"

print("\nFinal Missing Values:")
print(df.isna().sum())

print("\nFinal Dataset Info:")
df.info()
# Save cleaned dataset
df.to_csv("cleaned_people_data.csv", index=False)

print("\n‚úÖ People dataset cleaned successfully!")
print("üìÅ Saved as: cleaned_people_data.csv")


Dataset Shape: (100, 9)

First 5 Rows:
   Index          User Id First Name Last Name     Sex  \
0      1  88F7B33d2bcf9f5     Shelby   Terrell    Male   
1      2  f90cD3E76f1A9b9    Phillip   Summers  Female   
2      3  DbeAb8CcdfeFC2c   Kristine    Travis    Male   
3      4  A31Bee3c201ef58    Yesenia  Martinez    Male   
4      5  1bA7A3dc874da3c       Lori      Todd    Male   

                        Email                   Phone Date of birth  \
0        elijah57@example.net  001-084-906-7849x73518    1945-10-26   
1       bethany14@example.com       214.112.6044x4913    1910-03-24   
2       bthompson@example.com            277.609.7938    1992-07-02   
3   kaitlinkaiser@example.com            584.094.6111    2017-08-03   
4  buchananmanuel@example.net       689-207-3558x7233    1938-12-01   

            Job Title  
0     Games developer  
1      Phytotherapist  
2           Homeopath  
3   Market researcher  
4  Veterinary surgeon  

Dataset Info:
<class 'pandas.core.frame.