In [None]:
import pandas as pd
import numpy as np

# Demo of a small pandas cleaning pipeline: build a deliberately messy frame,
# normalise headers and values, coerce types, drop unusable rows and print the
# result. The cleaned frame is left in the module-level name `df`.
data = {
    " User Name ": ["Anurag", "  Ravi", None, "Neha  "],
    "AGE": ["23", "twenty", " 30", None],
    " Salary ": ["50000", "60000", None, " 55000 "],
    "Join Date": ["2022-01-10", "invalid_date", None, "2023-05-20"],
    "Email": ["anurag@gmail.com", None, "ravi@", "neha@gmail.com"]
}

# Minimal shape check: something@something.tld with no whitespace and a
# single "@". A bare "@" test would let values such as "ravi@" through.
EMAIL_PATTERN = r"[^@\s]+@[^@\s]+\.[^@\s]+"

df = pd.DataFrame(data)
print("üî¥ RAW DATASET:")
print(df)


# 1. Standardize column names: trim, lower-case, spaces -> underscores.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# 2. Trim whitespace inside the text values so that "  Ravi" and "Ravi" are
#    the same value for de-duplication and in the final output. A name that
#    is blank after trimming is treated as missing.
trimmed_names = df["user_name"].str.strip()
df["user_name"] = trimmed_names.where(trimmed_names != "")
df["email"] = df["email"].str.strip()

# 3. Remove duplicate rows.
df = df.drop_duplicates()

# 4. Names are mandatory: drop rows without one. The explicit copy keeps the
#    column assignments below from writing into a view of the original frame.
df = df.dropna(subset=["user_name"]).copy()

# 5. Convert numeric columns; unparseable values ("twenty") become NaN.
df["age"] = pd.to_numeric(df["age"], errors="coerce")
df["salary"] = pd.to_numeric(df["salary"], errors="coerce")

# 6. Fill missing salary with the median. Assign the result back instead of
#    calling fillna(inplace=True) on the selected column: that chained form is
#    deprecated and does not update `df` when Copy-on-Write is enabled.
df["salary"] = df["salary"].fillna(df["salary"].median())

# 7. Convert the date column; unparseable values become NaT.
df["join_date"] = pd.to_datetime(df["join_date"], errors="coerce")

# 8. Keep only rows whose email has a valid shape (missing emails are dropped).
df = df[df["email"].str.fullmatch(EMAIL_PATTERN, na=False)].copy()

# 9. Fill missing age with the median of the rows that survived filtering.
df["age"] = df["age"].fillna(df["age"].median())

# 10. Final cleanup: drop rows that still have a missing value in any column
#     (for example an unparseable join date).
df = df.dropna()

print("\nüü¢ CLEANED DATASET (PRODUCTION READY):")
print(df)

üî¥ RAW DATASET:
   User Name      AGE  Salary      Join Date             Email
0      Anurag      23    50000    2022-01-10  anurag@gmail.com
1        Ravi  twenty    60000  invalid_date              None
2        None      30     None          None             ravi@
3      Neha      None   55000     2023-05-20    neha@gmail.com

üü¢ CLEANED DATASET (PRODUCTION READY):
  user_name   age  salary  join_date             email
0    Anurag  23.0   50000 2022-01-10  anurag@gmail.com
3    Neha    23.0   55000 2023-05-20    neha@gmail.com
