<a href="https://colab.research.google.com/github/krishna-gera/my-aiml-learning/blob/main/day-2/day2_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# create_day02_clean.py
# Run this from your project root (where you normally run your notebooks / scripts).

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import sys

def find_train_csv():
    search_paths = [
        Path("train.csv"),
        Path("data/train.csv"),
        Path("datasets/train.csv"),
        Path("input/train.csv"),
        Path("../train.csv"),
        Path("day02/train.csv")
    ]
    for p in search_paths:
        if p.exists():
            return p.resolve()
    # try a quick glob search under current folder
    for p in Path('.').rglob('train*.csv'):
        return p.resolve()
    return None

def safe_read_csv(path):
    try:
        return pd.read_csv(path)
    except UnicodeDecodeError:
        # fallback common encodings
        return pd.read_csv(path, encoding='latin1')
    except Exception as e:
        raise

def prepare_clean_files(df):
    # Ensure expected columns exist
    expected = {'PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked'}
    present = set(df.columns)
    missing = expected - present
    if missing:
        print(f"[Warning] Missing columns in train.csv: {missing}. Proceeding with columns that exist.")
    # 1) Basic repairs for numeric columns
    if 'Fare' in df.columns:
        df['Fare'] = pd.to_numeric(df['Fare'], errors='coerce')
        df['Fare'].fillna(df['Fare'].median(), inplace=True)
    if 'Age' in df.columns:
        df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
        df['Age'].fillna(df['Age'].median(), inplace=True)
    # 2) Embarked fill
    if 'Embarked' in df.columns:
        if df['Embarked'].isna().any():
            try:
                mode_emb = df['Embarked'].mode()[0]
            except Exception:
                mode_emb = 'S'
            df['Embarked'].fillna(mode_emb, inplace=True)
    # 3) Family features
    if 'SibSp' in df.columns and 'Parch' in df.columns:
        df['FamilySize'] = df['SibSp'].fillna(0).astype(int) + df['Parch'].fillna(0).astype(int) + 1
        df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    # 4) Sex to binary (keep original too)
    if 'Sex' in df.columns:
        df['Sex_bin'] = df['Sex'].map(lambda x: 1 if str(x).lower().startswith('m') else 0)
    # 5) Keep a minimal cleaned table for Day 2 usage
    minimal_cols = []
    for c in ['PassengerId','Survived','Pclass','Sex','Sex_bin','Age','SibSp','Parch','Fare','Embarked','FamilySize','IsAlone']:
        if c in df.columns:
            minimal_cols.append(c)
    df_min = df[minimal_cols].copy()
    # 6) Save both minimal cleaned file and a preserved copy with Name/Cabin/Ticket for later feature engineering
    return df_min, df

def main():
    print("=== Day 2 cleaning script starting ===")
    train_path = find_train_csv()
    if train_path is None:
        print("ERROR: Could not find train.csv. Please download Titanic train.csv from Kaggle and place it in project root or data/ folder.")
        print("Common locations checked: ./train.csv, ./data/train.csv, ./datasets/train.csv, ./input/train.csv")
        sys.exit(1)
    print(f"Found train file at: {train_path}")
    df_raw = safe_read_csv(train_path)
    print("Raw data shape:", df_raw.shape)
    # Prepare cleaned minimal + preserved raw copy
    df_min, df_preserved = prepare_clean_files(df_raw)
    # Ensure output folder exists
    out_dir = Path("day02")
    out_dir.mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    clean_path = out_dir / "day02_titanic_clean.csv"
    preserved_path = out_dir / "day02_titanic_preserved.csv"
    # Save CSVs
    df_min.to_csv(clean_path, index=False)
    df_preserved.to_csv(preserved_path, index=False)
    print(f"Saved cleaned (minimal) dataset -> {clean_path} (shape: {df_min.shape})")
    print(f"Saved preserved dataset -> {preserved_path} (shape: {df_preserved.shape})")
    # Quick sanity checks
    print("\n--- Quick checks on cleaned file ---")
    print("Columns:", list(df_min.columns))
    print("Missing values per column:\n", df_min.isna().sum())
    print("\nSample rows:\n", df_min.head().to_string(index=False))
    print("\nIf everything looks good, commit the files:\n  git add day02/*\n  git commit -m \"Day 2: Titanic cleaned dataset saved\"\n  git push")
    print("=== Done ===")

if __name__ == "__main__":
    main()


=== Day 2 cleaning script starting ===
Found train file at: /content/train.csv
Raw data shape: (891, 12)
Saved cleaned (minimal) dataset -> day02/day02_titanic_clean.csv (shape: (891, 12))
Saved preserved dataset -> day02/day02_titanic_preserved.csv (shape: (891, 15))

--- Quick checks on cleaned file ---
Columns: ['PassengerId', 'Survived', 'Pclass', 'Sex', 'Sex_bin', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize', 'IsAlone']
Missing values per column:
 PassengerId    0
Survived       0
Pclass         0
Sex            0
Sex_bin        0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
FamilySize     0
IsAlone        0
dtype: int64

Sample rows:
  PassengerId  Survived  Pclass    Sex  Sex_bin  Age  SibSp  Parch    Fare Embarked  FamilySize  IsAlone
           1         0       3   male        1 22.0      1      0  7.2500        S           2        0
           2         1       1 female        0 38.0      1      0 71.2833        C        

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Fare'].fillna(df['Fare'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett