<a href="https://colab.research.google.com/github/indracharan-png/Titanic-Machine-Learning-from-Disaster-kaggle-/blob/main/data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
import pandas as pd
import regex as re
import numpy as np
import random




def preprocess_Data(df, seed=42):
    """Clean and feature-engineer a Titanic-style passenger DataFrame.

    The work is done on a copy, so the caller's DataFrame (including its
    column names) is left untouched.

    Steps, in order:
      1. Rename columns from camelCase to snake_case ('SibSp' -> 'sib_sp').
      2. Impute missing 'age' values by sampling, with replacement, from the
         observed ages of passengers sharing the same ('pclass', 'sex').
      3. Fill missing 'cabin' with 'Unknown' and derive 'deck' from its
         first character ('U' for 'Unknown').
      4. Derive 'title' from 'name'; every title other than mr / mrs / miss /
         master becomes 'rare', and names without a title become 'Unknown'.
      5. Derive 'family_size' ('sib_sp' + 'parch' + 1) and 'is_alone'.
      6. Derive 'ticket_prefix', the first alphanumeric token of 'ticket'
         ('UNK' when the ticket is missing or has no such token).

    Args:
        df: DataFrame whose columns, once snake_cased, include 'pclass',
            'sex', 'age', 'cabin', 'name', 'sib_sp', 'parch' and 'ticket'.
        seed: Seed for the random generator used by the age imputation.
            The same seed and input give the same imputed ages; pass None
            for non-reproducible sampling.

    Returns:
        A new DataFrame with the renamed columns, the imputed 'age' column
        and the added 'deck', 'title', 'family_size', 'is_alone' and
        'ticket_prefix' columns.
    """
    # Copy *before* renaming so the new column names do not leak back into
    # the caller's DataFrame.
    df = df.copy()

    # Convert the column names from camelCase to snake_case
    df.columns = [re.sub(r'(?<!^)(?=[A-Z])', '_', col_name).lower() for col_name in df.columns]

    # One generator for the whole imputation: this is the generator that is
    # actually sampled from, so seeding it makes the result reproducible.
    rng = np.random.default_rng(seed)

    # Impute the 'age' feature using stratified random sampling based on the
    # 'pclass' and 'sex' features. Rows whose 'pclass' or 'sex' is missing do
    # not belong to any group (groupby drops NaN keys by default) and are
    # therefore left as they are.
    for (_pclass, sex), group in df.groupby(['pclass', 'sex']):
        # Select indices in the group where 'age' is null
        missing_indices = group.index[group['age'].isna()]
        if len(missing_indices) == 0:
            continue

        # Donor ages come from the current group
        donors = group['age'].dropna().to_numpy()

        # Fallback-1: if no donors are available in the group, use every
        # known age of the same sex, regardless of class
        if donors.size == 0:
            donors = df.loc[(df['sex'] == sex) & df['age'].notna(), 'age'].to_numpy()

        # Fallback-2: use every known age in the whole column
        if donors.size == 0:
            donors = df.loc[df['age'].notna(), 'age'].to_numpy()

        # Nothing to sample from at all (the whole column is empty): leave
        # the ages missing instead of failing inside the sampler.
        if donors.size == 0:
            continue

        df.loc[missing_indices, 'age'] = rng.choice(donors, size=len(missing_indices), replace=True)

    # Fill the empty cells in the 'cabin' feature with the 'Unknown' string,
    # which indicates missing information
    df['cabin'] = df['cabin'].fillna('Unknown')
    # The deck is the first letter of the cabin ('U' for 'Unknown')
    df['deck'] = df['cabin'].str[0]

    # Build a 'title' feature from the 'name' feature: the text between the
    # comma and the first period, lower-cased
    df['title'] = df['name'].str.extract(r',\s*([^\.]+)\.', expand=False).str.strip().str.lower()

    # Group titles into a few buckets. The result is assigned back to the
    # column; missing titles are kept apart so they end up as 'Unknown'
    # rather than 'rare'.
    common_titles = ['mr', 'mrs', 'miss', 'master']
    is_rare = df['title'].notna() & ~df['title'].isin(common_titles)
    df['title'] = df['title'].mask(is_rare, 'rare').fillna('Unknown')

    # Family size counts the passenger plus siblings/spouses and
    # parents/children travelling with them
    df['family_size'] = df['sib_sp'] + df['parch'] + 1
    df['is_alone'] = (df['family_size'] == 1).astype(int)

    # Extract the ticket prefix: the first alphanumeric token of the ticket.
    # For an all-numeric ticket this is the ticket number itself.
    ticket_prefix = (
        df['ticket'].astype(str)
        .str.replace(r'[^A-Za-z0-9]', ' ', regex=True)
        .str.split().str[0]
    )
    # astype(str) turns a missing ticket into literal text, so blank those
    # rows out again before applying the 'UNK' placeholder.
    df['ticket_prefix'] = ticket_prefix.where(df['ticket'].notna()).fillna('UNK')

    return df


# Script entry point: load the training data, preprocess it and print a
# summary of the resulting DataFrame.
if __name__ == "__main__":
    # Load the CSV file into a DataFrame; 'train.csv' is a relative path, so
    # it is resolved against the current working directory
    df = pd.read_csv('train.csv')

    # Run the preprocessing pipeline defined above and keep its return value
    df_copy = preprocess_Data(df)

    # Print each column's non-null count and dtype for the processed frame
    df_copy.info()


0           A/5 21171
1            PC 17599
2    STON/O2. 3101282
3              113803
4              373450
5              330877
6               17463
7              349909
8              347742
9              237736
Name: ticket, dtype: object
0         A
1        PC
2      STON
3    113803
4    373450
5    330877
6     17463
7    349909
8    347742
9    237736
Name: ticket_prefix, dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   passenger_id   891 non-null    int64  
 1   survived       891 non-null    int64  
 2   pclass         891 non-null    int64  
 3   name           891 non-null    object 
 4   sex            891 non-null    object 
 5   age            891 non-null    float64
 6   sib_sp         891 non-null    int64  
 7   parch          891 non-null    int64  
 8   ticket         891 non-null    object 
 9   far