# WCG (Woman-Child-Group) + KNN Model for Titanic

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score

## 1. Data Loading and Preprocessing

In [2]:
def load_and_clean_data(train_path='../input/train.csv', test_path='../input/test.csv'):
    """Load and clean train/test data"""
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    data = pd.concat([train_data, test_data]).reset_index(drop=True)
    
    # Clean string columns (trim whitespace and replace empty strings with NaN)
    str_cols = data.select_dtypes(include=["object"]).columns
    data[str_cols] = (data[str_cols]
                      .apply(lambda s: s.str.strip())
                      .replace(r"^\s*$", np.nan, regex=True))
    
    # Convert Perished to Survived if needed (for compatibility)
    if "Perished" in data.columns and "Survived" not in data.columns:
        data["Survived"] = 1 - data["Perished"]
    
    return data

In [3]:
data = load_and_clean_data()

## 2. Feature Engineering

In [4]:
def create_ticket_and_group_features(data):
    """Create Ticket_id and Group_id features"""
    # Extract surname from Name
    data['Surname'] = data['Name'].apply(lambda x: x.split(',')[0])
    
    # Create Ticket_id: Pclass + Ticket + Fare + Embarked
    data['Ticket_id'] = (data['Pclass'].astype(str) + '-' + 
                         data['Ticket'].str[:-1] + '-' + 
                         data['Fare'].astype(str) + '-' + 
                         data['Embarked'].astype(str))
    
    # Create Group_id: Surname + Ticket_id
    data['Group_id'] = data['Surname'] + '-' + data['Ticket_id']
    
    return data

def create_title_feature(data):
    """Create Title feature (man/woman/boy)"""
    data['Title'] = 'man'
    data.loc[data.Sex == 'female', 'Title'] = 'woman'
    data.loc[data['Name'].str.contains('Master', na=False), 'Title'] = 'boy'
    return data

def identify_wcg_groups(data):
    """Identify Woman-Child Groups (WCG)"""
    # Set men to 'noGroup'
    data.loc[data.Title == 'man', 'Group_id'] = 'noGroup'
    
    # Count women and children in each group
    data['WC_count'] = data.loc[data.Title != 'man'].groupby('Group_id')['Group_id'].transform('count')
    
    # Assign 'noGroup' to single passengers
    data.loc[data.WC_count <= 1, 'Group_id'] = 'noGroup'
    
    return data

def assign_ungrouped_to_groups(data):
    """Assign ungrouped women/children to existing groups based on Ticket_id"""
    count = 0
    for i in range(len(data)):
        if (data.loc[i, 'Title'] != 'man') and (data.loc[i, 'Group_id'] == 'noGroup'):
            # Find group with same Ticket_id
            same_ticket = data.loc[(data['Ticket_id'] == data.loc[i, 'Ticket_id']) & 
                                   (data.Title != 'man'), 'Group_id']
            if len(same_ticket) > 0:
                group = same_ticket.iloc[0]
                if group != 'noGroup':
                    data.loc[i, 'Group_id'] = group
                    count += 1
    
    print(f'{count} passengers were added to an existing group')
    return data

In [5]:
# Apply feature engineering
data = create_ticket_and_group_features(data)
data = create_title_feature(data)
data = identify_wcg_groups(data)
data = assign_ungrouped_to_groups(data)

11 passengers were added to an existing group


  return op(a, b)


In [6]:
# Show group statistics
number_of_groups = data.loc[data.Group_id != 'noGroup', 'Group_id'].nunique()
number_of_passengers = data.loc[data.Group_id != 'noGroup', 'Group_id'].count()
print(f'Number of groups found: {number_of_groups}')
print(f'Number of passengers in groups: {number_of_passengers}')

Number of groups found: 80
Number of passengers in groups: 230


## 3. WCG Survival Feature

In [7]:
def calculate_wcg_survival(data):
    """Calculate survival rate for each Woman-Child Group"""
    # Calculate mean survival for each group in training data
    data['WCSurvived'] = (data.loc[(data.Title != 'man') & (data.Group_id != 'noGroup')]
                          .groupby('Group_id')['Survived']
                          .transform('mean'))
    
    # Find test-only groups
    test_groups = (set(data[891:1309].Group_id.unique()) - 
                   set(data[0:891].Group_id.unique()))
    
    # Assign WCSurvived for test-only groups based on Pclass
    data.loc[data.Group_id.isin(test_groups), 'WCSurvived'] = 0
    data.loc[(data.Group_id.isin(test_groups)) & (data.Pclass != 3), 'WCSurvived'] = 1
    
    print('WCSurvived distribution in test data:')
    print(data[891:1309].WCSurvived.value_counts().to_string())
    
    return data

In [8]:
data = calculate_wcg_survival(data)

WCSurvived distribution in test data:
WCSurvived
1.00    47
0.00    24
0.75     2
0.50     1


## 4. Base Predictions using WCG + Gender Model

In [9]:
def create_base_predictions(data):
    """Create baseline predictions using WCG + Gender model"""
    # Initialize all predictions to 0 (died)
    data.loc[891:1308, 'Predict'] = 0
    
    # Women survive (Predict=1)
    mask_women = (data.index >= 891) & (data.index <= 1308) & (data.Sex == 'female')
    data.loc[mask_women, 'Predict'] = 1
    
    # WCG women with WCSurvived=0 die (Predict=0)
    mask_wcg_women_died = (data.index >= 891) & (data.index <= 1308) & \
                          (data.Sex == 'female') & (data['WCSurvived'] == 0)
    data.loc[mask_wcg_women_died, 'Predict'] = 0
    
    # WCG boys with WCSurvived=1 survive (Predict=1)
    mask_wcg_boys_survived = (data.index >= 891) & (data.index <= 1308) & \
                             (data.Title == 'boy') & (data['WCSurvived'] == 1)
    data.loc[mask_wcg_boys_survived, 'Predict'] = 1
    
    return data

In [10]:
data = create_base_predictions(data)

# Show predictions summary
boys_survived = data[891:1309][(data.Title == 'boy') & (data.Predict == 1)]
women_died = data[891:1309][(data.Title == 'woman') & (data.Predict == 0)]
print(f'{len(boys_survived)} boys predicted to survive')
print(f'{len(women_died)} women predicted to die')

8 boys predicted to survive
15 women predicted to die


  boys_survived = data[891:1309][(data.Title == 'boy') & (data.Predict == 1)]
  women_died = data[891:1309][(data.Title == 'woman') & (data.Predict == 0)]


## 5. Feature Preparation for KNN

In [11]:
def prepare_fare_features(data):
    """Calculate per-person fare (Pfare)"""
    data['Ticket_freq'] = data.groupby('Ticket')['Ticket'].transform('count')
    data['Pfare'] = data['Fare'] / data['Ticket_freq']
    return data

In [12]:
data = prepare_fare_features(data)

## 6. KNN Model for Adult Males

In [13]:
def create_preprocessing_pipeline():
    """Create sklearn preprocessing pipeline"""
    numerical_cols = ['Pfare']
    categorical_cols = ['Pclass', 'Embarked']
    
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
    
    return preprocessor

def train_male_knn_model(data, features=['Pfare', 'Pclass', 'Embarked']):
    """Train KNN ensemble model for adult males"""
    # Isolate adult males (not in WCG)
    train_male = data[0:891].loc[(data.Sex == 'male') & (data.WCSurvived.isnull())]
    test_male = data[891:1309].loc[(data.Sex == 'male') & (data.WCSurvived.isnull())]
    
    X_m = train_male[features]
    y_m = train_male['Survived']
    
    # Create ensemble of KNN models
    m1 = KNeighborsClassifier(n_neighbors=1)
    m2 = KNeighborsClassifier(n_neighbors=3)
    m3 = KNeighborsClassifier(n_neighbors=7)
    
    preprocessor = create_preprocessing_pipeline()
    
    male_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('voting', VotingClassifier([('m1', m1), ('m2', m2), ('m3', m3)]))
    ])
    
    # Cross-validation
    precision = cross_val_score(male_pipeline, X_m, y_m, cv=15, n_jobs=-1, scoring='precision').mean()
    recall = cross_val_score(male_pipeline, X_m, y_m, cv=15, n_jobs=-1, scoring='recall').mean()
    accuracy = cross_val_score(male_pipeline, X_m, y_m, cv=15, n_jobs=-1).mean()
    
    print(f'Male model - Precision: {precision:.3f}, Recall: {recall:.3f}, Accuracy: {accuracy:.3f}')
    
    # Train and predict
    male_pipeline.fit(X_m, y_m)
    X_test_m = test_male[features]
    predictions_m = male_pipeline.predict(X_test_m)
    
    print(f'{(predictions_m == 1).sum()} adult males predicted to survive')
    
    return test_male, predictions_m

In [14]:
test_male, predictions_m = train_male_knn_model(data)

Male model - Precision: 0.404, Recall: 0.267, Accuracy: 0.816
13 adult males predicted to survive


## 7. KNN Model for Non-WCG Females

In [15]:
def train_female_knn_model(data, features=['Pfare', 'Pclass', 'Embarked']):
    """Train KNN ensemble model for non-WCG females"""
    # Isolate non-WCG females
    train_female = data[0:891].loc[(data.Sex == 'female') & (data.WCSurvived.isnull())]
    test_female = data[891:1309].loc[(data.Sex == 'female') & (data.WCSurvived.isnull())]
    
    X_f = train_female[features]
    y_f = train_female['Survived']
    
    # Create ensemble of KNN models
    f1 = KNeighborsClassifier(n_neighbors=4)
    f2 = KNeighborsClassifier(n_neighbors=9)
    f3 = KNeighborsClassifier(n_neighbors=11)
    
    preprocessor = create_preprocessing_pipeline()
    
    female_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('voting', VotingClassifier([('f1', f1), ('f2', f2), ('f3', f3)]))
    ])
    
    # Custom scorers (predicting death, pos_label=0)
    custom_precision = make_scorer(precision_score, pos_label=0, zero_division=0)
    custom_recall = make_scorer(recall_score, pos_label=0)
    
    # Cross-validation
    cv_results = cross_validate(
        female_pipeline, X_f, y_f, cv=10,
        scoring={'precision': custom_precision, 'recall': custom_recall, 'accuracy': 'accuracy'},
        return_train_score=False
    )
    
    print(f'Female model - Precision: {cv_results["test_precision"].mean():.3f}, '
          f'Recall: {cv_results["test_recall"].mean():.3f}, '
          f'Accuracy: {cv_results["test_accuracy"].mean():.3f}')
    
    # Train and predict
    female_pipeline.fit(X_f, y_f)
    X_test_f = test_female[features]
    predictions_f = female_pipeline.predict(X_test_f)
    
    print(f'{(predictions_f == 0).sum()} non-WCG females predicted to die')
    
    return test_female, predictions_f

In [16]:
test_female, predictions_f = train_female_knn_model(data)

Female model - Precision: 0.583, Recall: 0.267, Accuracy: 0.829
7 non-WCG females predicted to die


## 8. Final Predictions and Submission

In [17]:
def create_final_predictions(data, test_male, predictions_m, test_female, predictions_f):
    """Update predictions with KNN results"""
    # Update male predictions
    mask_male = (data.index >= 891) & (data.index <= 1308) & \
                (data.Sex == 'male') & (data.WCSurvived.isnull())
    data.loc[mask_male, 'Predict'] = predictions_m
    
    # Update female predictions
    mask_female = (data.index >= 891) & (data.index <= 1308) & \
                  (data.Sex == 'female') & (data.WCSurvived.isnull())
    data.loc[mask_female, 'Predict'] = predictions_f
    
    return data

def save_submission(data, output_path='../output/knn.csv'):
    """Save predictions to CSV file in Perished format"""
    # Convert Survived format (Predict) to Perished format
    output = pd.DataFrame({
        'PassengerId': data[891:1309].PassengerId,
        'Perished': (1 - data[891:1309].Predict).astype('int')
    })
    
    # Create output directory if needed
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    
    output.to_csv(output_path, index=False)
    print(f'Submission saved to {output_path}')
    
    return output

In [18]:
# Create final predictions
data = create_final_predictions(data, test_male, predictions_m, test_female, predictions_f)

# Save submission
output = save_submission(data, '../output/knn.csv')

Submission saved to ../output/knn.csv


In [19]:
# Show sample predictions
print('\nSample predictions:')
print(output.head(20))


Sample predictions:
     PassengerId  Perished
891          892         1
892          893         0
893          894         1
894          895         1
895          896         0
896          897         1
897          898         0
898          899         1
899          900         0
900          901         1
901          902         1
902          903         1
903          904         0
904          905         1
905          906         0
906          907         0
907          908         1
908          909         1
909          910         1
910          911         0
