# WCG + KNN Final Model

- Male: KNN_XGB_Soft (KNN k=1,3,7 + XGBoost with soft voting) + threshold p≥0.90
- Female: KNN_Ensemble (KNN k=4,9,11 with soft voting) + threshold p≤0.08

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('dark')
sns.set_palette('Set2')
import warnings
warnings.filterwarnings('ignore')

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score

try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
    print("✓ XGBoost available")
except ImportError:
    XGBOOST_AVAILABLE = False
    print("✗ XGBoost not available")
    raise ImportError("XGBoost is required")

✓ XGBoost available


## Data Loading and Preprocessing

In [2]:
train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')
data = pd.concat([train_data, test_data]).reset_index().drop(['index'], axis=1)

str_cols = data.select_dtypes(include=["object"]).columns
data[str_cols] = (data[str_cols]
                .apply(lambda s: s.str.strip())
                .replace(r"^\s*$", np.nan, regex=True))

if "Perished" in data and "Survived" not in data:
    data["Survived"] = 1 - data["Perished"]

In [3]:
data['Surname'] = data['Name'].apply(lambda x: x.split(',')[0])

In [4]:
# Ticket_id と Group_id
data['Ticket_id'] = (data['Pclass'].astype(str) + '-' + 
                     data['Ticket'].str[:-1] + '-' + 
                     data['Fare'].astype(str) + '-' + 
                     data['Embarked'].astype(str))
data['Group_id'] = data['Surname'] + '-' + data['Ticket_id']

In [5]:
# Title
data['Title'] = 'man'
data.loc[data.Sex == 'female', 'Title'] = 'woman'
data.loc[data['Name'].str.contains('Master', na=False), 'Title'] = 'boy'

In [6]:
# WCG識別
data.loc[data.Title == 'man', 'Group_id'] = 'noGroup'
data['WC_count'] = data.loc[data.Title != 'man'].groupby('Group_id')['Group_id'].transform('count')
data.loc[data.WC_count <=1, 'Group_id'] = 'noGroup'

In [7]:
count = 0
for i in range(len(data)):
    if (data.loc[i,'Title'] != 'man') & (data.loc[i,'Group_id'] == 'noGroup'):
        matching = data.loc[(data['Ticket_id'] == data.loc[i, 'Ticket_id']) & (data.Title != 'man'), 'Group_id']
        if len(matching) > 0:
            data.loc[i,'Group_id'] = matching.iloc[0]
            if (data.loc[i, 'Group_id'] != 'noGroup'):
                count += 1

print(f'{count} passengers were added to an existing group')

11 passengers were added to an existing group


In [8]:
# WCSurvived
data['WCSurvived'] = data.loc[(data.Title != 'man') & (data.Group_id != 'noGroup')].groupby('Group_id').Survived.transform('mean')
test_groups = set(data[891:1309].Group_id.unique()) - set(data[0:891].Group_id.unique())
data.loc[data.Group_id.isin(test_groups), 'WCSurvived'] = 0
data.loc[(data.Group_id.isin(test_groups)) & (data.Pclass != 3), 'WCSurvived'] = 1

In [9]:
# WCG予測
data.loc[891:1308, 'Predict'] = 0
data.loc[(data.index >= 891) & (data.index <= 1308) & (data.Sex == 'female'), 'Predict'] = 1
data.loc[(data.index >= 891) & (data.index <= 1308) & (data.Sex == 'female') & (data['WCSurvived'] == 0), 'Predict'] = 0
data.loc[(data.index >= 891) & (data.index <= 1308) & (data.Title == 'boy') & (data['WCSurvived'] == 1), 'Predict'] = 1

In [10]:
# Pfare
data['Ticket_freq'] = data.groupby('Ticket')['Ticket'].transform('count')
data['Pfare'] = data['Fare'] / data['Ticket_freq']

## Male Model: KNN_XGB_Soft with Threshold p≥0.90

In [11]:
train_male = data[0:891].loc[(data.Sex=='male') & (data.WCSurvived.isnull())]
test_male = data[891:1309].loc[(data.Sex=='male') & (data.WCSurvived.isnull())]

features = ['Pfare', 'Pclass', 'Embarked']
numerical_cols = ['Pfare']
categorical_cols = ['Pclass', 'Embarked']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

y_m = train_male['Survived']
X_m = train_male[features]

In [12]:
# Male Model: KNN_XGB_Soft
male_model = VotingClassifier(
    estimators=[
        ('knn1', KNeighborsClassifier(n_neighbors=1)),
        ('knn3', KNeighborsClassifier(n_neighbors=3)),
        ('knn7', KNeighborsClassifier(n_neighbors=7)),
        ('xgb', XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            max_depth=5,
            learning_rate=0.1,
            gamma=0.1,
            colsample_bytree=1.0,
            min_child_weight=1.0,
            n_estimators=500,
            subsample=1.0,
            random_state=42,
            n_jobs=-1
        ))
    ],
    voting='soft'
)

male_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', male_model)
])

print('=== MALE MODEL: KNN_XGB_Soft ===')
cv_results = cross_validate(
    male_pipeline, X_m, y_m, cv=15,
    scoring=['accuracy', 'precision', 'recall'],
    return_train_score=False,
    n_jobs=-1
)

print(f"Accuracy:  {cv_results['test_accuracy'].mean():.4f} ± {cv_results['test_accuracy'].std():.4f}")
print(f"Precision: {cv_results['test_precision'].mean():.4f} ± {cv_results['test_precision'].std():.4f}")
print(f"Recall:    {cv_results['test_recall'].mean():.4f} ± {cv_results['test_recall'].std():.4f}")

# Fit and predict with threshold
male_pipeline.fit(X_m, y_m)
X_test_m = test_male[features]

# Get probabilities and apply threshold (male p≥0.90 for survival)
male_proba = male_pipeline.predict_proba(X_test_m)[:, 1]  # Probability of survival
predictions_m = (male_proba >= 0.90).astype(int)

print(f"\nMale survivors predicted (threshold ≥0.90): {predictions_m.sum()}")
print(f"Max probability: {male_proba.max():.4f}")
print(f"Min probability: {male_proba.min():.4f}")

=== MALE MODEL: KNN_XGB_Soft ===
Accuracy:  0.8119 ± 0.0580
Precision: 0.4056 ± 0.3246
Recall:    0.2800 ± 0.2010

Male survivors predicted (threshold ≥0.90): 0
Max probability: 0.8199
Min probability: 0.0049


## Female Model: KNN_Ensemble with Threshold p≤0.08

In [13]:
train_female = data[0:891].loc[(data.Sex=='female') & (data.WCSurvived.isnull())]
test_female = data[891:1309].loc[(data.Sex=='female') & (data.WCSurvived.isnull())]

y_f = train_female['Survived']
X_f = train_female[features]

custom_precision = make_scorer(precision_score, pos_label=0, zero_division=0)
custom_recall = make_scorer(recall_score, pos_label=0)

In [14]:
# Female Model: KNN_Ensemble (soft voting for probabilities)
female_model = VotingClassifier(
    estimators=[
        ('knn4', KNeighborsClassifier(n_neighbors=4)),
        ('knn9', KNeighborsClassifier(n_neighbors=9)),
        ('knn11', KNeighborsClassifier(n_neighbors=11))
    ],
    voting='soft'  # Changed to soft for probability estimation
)

female_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', female_model)
])

print('=== FEMALE MODEL: KNN_Ensemble ===')
cv_results = cross_validate(
    female_pipeline, X_f, y_f, cv=9,
    scoring={
        'accuracy': 'accuracy',
        'precision': custom_precision,
        'recall': custom_recall
    },
    return_train_score=False,
    n_jobs=-1
)

print(f"Accuracy:  {cv_results['test_accuracy'].mean():.4f} ± {cv_results['test_accuracy'].std():.4f}")
print(f"Precision: {cv_results['test_precision'].mean():.4f} ± {cv_results['test_precision'].std():.4f}")
print(f"Recall:    {cv_results['test_recall'].mean():.4f} ± {cv_results['test_recall'].std():.4f}")

# Fit and predict with threshold
female_pipeline.fit(X_f, y_f)
X_test_f = test_female[features]

# Get probabilities and apply threshold (female p≤0.08 for death)
female_proba = female_pipeline.predict_proba(X_test_f)[:, 1]  # Probability of survival
predictions_f = (female_proba > 0.08).astype(int)  # Survive if p > 0.08

print(f"\nFemale deaths predicted (threshold ≤0.08): {(predictions_f == 0).sum()}")
print(f"Max probability: {female_proba.max():.4f}")
print(f"Min probability: {female_proba.min():.4f}")

=== FEMALE MODEL: KNN_Ensemble ===
Accuracy:  0.8081 ± 0.0635
Precision: 0.5278 ± 0.3889
Recall:    0.2500 ± 0.1667

Female deaths predicted (threshold ≤0.08): 0
Max probability: 1.0000
Min probability: 0.4293


## Final Prediction and Submission

In [15]:
import os

# Update predictions
mask_male = (data.index >= 891) & (data.index <= 1308) & (data.Sex=='male') & (data.WCSurvived.isnull())
data.loc[mask_male, 'Predict'] = predictions_m

mask_female = (data.index >= 891) & (data.index <= 1308) & (data.Sex=='female') & (data.WCSurvived.isnull())
data.loc[mask_female, 'Predict'] = predictions_f

# Create submission
output = pd.DataFrame({
    'PassengerId': data[891:1309].PassengerId, 
    'Perished': (1 - data[891:1309].Predict).astype('int')
})

os.makedirs('../output', exist_ok=True)
output.to_csv('../output/wcg_knn_final.csv', index=False)

print('Submission file saved: wcg_knn_final.csv')
print(f'Survival rate: {(output["Perished"] == 0).sum() / len(output) * 100:.2f}%')
print(f'Death rate: {(output["Perished"] == 1).sum() / len(output) * 100:.2f}%')

Submission file saved: wcg_knn_final.csv
Survival rate: 34.69%
Death rate: 65.31%


## Summary

In [16]:
print('=' * 60)
print('FINAL MODEL CONFIGURATION')
print('=' * 60)
print('Male Model:   KNN_XGB_Soft')
print('              - KNN k=1, 3, 7')
print('              - XGBoost (n_estimators=500, max_depth=5)')
print('              - Soft Voting')
print('              - Threshold: p ≥ 0.90 for survival')
print()
print('Female Model: KNN_Ensemble')
print('              - KNN k=4, 9, 11')
print('              - Soft Voting (for probability)')
print('              - Threshold: p ≤ 0.08 for death')
print()
print('Features:     Pfare, Pclass, Embarked')
print('=' * 60)

FINAL MODEL CONFIGURATION
Male Model:   KNN_XGB_Soft
              - KNN k=1, 3, 7
              - XGBoost (n_estimators=500, max_depth=5)
              - Soft Voting
              - Threshold: p ≥ 0.90 for survival

Female Model: KNN_Ensemble
              - KNN k=4, 9, 11
              - Soft Voting (for probability)
              - Threshold: p ≤ 0.08 for death

Features:     Pfare, Pclass, Embarked
