# WCG + KNN (Stratified Cross-Validation)

This notebook is a variant of `wcg_knn.ipynb` that uses `StratifiedKFold` instead of regular `KFold` for cross-validation.

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, precision_score, recall_score
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

%matplotlib inline
sns.set_style('dark')
sns.set_palette('Set2')

In [2]:
# Read both splits and stack them, train rows first, under a fresh 0..n-1 index.
train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')
data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Normalise text columns: trim surrounding whitespace, then turn
# empty / whitespace-only strings into NaN.
str_cols = data.select_dtypes(include=["object"]).columns
stripped_text = data[str_cols].apply(lambda col: col.str.strip())
data[str_cols] = stripped_text.replace(r"^\s*$", np.nan, regex=True)

# Some inputs carry the inverted label "Perished"; derive "Survived" from it
# only when "Survived" is not already present.
has_perished = "Perished" in data.columns
has_survived = "Survived" in data.columns
if has_perished and not has_survived:
    data["Survived"] = 1 - data["Perished"]

In [3]:
# Surname is the text that precedes the first comma in Name.
data['Surname'] = data['Name'].map(lambda full_name: full_name.partition(',')[0])

In [4]:
# Ticket_id and Group_id
# Ticket_id joins Pclass, the ticket string minus its last character, Fare and
# Embarked with '-'; Group_id prefixes that key with the surname.
pclass_text = data['Pclass'].astype(str)
ticket_stem = data['Ticket'].str[:-1]
fare_text = data['Fare'].astype(str)
embarked_text = data['Embarked'].astype(str)

data['Ticket_id'] = pclass_text + '-' + ticket_stem + '-' + fare_text + '-' + embarked_text
data['Group_id'] = data['Surname'] + '-' + data['Ticket_id']

In [5]:
# Title: three coarse classes. Everyone starts as 'man'; rows with
# Sex == 'female' become 'woman'; finally any Name containing 'Master'
# becomes 'boy' (this last rule wins over the previous two).
title = pd.Series('man', index=data.index)
title = title.mask(data['Sex'].eq('female'), 'woman')
title = title.mask(data['Name'].str.contains('Master', na=False), 'boy')
data['Title'] = title

In [6]:
# WCG identification
# Rows titled 'man' never belong to a group.
is_man = data['Title'] == 'man'
data.loc[is_man, 'Group_id'] = 'noGroup'

# Size of each Group_id counted over the non-'man' rows only
# (rows titled 'man' end up with NaN in WC_count).
women_children = data.loc[~is_man]
data['WC_count'] = women_children.groupby('Group_id')['Group_id'].transform('count')

# A group needs at least two members; singletons are dissolved.
data.loc[data['WC_count'].le(1), 'Group_id'] = 'noGroup'

  return op(a, b)


In [7]:
# Try to attach every ungrouped non-'man' passenger to a group through a
# shared Ticket_id. Row i is only ever modified at its own iteration, so the
# set of candidate rows can be determined once before the loop.
not_man = data['Title'] != 'man'
ungrouped_rows = data.index[not_man & (data['Group_id'] == 'noGroup')]

count = 0
for i in ungrouped_rows:
    # Group_id values of all non-'man' rows on the same Ticket_id, in row order.
    shares_ticket = data['Ticket_id'] == data.at[i, 'Ticket_id']
    matching = data.loc[shares_ticket & not_man, 'Group_id']
    if matching.empty:
        continue
    # NOTE(review): `matching` contains row i itself, so the first entry may be
    # this row's own 'noGroup' (or another ungrouped row); in that case the
    # passenger stays ungrouped even if a later row on the ticket has a group.
    data.at[i, 'Group_id'] = matching.iloc[0]
    if data.at[i, 'Group_id'] != 'noGroup':
        count += 1

print(f'{count} passengers were added to an existing group')

11 passengers were added to an existing group


In [8]:
# WCSurvived: mean of Survived inside each group, computed over the non-'man'
# rows that belong to a real group (all other rows stay NaN).
in_wc_group = (data['Title'] != 'man') & (data['Group_id'] != 'noGroup')
data['WCSurvived'] = data.loc[in_wc_group].groupby('Group_id')['Survived'].transform('mean')

# Groups that occur only in rows 891..1308 get a fixed value instead:
# 0 by default, overridden to 1 when Pclass is not 3.
train_group_ids = set(data.iloc[:891]['Group_id'].unique())
test_group_ids = set(data.iloc[891:1309]['Group_id'].unique())
test_groups = test_group_ids - train_group_ids

in_test_only_group = data['Group_id'].isin(test_groups)
data.loc[in_test_only_group, 'WCSurvived'] = 0
data.loc[in_test_only_group & (data['Pclass'] != 3), 'WCSurvived'] = 1

In [9]:
# WCG prediction
# Rule-based baseline for rows 891..1308; later rules override earlier ones.
is_test = (data.index >= 891) & (data.index <= 1308)
is_female = data['Sex'] == 'female'
is_boy = data['Title'] == 'boy'

data.loc[is_test, 'Predict'] = 0                                           # default
data.loc[is_test & is_female, 'Predict'] = 1                               # females -> 1
data.loc[is_test & is_female & (data['WCSurvived'] == 0), 'Predict'] = 0   # ...unless group value is 0
data.loc[is_test & is_boy & (data['WCSurvived'] == 1), 'Predict'] = 1      # boys whose group value is 1

In [10]:
# Pfare: Fare divided by the number of rows that share the same Ticket value.
tickets_by_value = data.groupby('Ticket')['Ticket']
data['Ticket_freq'] = tickets_by_value.transform('count')
data['Pfare'] = data['Fare'].div(data['Ticket_freq'])

## Male Model (StratifiedKFold)

In [11]:
# Males with no WCSurvived value, split by row position:
# rows 0..890 are used for fitting, rows 891..1308 for prediction.
male_without_group_value = (data['Sex'] == 'male') & data['WCSurvived'].isna()
train_male = data.iloc[:891].loc[male_without_group_value]
test_male = data.iloc[891:1309].loc[male_without_group_value]

cols = ['PassengerId', 'Name', 'Pfare', 'Pclass', 'Embarked', 'Survived']
features = ['Pfare', 'Pclass', 'Embarked']
X_m = train_male[features]
y_m = train_male['Survived']

numerical_cols = ['Pfare']
categorical_cols = ['Pclass', 'Embarked']

# Numeric column: impute with SimpleImputer's default strategy, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

# Categorical columns: impute with the most frequent value, then one-hot
# encode, ignoring categories not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [12]:
# StratifiedKFold with 15 splits
skf_male = StratifiedKFold(n_splits=15, shuffle=True, random_state=42)

m1 = KNeighborsClassifier(n_neighbors=1)
m2 = KNeighborsClassifier(n_neighbors=3)
m3 = KNeighborsClassifier(n_neighbors=7)

# `preprocessor` is a module-level object that the female pipeline also uses.
# Clone it so this pipeline owns its transformers and fitting another pipeline
# cannot refit them underneath this one.
male_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('voting', VotingClassifier([('m1', m1), ('m2', m2), ('m3', m3)]))
])

# Evaluate all three metrics in a single pass over the folds instead of
# refitting the same 15 folds once per metric. zero_division=0 yields the same
# score as the default for folds with no positive prediction, but without
# emitting an UndefinedMetricWarning.
male_scoring = {
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': 'recall',
    'accuracy': 'accuracy',
}
cv_results_m = cross_validate(
    male_pipeline, X_m, y_m, cv=skf_male, n_jobs=-1, scoring=male_scoring
)

print('=' * 60)
print('MALE MODEL (StratifiedKFold with 15 splits)')
print('=' * 60)
print('15-fold precision of the ensemble: {:.3f}'.format(
    cv_results_m['test_precision'].mean()))
print('15-fold recall of the ensemble: {:.3f}'.format(
    cv_results_m['test_recall'].mean()))
print('15-fold accuracy of the ensemble: {:.3f}'.format(
    cv_results_m['test_accuracy'].mean()))

# Fit on all male training rows and predict the male test rows.
male_pipeline.fit(X_m, y_m)
learn_train_m = male_pipeline.predict(X_m)
X_test_m = test_male[features]
predictions_m = male_pipeline.predict(X_test_m)

print(f'\nMale survivors predicted: {predictions_m.sum()}')

MALE MODEL (StratifiedKFold with 15 splits)


  _warn_prf(average, modifier, msg_start, len(result))


15-fold precision of the ensemble: 0.306
15-fold recall of the ensemble: 0.287
15-fold accuracy of the ensemble: 0.799

Male survivors predicted: 13.0


## Female Model (StratifiedKFold)

In [13]:
# Females with no WCSurvived value, split by row position:
# rows 0..890 are used for fitting, rows 891..1308 for prediction.
train_female = data[0:891].loc[(data.Sex=='female') & (data.WCSurvived.isnull())]
test_female = data[891:1309].loc[(data.Sex=='female') & (data.WCSurvived.isnull())]

y_f = train_female['Survived']
X_f = train_female[features]

# For the female model the class of interest is label 0, so both scorers use
# pos_label=0. Both also pass zero_division=0 so that a fold in which the
# metric is undefined scores 0 without emitting a warning.
custom_precision = make_scorer(precision_score, pos_label=0, zero_division=0)
custom_recall = make_scorer(recall_score, pos_label=0, zero_division=0)

In [14]:
# StratifiedKFold with 9 splits
skf_female = StratifiedKFold(n_splits=9, shuffle=True, random_state=42)

f1 = KNeighborsClassifier(n_neighbors=4)
f2 = KNeighborsClassifier(n_neighbors=9)
f3 = KNeighborsClassifier(n_neighbors=11)

# `preprocessor` is a shared module-level object. Clone it so this pipeline
# owns its transformers; fitting here then cannot refit a preprocessor that
# another, already fitted pipeline still relies on.
female_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('voting', VotingClassifier([('f1', f1), ('f2', f2), ('f3', f3)]))
])

# Cross-validation scores (precision/recall are computed for label 0)
cv_results = cross_validate(
    female_pipeline, X_f, y_f, cv=skf_female,
    scoring={
        'precision': custom_precision,
        'recall': custom_recall,
        'accuracy': 'accuracy'
    },
    return_train_score=True
)

print('=' * 60)
print('FEMALE MODEL (StratifiedKFold with 9 splits)')
print('=' * 60)
print(f'CV Score - Precision (9-fold): {cv_results["test_precision"].mean():.3f} (+/- {cv_results["test_precision"].std():.3f})')
print(f'CV Score - Recall (9-fold):    {cv_results["test_recall"].mean():.3f} (+/- {cv_results["test_recall"].std():.3f})')
print(f'CV Score - Accuracy (9-fold):  {cv_results["test_accuracy"].mean():.3f} (+/- {cv_results["test_accuracy"].std():.3f})')
print()
print('Train Scores (for comparison):')
print(f'Train Precision: {cv_results["train_precision"].mean():.3f}')
print(f'Train Recall:    {cv_results["train_recall"].mean():.3f}')
print(f'Train Accuracy:  {cv_results["train_accuracy"].mean():.3f}')
print('=' * 60)

# Fit on all female training rows and predict the female test rows.
female_pipeline.fit(X_f, y_f)
learn_train_f = female_pipeline.predict(X_f)
X_test_f = test_female[features]
predictions_f = female_pipeline.predict(X_test_f)

print(f'\nFemale deaths predicted: {(predictions_f == 0).sum()}')

FEMALE MODEL (StratifiedKFold with 9 splits)
CV Score - Precision (9-fold): 0.437 (+/- 0.437)
CV Score - Recall (9-fold):    0.250 (+/- 0.289)
CV Score - Accuracy (9-fold):  0.818 (+/- 0.068)

Train Scores (for comparison):
Train Precision: 0.623
Train Recall:    0.274
Train Accuracy:  0.838

Female deaths predicted: 7


## Final Prediction and Submission

In [15]:
# Update predictions: overwrite the rule-based value with the KNN prediction
# for test rows that have no WCSurvived value.
mask_male = (data.index >= 891) & (data.index <= 1308) & (data.Sex=='male') & (data.WCSurvived.isnull())
# The assignment is positional, so the mask must select exactly one row per prediction.
assert mask_male.sum() == len(predictions_m), 'male mask and predictions_m differ in length'
data.loc[mask_male, 'Predict'] = predictions_m

mask_female = (data.index >= 891) & (data.index <= 1308) & (data.Sex=='female') & (data.WCSurvived.isnull())
assert mask_female.sum() == len(predictions_f), 'female mask and predictions_f differ in length'
data.loc[mask_female, 'Predict'] = predictions_f

# Every test row needs a prediction before the integer cast below.
test_rows = data[891:1309]
assert test_rows['Predict'].notna().all(), 'some test rows have no prediction'

# Create submission (the file stores the inverted label, Perished = 1 - Predict)
output = pd.DataFrame({
    'PassengerId': test_rows.PassengerId,
    'Perished': (1 - test_rows.Predict).astype('int')
})

output_path = '../output/wcg_knn_stratified.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
output.to_csv(output_path, index=False)

print(f'Submission file saved: {os.path.basename(output_path)}')
print(f'Survival rate: {(output["Perished"] == 0).sum() / len(output) * 100:.2f}%')
print(f'Death rate: {(output["Perished"] == 1).sum() / len(output) * 100:.2f}%')

Submission file saved: wcg_knn_stratified.csv
Survival rate: 36.12%
Death rate: 63.88%


## Summary

In [16]:
# Print a fixed-text recap of the configuration used in this notebook.
rule = '=' * 60
summary_lines = [
    rule,
    'SUMMARY',
    rule,
    'Cross-Validation: StratifiedKFold (preserves class distribution)',
    'Male Model:       KNN Ensemble (k=1,3,7) with 15 splits',
    'Female Model:     KNN Ensemble (k=4,9,11) with 9 splits',
    'Features:         Pfare, Pclass, Embarked',
    rule,
]
print('\n'.join(summary_lines))

SUMMARY
Cross-Validation: StratifiedKFold (preserves class distribution)
Male Model:       KNN Ensemble (k=1,3,7) with 15 splits
Female Model:     KNN Ensemble (k=4,9,11) with 9 splits
Features:         Pfare, Pclass, Embarked
