In [1]:
# NumPy
import numpy as np

# Dataframe operations
import pandas as pd

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Scalers
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Models
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn.linear_model import Perceptron
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree
from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
from sklearn.model_selection import cross_validate

# GridSearchCV
from sklearn.model_selection import GridSearchCV

#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

In [2]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")


data_df = pd.concat([train_df, test_df], ignore_index=True)


In [3]:
data_df['Title'] = data_df['Name']
# Cleaning name and extracting Title
for name_string in data_df['Name']:
    data_df['Title'] = data_df['Name'].str.extract('([A-Za-z]+)\.', expand=True)

# Replacing rare titles with more common ones
mapping = {'Mlle': 'Miss', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Mr', 'Don': 'Mr', 'Mme': 'Miss',
          'Jonkheer': 'Mr', 'Lady': 'Mrs', 'Capt': 'Mr', 'Countess': 'Mrs', 'Ms': 'Miss', 'Dona': 'Mrs'}
data_df.replace({'Title': mapping}, inplace=True)
titles = ['Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Rev']
for title in titles:
    age_to_impute = data_df.groupby('Title')['Age'].median()[titles.index(title)]
    data_df.loc[(data_df['Age'].isnull()) & (data_df['Title'] == title), 'Age'] = age_to_impute
    
# Substituting Age values in TRAIN_DF and TEST_DF:
train_df['Age'] = data_df['Age'][:891]
test_df['Age'] = data_df['Age'][891:]

# Dropping Title feature
data_df.drop('Title', axis = 1, inplace = True)

  age_to_impute = data_df.groupby('Title')['Age'].median()[titles.index(title)]
  age_to_impute = data_df.groupby('Title')['Age'].median()[titles.index(title)]
  age_to_impute = data_df.groupby('Title')['Age'].median()[titles.index(title)]
  age_to_impute = data_df.groupby('Title')['Age'].median()[titles.index(title)]
  age_to_impute = data_df.groupby('Title')['Age'].median()[titles.index(title)]
  age_to_impute = data_df.groupby('Title')['Age'].median()[titles.index(title)]


In [4]:
data_df['Family_Size'] = data_df['Parch'] + data_df['SibSp']

# Substituting Age values in TRAIN_DF and TEST_DF:
train_df['Family_Size'] = data_df['Family_Size'][:891]
test_df['Family_Size'] = data_df['Family_Size'][891:]

In [5]:
data_df['Last_Name'] = data_df['Name'].apply(lambda x: str.split(x, ",")[0])
data_df['Fare'].fillna(data_df['Fare'].mean(), inplace=True)

DEFAULT_SURVIVAL_VALUE = 0.5
data_df['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

for grp, grp_df in data_df[['Perished','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
    
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Perished'].max()
            smin = grp_df.drop(ind)['Perished'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 0

print("Number of passengers with family survival information:", 
      data_df.loc[data_df['Family_Survival']!=0.5].shape[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_df['Fare'].fillna(data_df['Fare'].mean(), inplace=True)


Number of passengers with family survival information: 420


In [6]:
for _, grp_df in data_df.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Perished'].max()
                smin = grp_df.drop(ind)['Perished'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    data_df.loc[data_df['PassengerId'] == passID, 'Family_Survival'] = 0
                        
print("Number of passenger with family/group survival information: " 
      +str(data_df[data_df['Family_Survival']!=0.5].shape[0]))

# # Family_Survival in TRAIN_DF and TEST_DF:
train_df['Family_Survival'] = data_df['Family_Survival'][:891]
test_df['Family_Survival'] = data_df['Family_Survival'][891:]

Number of passenger with family/group survival information: 546


In [7]:
data_df['Fare'].fillna(data_df['Fare'].median(), inplace = True)

# Making Bins
data_df['FareBin'] = pd.qcut(data_df['Fare'], 5)

label = LabelEncoder()
data_df['FareBin_Code'] = label.fit_transform(data_df['FareBin'])

train_df['FareBin_Code'] = data_df['FareBin_Code'][:891]
test_df['FareBin_Code'] = data_df['FareBin_Code'][891:]

train_df.drop(['Fare'], axis=1, inplace=True)
test_df.drop(['Fare'], axis=1, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_df['Fare'].fillna(data_df['Fare'].median(), inplace = True)


In [8]:
data_df['AgeBin'] = pd.qcut(data_df['Age'], 4)

label = LabelEncoder()
data_df['AgeBin_Code'] = label.fit_transform(data_df['AgeBin'])

train_df['AgeBin_Code'] = data_df['AgeBin_Code'][:891]
test_df['AgeBin_Code'] = data_df['AgeBin_Code'][891:]

train_df.drop(['Age'], axis=1, inplace=True)
test_df.drop(['Age'], axis=1, inplace=True)


In [9]:
train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1})
test_df['Sex']  = test_df['Sex'].map({'male': 0, 'female': 1})

# PassengerId を保存しておく
test_passenger_ids = test_df['PassengerId'].copy()

train_df.drop(['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin',
               'Embarked'], axis = 1, inplace = True)
test_df.drop(['Name','PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin',
              'Embarked'], axis = 1, inplace = True)

In [10]:
X = train_df.drop('Perished', axis = 1)
y = train_df['Perished']
X_test = test_df.copy()

In [11]:
std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)
X_test = std_scaler.transform(X_test)

In [12]:
from sklearn.metrics import roc_auc_score
n_neighbors = [6,7,8,9,10,11,12,14,16,18,20,22]
algorithm = ['auto']
weights = ['uniform', 'distance']
leaf_size = list(range(1,50,5))
hyperparams = {'algorithm': algorithm, 'weights': weights, 'leaf_size': leaf_size, 
               'n_neighbors': n_neighbors}

# GridSearchCV で最適なハイパーパラメータを探索
gd = GridSearchCV(estimator = KNeighborsClassifier(), param_grid = hyperparams, verbose=True, 
                  cv=10, scoring = "roc_auc")
gd.fit(X, y)
print(f"Best CV Score: {gd.best_score_}")
print(f"Best Parameters: {gd.best_estimator_}")

# Fold ごとの詳細スコアを表示
print("\n" + "="*80)
print("Fold ごとの Train/Validation スコアと Gap")
print("="*80)

# KFold で再実行して詳細を表示
cv = KFold(n_splits=10, shuffle=False)
best_model = gd.best_estimator_

oof_predictions = np.zeros(len(X))
train_scores = []
val_scores = []
gaps = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    X_train_fold, X_val_fold = X[train_idx], X[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    
    # モデルを訓練
    model = KNeighborsClassifier(**gd.best_params_)
    model.fit(X_train_fold, y_train_fold)
    
    # Train スコア
    train_pred_proba = model.predict_proba(X_train_fold)[:, 1]
    train_score = roc_auc_score(y_train_fold, train_pred_proba)
    
    # Validation スコア
    val_pred_proba = model.predict_proba(X_val_fold)[:, 1]
    val_score = roc_auc_score(y_val_fold, val_pred_proba)
    
    # Gap
    gap = train_score - val_score
    
    # OOF predictions を保存
    oof_predictions[val_idx] = val_pred_proba
    
    train_scores.append(train_score)
    val_scores.append(val_score)
    gaps.append(gap)
    
    print(f"Fold {fold:2d} | Train: {train_score:.6f} | Val: {val_score:.6f} | Gap: {gap:.6f}")

# OOF スコア
oof_score = roc_auc_score(y, oof_predictions)

print("="*80)
print(f"平均 Train スコア: {np.mean(train_scores):.6f} (±{np.std(train_scores):.6f})")
print(f"平均 Val スコア:   {np.mean(val_scores):.6f} (±{np.std(val_scores):.6f})")
print(f"平均 Gap:          {np.mean(gaps):.6f} (±{np.std(gaps):.6f})")
print(f"OOF スコア:        {oof_score:.6f}")
print("="*80)

Fitting 10 folds for each of 240 candidates, totalling 2400 fits
Best CV Score: 0.8661474407944996
Best Parameters: KNeighborsClassifier(leaf_size=11, n_neighbors=14)

Fold ごとの Train/Validation スコアと Gap
Fold  1 | Train: 0.900215 | Val: 0.856209 | Gap: 0.044006
Fold  2 | Train: 0.903704 | Val: 0.778986 | Gap: 0.124719
Fold  3 | Train: 0.903846 | Val: 0.775134 | Gap: 0.128712
Fold  4 | Train: 0.899587 | Val: 0.876010 | Gap: 0.023577
Fold  5 | Train: 0.897427 | Val: 0.864865 | Gap: 0.032562
Fold  6 | Train: 0.893729 | Val: 0.911732 | Gap: -0.018004
Fold  7 | Train: 0.902812 | Val: 0.821282 | Gap: 0.081530
Fold  8 | Train: 0.894989 | Val: 0.860931 | Gap: 0.034059
Fold  9 | Train: 0.894129 | Val: 0.917514 | Gap: -0.023385
Fold 10 | Train: 0.892593 | Val: 0.915855 | Gap: -0.023262
平均 Train スコア: 0.898303 (±0.004102)
平均 Val スコア:   0.857852 (±0.049616)
平均 Gap:          0.040451 (±0.053414)
OOF スコア:        0.858485


In [13]:
if isinstance(X_test, np.ndarray):
    print("NumPy配列です。NaN数:", np.isnan(X).sum())
else:
    print("DataFrameです。NaN数:")
    print(X_test.isnull().sum())


NumPy配列です。NaN数: 0


In [14]:
from sklearn.impute import SimpleImputer

# strategy を指定して、すべての列を保持する
imputer = SimpleImputer(strategy='median')  # または 'mean', 'most_frequent'

# 重要：訓練データでfit、テストデータでtransformを分ける
imputer.fit(X)  # X で学習
X_test_imputed = imputer.transform(X_test)  # X_test に適用

print("X_test_imputed shape:", X_test_imputed.shape)  # (418, 6) になるはず

y_pred = gd.best_estimator_.predict(X_test_imputed)

X_test_imputed shape: (418, 6)


In [15]:
print("X shape:", X.shape)
print("X_test shape:", X_test.shape)
print("X_test_imputed shape:", X_test_imputed.shape)

X shape: (891, 6)
X_test shape: (418, 6)
X_test_imputed shape: (418, 6)


In [16]:
from sklearn.impute import SimpleImputer

# Imputer を作成して X と X_test を補完
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
X_test_imputed = imputer.transform(X_test)

# モデルを訓練・予測
knn = KNeighborsClassifier(leaf_size=11,
                           n_neighbors=14,
                           weights='uniform')
knn.fit(X_imputed, y)
y_pred = knn.predict(X_test_imputed)

In [17]:
# Submission ファイルを作成
submission = pd.DataFrame({
    'PassengerId': test_passenger_ids,
    'Perished': y_pred
})


submission.to_csv("../output/konstantin.csv", index=False)

print("Submission file created: ../output/konstantin.csv")
print(submission.head())

Submission file created: ../output/konstantin.csv
   PassengerId  Perished
0          892         1
1          893         0
2          894         1
3          895         1
4          896         0
