In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('dark')
sns.set_palette('Set2')
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score

In [2]:
train_data = pd.read_csv('../input/train.csv')
test_data = pd.read_csv('../input/test.csv')
data = pd.concat([train_data, test_data]).reset_index().drop(['index'], axis=1)

str_cols = data.select_dtypes(include=["object"]).columns
data[str_cols] = (data[str_cols]
                .apply(lambda s: s.str.strip())
                .replace(r"^\s*$", np.nan, regex=True))

# Kaggle互換の Survived が欲しい場合（Perished があるときだけ）
if "Perished" in data and "Survived" not in data:
    data["Survived"] = 1 - data["Perished"]

In [3]:
data['Surname'] = data['Name'].apply(lambda x: x.split(',')[0])

In [4]:
# New Ticket_id column
data['Ticket_id'] = 'new_col'
# Initialize Ticket_id = Pclass + Ticket + Fare + Embarked
def ticket_id(row):
    row['Ticket_id'] = str(row.Pclass) + '-' + str(row.Ticket)[:-1] + '-' + str(row.Fare) + '-' + str(row.Embarked)
    return row

data = data.apply(ticket_id, axis='columns')
# New Group_id column
data['Group_id'] = 'new_col2'
# Initialize Group_id = Surname + Ticket_id
def group_id(row):
    row['Group_id'] = str(row.Surname) + '-' + str(row.Ticket_id)
    return row

data = data.apply(group_id, axis='columns')

In [5]:
# creation of the Title feature
data['Title'] = 'man'
data.loc[data.Sex == 'female', 'Title'] = 'woman'
data.loc[data['Name'].str.contains('Master'), 'Title'] = 'boy'

In [6]:
data.loc[data.Title == 'man', 'Group_id'] = 'noGroup'
# New column with WC frequency
data['WC_count'] = data.loc[data.Title != 'man'].groupby('Group_id')['Group_id'].transform('count')
# assign noGroup to every unique value
data.loc[data.WC_count <=1, 'Group_id'] = 'noGroup'

  return op(a, b)


In [7]:
indices = []
count = 0
for i in range(0,1309):
    if (data.loc[i,'Title'] != 'man') & (data.loc[i,'Group_id'] == 'noGroup'):
        data.loc[i,'Group_id'] = data.loc[(data['Ticket_id'] == data.loc[i, 'Ticket_id']) & (data.Title != 'man'), 'Group_id'].iloc[0]
        if (data.loc[i, 'Group_id'] != 'noGroup'):
            indices.append(i)
            count += 1
print('{:d} passengers were added to an existing group'.format(count))

11 passengers were added to an existing group


In [8]:
data['WCSurvived'] = data.loc[(data.Title != 'man') & (data.Group_id != 'noGroup')].groupby('Group_id').Survived.transform('mean')

In [9]:
# Get the family names using set difference
test_groups = set(data[891:1309].Group_id.unique()) - set(data[0:891].Group_id.unique())
# Assign WCSurvived = 0 to 3rd class test families, else 1
data.loc[data.Group_id.isin(test_groups), 'WCSurvived'] = 0
data.loc[(data.Group_id.isin(test_groups)) & (data.Pclass != 3), 'WCSurvived'] = 1

In [10]:
# Set everyone to 0
data.loc[891:1308, 'Predict'] = 0
# Set women to 1, completing the gender model
data.loc[891:1308, 'Predict'][(data.Sex == 'female')] = 1
# Change WCG women with WCSurvived=0 to 0
data.loc[891:1308,'Predict'][(data.Sex == 'female') & (data['WCSurvived'] == 0)] = 0
# Change WCG boys with WCSurvived=1 to 1, completing the WCG + gender model
data.loc[891:1308, 'Predict'][(data.Title == 'boy') & (data['WCSurvived'] == 1)] = 1

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data.loc[891:1308, 'Predict'][(data.Sex == 'female')] = 1
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the ori

In [11]:
# Calculate Ticket frequency and divide Fare by it
data['Ticket_freq'] = data.groupby('Ticket')['Ticket'].transform('count')
data['Pfare'] = data['Fare'] / data['Ticket_freq']

In [12]:
# Isolating adult males in train and test set
train_male = data[0:891].loc[(data.Sex=='male') & (data.WCSurvived.isnull())]
test_male = data[891:1309].loc[(data.Sex=='male') & (data.WCSurvived.isnull())]

In [13]:
cols = ['PassengerId', 'Name', 'Pfare', 'Pclass', 'Embarked']
y_m = train_male['Survived']
features = ['Pfare', 'Pclass', 'Embarked']
X_m = train_male[features]

numerical_cols = ['Pfare']
categorical_cols = ['Pclass', 'Embarked']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

In [14]:
# 男性用モデル - RandomForest のみ
clf_male = RandomForestClassifier(random_state=1)

male_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', clf_male)
])

print('\n' + '=' * 70)
print('MALE MODEL - RandomForest')
print('=' * 70)
print('15-fold precision: {:.3f}'.format(
    cross_val_score(male_pipeline, X_m, y_m, cv=15, n_jobs=-1, scoring='precision').mean()))
print('15-fold recall: {:.3f}'.format(
    cross_val_score(male_pipeline, X_m, y_m, cv=15, n_jobs=-1, scoring='recall').mean()))
print('15-fold accuracy: {:.3f}'.format(
    cross_val_score(male_pipeline, X_m, y_m, cv=15, n_jobs=-1).mean()))
print('=' * 70)

# Fit model and make predictions
male_pipeline.fit(X_m, y_m)
learn_train_m = male_pipeline.predict(X_m)
X_test_m = test_male[features]
predictions_m = male_pipeline.predict(X_test_m)
print('\nThe following adult males are predicted to live:')
test_male.loc[(predictions_m==1), cols]


MALE MODEL - RandomForest
15-fold precision: 0.507
15-fold recall: 0.267
15-fold accuracy: 0.834

The following adult males are predicted to live:


Unnamed: 0,PassengerId,Name,Pfare,Pclass,Embarked
919,920,"Brady, Mr. John Bertram",30.5,1,S
925,926,"Mock, Mr. Philipp Edmund",28.875,1,C
930,931,"Hee, Mr. Ling",7.061975,3,S
941,942,"Smith, Mr. Lucien Philip",30.0,1,S
959,960,"Tucker, Mr. Gilbert Milligan Jr",28.5375,1,C
985,986,"Birnbaum, Mr. Jakob",26.0,1,C
1022,1023,"Gracie, Col. Archibald IV",28.5,1,C
1068,1069,"Stengel, Mr. Charles Emil Henry",27.72085,1,C
1096,1097,"Omont, Mr. Alfred Fernand",25.7417,1,C
1125,1126,"Cumings, Mr. John Bradley",35.64165,1,C


In [61]:
m1 = KNeighborsClassifier(n_neighbors=1)
m2 = KNeighborsClassifier(n_neighbors=3)
m3 = KNeighborsClassifier(n_neighbors=7)
# Preprocessing is the same as before
male_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('voting',VotingClassifier([
        ('m1', m1), ('m2', m2), ('m3', m3)]))
])
print('15-fold precision of the ensemble: {:.3f}'.format(
    cross_val_score(male_pipeline, X_m, y_m, cv=15, n_jobs=-1, scoring='precision').mean()))
print('15-fold recall of the ensemble: {:.3f}'.format(
    cross_val_score(male_pipeline, X_m, y_m, cv=15, n_jobs=-1, scoring='recall').mean()))
print('15-fold accuracy of the ensemble: {:.3f}'.format(
    cross_val_score(male_pipeline, X_m, y_m, cv=15, n_jobs=-1).mean()))
# Fit model and make predictions
male_pipeline.fit(X_m, y_m)
learn_train_m = male_pipeline.predict(X_m)
X_test_m = test_male[features]
predictions_m = male_pipeline.predict(X_test_m)
print('\nThe following 9 adult males are predicted to live:')
test_male.loc[(predictions_m==1), cols]

15-fold precision of the ensemble: 0.404
15-fold recall of the ensemble: 0.267
15-fold accuracy of the ensemble: 0.816

The following 9 adult males are predicted to live:


Unnamed: 0,PassengerId,Name,Pfare,Pclass,Embarked
919,920,"Brady, Mr. John Bertram",30.5,1,S
925,926,"Mock, Mr. Philipp Edmund",28.875,1,C
930,931,"Hee, Mr. Ling",7.061975,3,S
941,942,"Smith, Mr. Lucien Philip",30.0,1,S
959,960,"Tucker, Mr. Gilbert Milligan Jr",28.5375,1,C
985,986,"Birnbaum, Mr. Jakob",26.0,1,C
986,987,"Tenglin, Mr. Gunnar Isidor",7.7958,3,S
1017,1018,"Brobeck, Mr. Karl Rudolf",7.7958,3,S
1022,1023,"Gracie, Col. Archibald IV",28.5,1,C
1068,1069,"Stengel, Mr. Charles Emil Henry",27.72085,1,C


In [16]:
data.loc[891:1308, 'Predict'][(data.Sex=='male') & (data.WCSurvived.isnull())] = predictions_m

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data.loc[891:1308, 'Predict'][(data.Sex=='male') & (data.WCSurvived.isnull())] = predictions_m


In [17]:
train_female = data[0:891].loc[(data.Sex=='female')  & (data.WCSurvived.isnull())]
test_female = data[891:1309].loc[(data.Sex=='female') & (data.WCSurvived.isnull())]

In [18]:
# We set zero_division=0 to avoid raising errors
custom_precision = make_scorer(precision_score, pos_label=0, zero_division=0)
custom_recall = make_scorer(recall_score, pos_label=0)

In [19]:
y_f = train_female['Survived']
X_f = train_female[features]

In [20]:
# 女性用モデル - RandomForest のみ
clf_female = RandomForestClassifier(random_state=1)

female_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', clf_female)
])

# Cross-validation scores
cv_results = cross_validate(
    female_pipeline, X_f, y_f, cv=9,
    scoring={
        'precision': custom_precision,
        'recall': custom_recall,
        'accuracy': 'accuracy'
    },
    return_train_score=True
)

print('\n' + '=' * 70)
print('FEMALE MODEL - RandomForest')
print('=' * 70)
print('VALIDATION SCORES (Cross-Validation Results)')
print('=' * 70)
print(f'CV Score - Precision (9-fold): {cv_results["test_precision"].mean():.3f} (+/- {cv_results["test_precision"].std():.3f})')
print(f'CV Score - Recall (9-fold):    {cv_results["test_recall"].mean():.3f} (+/- {cv_results["test_recall"].std():.3f})')
print(f'CV Score - Accuracy (9-fold):  {cv_results["test_accuracy"].mean():.3f} (+/- {cv_results["test_accuracy"].std():.3f})')
print()
print('Train Scores (for comparison):')
print(f'Train Precision: {cv_results["train_precision"].mean():.3f}')
print(f'Train Recall:    {cv_results["train_recall"].mean():.3f}')
print(f'Train Accuracy:  {cv_results["train_accuracy"].mean():.3f}')
print('=' * 70)

# Preprocessing of training data, fit model
female_pipeline.fit(X_f, y_f)
learn_train_f = female_pipeline.predict(X_f)
X_test_f = test_female[features]
predictions_f = female_pipeline.predict(X_test_f)

print('\nThe following non-WCG females are predicted to die:')
test_female.loc[(predictions_f==0), cols]


FEMALE MODEL - RandomForest
VALIDATION SCORES (Cross-Validation Results)
CV Score - Precision (9-fold): 0.435 (+/- 0.350)
CV Score - Recall (9-fold):    0.306 (+/- 0.229)
CV Score - Accuracy (9-fold):  0.808 (+/- 0.056)

Train Scores (for comparison):
Train Precision: 0.800
Train Recall:    0.618
Train Accuracy:  0.902

The following non-WCG females are predicted to die:


Unnamed: 0,PassengerId,Name,Pfare,Pclass,Embarked
897,898,"Connolly, Miss. Kate",7.6292,3,Q
927,928,"Roth, Miss. Sarah A",8.05,3,S
978,979,"Badman, Miss. Emily Louisa",8.05,3,S
989,990,"Braf, Miss. Elin Ester Maria",7.8542,3,S
1004,1005,"Buckley, Miss. Katherine",7.2833,3,Q
1013,1014,"Schabert, Mrs. Paul (Emma Mock)",28.875,1,C
1016,1017,"Cribb, Miss. Laura Alice",8.05,3,S
1029,1030,"Drapkin, Miss. Jennie",8.05,3,S
1048,1049,"Lundin, Miss. Olga Elida",7.8542,3,S
1060,1061,"Hellstrom, Miss. Hilda Maria",8.9625,3,S


In [21]:
import xgboost as xgb
# 女性用モデル - XGBoost
clf_female = xgb.XGBClassifier(  objective='binary:logistic',
    eval_metric='error',
    max_depth=5,
    learning_rate=0.1,  # Rのeta
    gamma=0.1,
    colsample_bytree=1,
    min_child_weight=1,
    n_estimators=500,  # Rのnrounds
    random_state=1,
    use_label_encoder=False,
    verbosity=1  # )Rのverbose
)
female_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', clf_female)
])

# Cross-validation scores
cv_results = cross_validate(
    female_pipeline, X_f, y_f, cv=9,
    scoring={
        'precision': custom_precision,
        'recall': custom_recall,
        'accuracy': 'accuracy'
    },
    return_train_score=True
)

print('\n' + '=' * 70)
print('FEMALE MODEL - XGBoost (max_depth=3)')
print('=' * 70)
print('VALIDATION SCORES (Cross-Validation Results)')
print('=' * 70)
print(f'CV Score - Precision (9-fold): {cv_results["test_precision"].mean():.3f} (+/- {cv_results["test_precision"].std():.3f})')
print(f'CV Score - Recall (9-fold):    {cv_results["test_recall"].mean():.3f} (+/- {cv_results["test_recall"].std():.3f})')
print(f'CV Score - Accuracy (9-fold):  {cv_results["test_accuracy"].mean():.3f} (+/- {cv_results["test_accuracy"].std():.3f})')
print()
print('Train Scores (for comparison):')
print(f'Train Precision: {cv_results["train_precision"].mean():.3f}')
print(f'Train Recall:    {cv_results["train_recall"].mean():.3f}')
print(f'Train Accuracy:  {cv_results["train_accuracy"].mean():.3f}')
print('=' * 70)

# Preprocessing of training data, fit model
female_pipeline.fit(X_f, y_f)
learn_train_f = female_pipeline.predict(X_f)
X_test_f = test_female[features]
predictions_f = female_pipeline.predict(X_test_f)

print('\nThe following non-WCG females are predicted to die:')
test_female.loc[(predictions_f==0), cols]


FEMALE MODEL - XGBoost (max_depth=3)
VALIDATION SCORES (Cross-Validation Results)
CV Score - Precision (9-fold): 0.370 (+/- 0.391)
CV Score - Recall (9-fold):    0.167 (+/- 0.167)
CV Score - Accuracy (9-fold):  0.783 (+/- 0.082)

Train Scores (for comparison):
Train Precision: 0.727
Train Recall:    0.427
Train Accuracy:  0.866

The following non-WCG females are predicted to die:


Unnamed: 0,PassengerId,Name,Pfare,Pclass,Embarked
897,898,"Connolly, Miss. Kate",7.6292,3,Q
927,928,"Roth, Miss. Sarah A",8.05,3,S
978,979,"Badman, Miss. Emily Louisa",8.05,3,S
1004,1005,"Buckley, Miss. Katherine",7.2833,3,Q
1016,1017,"Cribb, Miss. Laura Alice",8.05,3,S
1029,1030,"Drapkin, Miss. Jennie",8.05,3,S
1060,1061,"Hellstrom, Miss. Hilda Maria",8.9625,3,S
1088,1089,"Nilsson, Miss. Berta Olivia",7.775,3,S
1090,1091,"Rasmussen, Mrs. (Lena Jacobsen Solvang)",8.1125,3,S
1105,1106,"Andersson, Miss. Ida Augusta Margareta",7.775,3,S


In [22]:
import os

data.loc[891:1308, 'Predict'][(data.Sex=='female') & (data.WCSurvived.isnull())] = predictions_f

# Survived形式(Predict) → Perished形式に変換
output = pd.DataFrame({
    'PassengerId': data[891:1309].PassengerId, 
    'Perished': (1 - data[891:1309].Predict).astype('int')
})

# outputフォルダがなければ作成
os.makedirs('../output', exist_ok=True)

output.to_csv('../output/bestplz.csv', index=False)
print('bestplz.csv was successfully saved to output folder!')

bestplz.csv was successfully saved to output folder!


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data.loc[891:1308, 'Predict'][(data.Sex=='female') & (data.WCSurvived.isnull())] = predictions_f
