In [1]:
import numpy as np
import pandas as pd
import warnings
# Silence every warning category for the rest of the session. This keeps cell
# output short, but it also hides any warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

## Load Data

In [2]:
# Read the Titanic train/test splits, then discard the PassengerId column
# from each frame before any feature work.
train_data = pd.read_csv('../data/titanic/train.csv')
train_data = train_data.drop(columns='PassengerId')
test_data = pd.read_csv('../data/titanic/test.csv')
test_data = test_data.drop(columns='PassengerId')

## Feature Engineering

In [3]:
from sklearn.preprocessing import LabelEncoder

# --- Missing values -------------------------------------------------------
# Each frame is filled from its own statistics: Age (both frames) and Fare
# (test only) use the median, Embarked (train only) uses the most frequent value.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].median())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())

# --- Title feature --------------------------------------------------------
# Capture the first run of letters in Name that follows a space and precedes
# a period. The pattern is a raw string: in a plain literal '\.' is an invalid
# escape sequence that newer Python versions warn about.
title_pattern = r' ([A-Za-z]+)\.'
# Titles in this list are collapsed into a single 'Rare' bucket.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
for frame in (train_data, test_data):
    # expand=False yields a Series (one capture group) instead of a
    # one-column DataFrame, so the column assignment is unambiguous.
    titles = frame['Name'].str.extract(title_pattern, expand=False)
    frame['Title'] = titles.replace(rare_titles, 'Rare')

# --- Categorical encoding -------------------------------------------------
# The encoder is fitted on the training column only and then applied to the
# test column; LabelEncoder.transform raises if the test column contains a
# value that was not present in training.
for col in ['Sex', 'Embarked', 'Title']:
    encoder = LabelEncoder()
    train_data[col] = encoder.fit_transform(train_data[col])
    test_data[col] = encoder.transform(test_data[col])

# --- Drop raw text columns ------------------------------------------------
# Reassign instead of dropping in place so the frames are rebound explicitly.
unused_cols = ['Name', 'Ticket', 'Cabin']
train_data = train_data.drop(columns=unused_cols)
test_data = test_data.drop(columns=unused_cols)

In [4]:
# Separate the predictors from the Survived label in each split.
xtrain = train_data.drop(columns='Survived')
ytrain = train_data['Survived']
xtest = test_data.drop(columns='Survived')
ytest = test_data['Survived']

## Adversarial Validation with Features

In [5]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression on all features, scored on the test split
# (score() reports mean accuracy for classifiers).
lr = LogisticRegression(random_state=42).fit(xtrain, ytrain)
baseline_acc = lr.score(xtest, ytest)
print(baseline_acc)

0.7655502392344498


In [6]:
from dm_utils.utils.feas import adversarial_validation_features

# First adversarial round over all features, asking for two drop candidates.
# NOTE(review): presumably `auc` measures how well train and test rows can be
# told apart and `drop_feas` are the features most responsible — confirm
# against the dm_utils implementation.
first_round = adversarial_validation_features(xtrain, xtest, drop_n=2)
auc, train_proba, drop_feas, remain_feas = first_round

# Running list of every feature selected for removal so far (filled by later cells).
total_drop_feas = list()
print(auc, len(drop_feas), drop_feas)

0.7002583979328165 2 ['Age', 'Fare']


In [7]:
# Fold the previous round's candidates into the running drop list. The
# membership check keeps the list free of duplicates if this cell is run again.
total_drop_feas.extend([f for f in drop_feas if f not in total_drop_feas])

# Second adversarial round on the reduced feature set. Its candidate is stored
# under its own name: overwriting `drop_feas` here would make a re-run of this
# cell append the new candidate to `total_drop_feas` and silently change the
# feature set used by later cells.
auc, train_proba, next_drop_feas, remain_feas = adversarial_validation_features(
    xtrain.drop(columns=total_drop_feas), xtest.drop(columns=total_drop_feas), drop_n=1)
print(auc, next_drop_feas)

0.49825581395348834 ['Parch']


In [8]:
# Refit the logistic-regression baseline without the features accumulated in
# total_drop_feas and score it on the equally reduced test split.
xtrain_kept = xtrain.drop(columns=total_drop_feas)
xtest_kept = xtest.drop(columns=total_drop_feas)
lr = LogisticRegression(random_state=42).fit(xtrain_kept, ytrain)
print(lr.score(xtest_kept, ytest))

0.7751196172248804


## Adversarial Validation with Instances

In [9]:
from sklearn.model_selection import train_test_split

# Fraction of the training rows held out for validation; reused later as the
# selection rate for the adversarial split.
val_size = 0.1

# Random hold-out split with a fixed seed.
split_parts = train_test_split(xtrain, ytrain, test_size=val_size, random_state=42)
xtrn, xval, ytrn, yval = split_parts

In [10]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# Log the validation metric every 100 rounds and stop once it has not
# improved for 200 rounds. The list is reused by a later training cell.
callbacks = [
    log_evaluation(period=100),
    early_stopping(stopping_rounds=200),
]

# Train with the random hold-out as the early-stopping set, then report the
# score on the test split.
lgb = LGBMClassifier(n_estimators=1000, random_state=42, verbosity=-1)
lgb.fit(xtrn, ytrn, eval_set=[(xval, yval)], callbacks=callbacks)
random_split_acc = lgb.score(xtest, ytest)
print(random_split_acc)

Training until validation scores don't improve for 200 rounds
[100]	valid_0's binary_logloss: 0.381167
[200]	valid_0's binary_logloss: 0.434686
Early stopping, best iteration is:
[64]	valid_0's binary_logloss: 0.362309
0.7272727272727273


In [11]:
from dm_utils.utils.feas import adversarial_validation_instances

# Instance-level adversarial validation with the same rate as the random
# hold-out (val_size).
# NOTE(review): presumably `select_idx` marks the training rows that look most
# like the test set and `remain_idx` the rest — confirm against dm_utils.
instance_round = adversarial_validation_instances(xtrain, xtest, select_rate=val_size)
auc, train_proba, select_idx, remain_idx = instance_round
print(auc)

0.7002583979328165


In [12]:
# Build the adversarial split: rows at positions `select_idx` become the
# validation set, rows at `remain_idx` stay in the training set.
xtrn, ytrn = xtrain.iloc[remain_idx], ytrain.iloc[remain_idx]
xval, yval = xtrain.iloc[select_idx], ytrain.iloc[select_idx]

# Re-run adversarial validation on the remaining training rows. The indices
# returned by this call are stored under separate names: they were computed
# from `xtrn` (presumably positions within it — confirm against dm_utils), so
# overwriting `select_idx` / `remain_idx` would make a re-run of this cell
# slice `xtrain` with the wrong indices.
auc, train_proba, trn_select_idx, trn_remain_idx = adversarial_validation_instances(
    xtrn, xtest, select_rate=val_size)
print(auc)

0.6335441132189099


In [13]:
# Same LightGBM configuration as the random-split run, now trained on the
# adversarially selected train/validation split. Reuses the `callbacks` list
# defined in the earlier LightGBM cell.
lgb = LGBMClassifier(n_estimators=1000, random_state=42, verbosity=-1).fit(
    xtrn, ytrn, eval_set=[(xval, yval)], callbacks=callbacks)
adversarial_split_acc = lgb.score(xtest, ytest)
print(adversarial_split_acc)

Training until validation scores don't improve for 200 rounds
[100]	valid_0's binary_logloss: 0.421807
[200]	valid_0's binary_logloss: 0.576956
Early stopping, best iteration is:
[42]	valid_0's binary_logloss: 0.349692
0.7535885167464115
