In [1]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train_x = train.drop(['Survived'] , axis=1)
train_y = train['Survived']

test_x = test.copy()


In [2]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:

train_x = train_x.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
test_x = test_x.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
test_x.head()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [4]:
train_x.head()
test_x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [5]:
from sklearn.preprocessing import LabelEncoder

for c in ['Sex', 'Embarked']:
    le = LabelEncoder()
    le.fit(train_x[c].fillna('NA'))
    
    train_x[c] = le.transform(train_x[c].fillna('NA'))
    test_x[c] = le.transform(test_x[c].fillna('NA'))

In [6]:
train_x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,3
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,3
3,1,0,35.0,1,0,53.1,3
4,3,1,35.0,0,0,8.05,3


In [7]:
le.classes_

array(['C', 'NA', 'Q', 'S'], dtype=object)

In [8]:
from xgboost import XGBClassifier
import numpy as np

model = XGBClassifier(n_estimators=20, random_state=71)
model.fit(train_x, train_y)

pred = model.predict_proba(test_x)[:, 1]

pred_label = np.where(pred > 0.5, 1, 0)

submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pred_label})
submission.to_csv('submission_first.csv', index=False)

In [9]:
submission[10:20]

Unnamed: 0,PassengerId,Survived
10,902,0
11,903,0
12,904,1
13,905,0
14,906,1
15,907,1
16,908,0
17,909,0
18,910,0
19,911,0


In [10]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import KFold

scores_accuracy = []
scores_logloss = []


kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx] 
    
    model = XGBClassifier(n_estimators = 20, random_state=71)
    model.fit(tr_x, tr_y)
    
    va_pred = model.predict_proba(va_x)[:, 1]
    
    logloss = log_loss(va_y, va_pred)
    accuracy = accuracy_score(va_y, va_pred>0.5)
    
    scores_logloss.append(logloss)
    scores_accuracy.append(accuracy)
    
logloss = np.mean(scores_logloss)
accuracy = np.mean(scores_accuracy)
print(f'logloss: {logloss: .4f}, accuracy: {accuracy: .4f}')

logloss:  0.4384, accuracy:  0.8182


In [11]:
import itertools

param_space = {
    'max_depth': [3,5,7],
    'min_child_weight': [1,2,4]
}

param_combinations = itertools.product(param_space['max_depth'], param_space['min_child_weight'])

params = []
scores = []

for max_depth, min_child_weight in param_combinations:
    
    score_folds = []
    kf = KFold(n_splits=4, shuffle=True ,random_state=123456)
    for tr_idx, va_idx in kf.split(train_x):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        
        model = XGBClassifier(n_estimators=20, random_state=71, max_depth=max_depth, min_child_weight=min_child_weight)
        
        model.fit(tr_x, tr_y)
        
        va_pred = model.predict_proba(va_x)[:, 1]
        logloss = log_loss(va_y, va_pred)
        score_folds.append(logloss)
    
    score_mean = np.mean(score_folds)
    print(score_mean)
    
    params.append((max_depth, min_child_weight))
    scores.append(score_mean)
    
best_idx = np.argsort(scores)[::-1][0]
best_param = params[best_idx]
print(f'max_depth: {best_param[0]}, min_child_weight: {best_param[1]}')

0.4242945477645603
0.4215542310426872
0.42260688873757113
0.4386300601872057
0.44170161048896367
0.429008473307758
0.4607997813510307
0.4402184165535684
0.4361315517234925
max_depth: 7, min_child_weight: 1


In [12]:
from sklearn.metrics import mean_squared_error

y_true = [1,1.5,2,1.2,1.8]
y_pred = [.8, 1.5, 1.8, 1.3, 3]

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(rmse)

0.5531726674375732


In [13]:
from sklearn.metrics import confusion_matrix

y_true = [1,0,1,1,0,1,1,0]
y_pred = [0,0,1,1,0,0,1,1]

tp = np.sum((np.array(y_true) == 1) & (np.array(y_pred) ==1))
tn = np.sum((np.array(y_true) == 0) & (np.array(y_pred) ==0))
fp = np.sum((np.array(y_true) == 0) & (np.array(y_pred) ==1))
fn = np.sum((np.array(y_true) == 1) & (np.array(y_pred) ==0))

confusion_matrix1 = np.array([[tp, fp], [fn,tn]])

print(confusion_matrix1)

[[3 1]
 [2 2]]


In [14]:
confusion_matrix2 = confusion_matrix(y_true, y_pred)
print(confusion_matrix2)

[[2 1]
 [2 3]]


In [15]:
from sklearn.metrics import accuracy_score

y_true = [1,0,1,1,0,1,1,0]
y_pred = [0,0,1,1,0,0,1,1]

accuracy = accuracy_score(y_true, y_pred)
print(accuracy)

0.625


In [16]:
from sklearn.metrics import log_loss

y_true = [1,0,1,1,0,1]
y_pred = [0.1,0.2,0.8,0.8, 0.1,0.3]

logloss=log_loss(y_true, y_pred)
print(logloss)

0.7135581778200728


In [17]:
from sklearn.metrics import log_loss

y_true = np.array([0,2,1,2,2])
y_pred = np.array([[0.68,0.32,0],[0,0,1],
                [0.6,0.4,0], [0,0,1], [0.28,0.12,0.6]])
logloss = log_loss(y_true, y_pred)
print(logloss)

0.3625557672904274


In [18]:
from sklearn.metrics import f1_score

y_true = np.array([[1,1,0], [1,0,0], 
                  [1,1,1], [0,1,1], [0,0,1]])

y_pred = np.array([[1,0,1], [0,1,0], [1,0,1], [0,0,1], [0,0,1]])

mean_f1= np.mean([f1_score(y_true[i, :], y_pred[i, :]) for i in range(len(y_true))])

n_class = 3
macro_f1 = np.mean([f1_score(y_true[:, c], y_pred[:, c]) for c in range(n_class)])

micro_f1 = f1_score(y_true.reshape(-1), y_pred.reshape(-1))

print(mean_f1, macro_f1, micro_f1)

mean_f1 = f1_score(y_true, y_pred, average='samples')
macro_f1 = f1_score(y_true, y_pred, average='macro')
micro_f1 = f1_score(y_true, y_pred, average='micro')

print(mean_f1, macro_f1, micro_f1)

0.5933333333333334 0.5523809523809523 0.6250000000000001
0.5933333333333334 0.5523809523809523 0.6250000000000001


In [19]:
from sklearn.metrics import confusion_matrix, cohen_kappa_score

def quadratic_weighted_kappa(c_matrix):
    numer  = 0
    denom  = 0
    
    for i in range(c_matrix.shape[0]):
        for j in range(c_matrix.shape[1]):
            n = c_matrix.shape[0]
            wij = ((i-j) ** 2.0)
            oij = c_matrix[i, j]
            eij = c_matrix[i, : ].sum() * c_matrix[:, j].sum() / c_matrix.sum()
            numer += wij * oij
            denom += wij * eij
            
    return 1 - numer / denom

y_true = [1,2,3,4,3]
y_pred = [2,2,4,4,5]

c_matrix = confusion_matrix(y_true, y_pred, labels=[1,2,3,4,5])

print(c_matrix)

kappa = quadratic_weighted_kappa(c_matrix)
print(kappa)

kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
print(kappa)

[[0 1 0 0 0]
 [0 1 0 0 0]
 [0 0 0 1 1]
 [0 0 0 1 0]
 [0 0 0 0 0]]
0.6153846153846154
0.6153846153846154


In [20]:
K = 3

y_true = [[1,2], [1,2], [4], [1,2,3,4], [3,4]]
y_pred = [[1,2,4],[4,1,2], [1,4,3], [1,2,3],[1,2,4]]

def apk(y_i_true, y_i_pred):
    assert(len(y_i_pred) <= K)
    assert(len(np.unique(y_i_pred)) == len(y_i_pred))
    
    sum_precision = 0
    num_hits  = 0
    
    for i, p in enumerate(y_i_pred):
        if p in y_i_true:
            num_hits += 1
            precision = num_hits /(i+1)
            sum_precision += precision
    
    return sum_precision / min(len(y_i_true), K)

def mapk(y_true, y_pred):
    return np.mean([apk(y_i_true, y_i_pred) for y_i_true, y_i_pred in zip(y_true, y_pred)])

print(mapk(y_true, y_pred))

0.6499999999999999


In [21]:
print(apk(y_true[0], y_pred[0]))
print(apk(y_true[1], y_pred[1]))

1.0
0.5833333333333333


In [22]:
import xgboost as xgb
from sklearn.metrics import log_loss
    


In [33]:
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1 + np.exp(-preds))
    grad = preds- labels
    hess = preds * (1 - preds)
    return grad, hess

def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'custom-error', float(sum(labels != (preds >0))) / len(labels)

params = {'silent': 1, 'random_state': 71}
num_round = 50
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

bst = xgb.train(params, dtrain, num_round, watchlist, obj=logregobj, custom_metric =evalerror)

pred_val = bst.predict(dvalid)

pred = 1.0 / (1 + np.exp(-pred_val))
logloss = log_loss(va_y, pred)
print(logloss)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:0.38916	train-custom-error:0.22870	eval-rmse:0.40035	eval-custom-error:0.18919
[1]	train-rmse:0.59687	train-custom-error:0.15247	eval-rmse:0.63569	eval-custom-error:0.14865
[2]	train-rmse:0.81461	train-custom-error:0.12407	eval-rmse:0.85967	eval-custom-error:0.15766
[3]	train-rmse:1.02132	train-custom-error:0.12108	eval-rmse:1.06991	eval-custom-error:0.15315
[4]	train-rmse:1.20425	train-custom-error:0.11211	eval-rmse:1.24243	eval-custom-error:0.15315
[5]	train-rmse:1.36147	train-custom-error:0.11360	eval-rmse:1.40668	eval-custom-error:0.14414
[6]	train-rmse:1.49899	train-custom-error:0.11211	eval-rmse:1.54368	eval-custom-error:0.13514
[7]	train-rmse:1.60471	train-