In [1]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd 

# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms


In [2]:
##Getting the Data
test_df = pd.read_csv("test.csv")
train_df = pd.read_csv("train.csv")

In [3]:
##Data Exploration/Analysis
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
total = train_df.isnull().sum().sort_values(ascending=False)
percent_1 = train_df.isnull().sum()/train_df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

Unnamed: 0,Total,%
Cabin,687,77.1
Age,177,19.9
Embarked,2,0.2
Fare,0,0.0
Ticket,0,0.0


In [7]:
train_df.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [8]:
### Data Preprocessing
train_df = train_df.drop(['PassengerId'], axis=1)

In [9]:
# extracting and then removing the targets from the training data 
targets = train_df['Survived']


In [10]:
train_df.drop(['Survived'], 1, inplace=True)

In [12]:
train_df

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
886,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [13]:
# merging train data and test data for future feature engineering
# we'll also remove the PassengerID since this is not an informative feature
combined = train_df.append(test_df)
combined.reset_index(inplace=True)
combined.drop(['index', 'PassengerId'], inplace=True, axis=1)

In [14]:
combined

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [15]:
#Now let's map the title can bin them
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Dona": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

In [16]:
#Generate a new Title column
combined['Title'] = combined['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
combined['Title'] = combined['Title'].map(Title_Dictionary)

In [17]:
combined

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr
...,...,...,...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,Mr
1305,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,Royalty
1306,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,Mr
1307,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,Mr


In [18]:
#let's get the median age based on people's gender, Pclass and Title
fill_mean = lambda g: g.fillna(g.mean())
combined['Age'] = combined.groupby(['Sex', 'Title', 'Pclass'])['Age'].apply(fill_mean)

In [19]:
# Name can be dropped now
combined.drop('Name', axis=1, inplace=True)

In [20]:
# removing the title variable
combined.drop('Title', axis=1, inplace=True)

In [21]:
# removing the Cabin variable
combined.drop('Cabin', axis=1, inplace=True)

In [22]:
# removing the Ticket variable
combined.drop('Ticket', axis=1, inplace=True)

In [23]:
#Fill out the missing fare data
combined['Fare'].fillna(combined['Fare'].mean(), inplace=True)

In [24]:
# two missing embarked values - filling them with the most frequent one in the train set
combined['Embarked'].fillna('S', inplace=True)

In [25]:
combined

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.00000,1,0,7.2500,S
1,1,female,38.00000,1,0,71.2833,C
2,3,female,26.00000,0,0,7.9250,S
3,1,female,35.00000,1,0,53.1000,S
4,3,male,35.00000,0,0,8.0500,S
...,...,...,...,...,...,...,...
1304,3,male,28.31891,0,0,8.0500,S
1305,1,female,39.00000,0,0,108.9000,C
1306,3,male,38.50000,0,0,7.2500,S
1307,3,male,28.31891,0,0,8.0500,S


In [26]:
# encoding in dummy variable
embarked_dummies = pd.get_dummies(combined['Embarked'], prefix='Embarked')
combined = pd.concat([combined, embarked_dummies], axis=1)
combined.drop('Embarked', axis=1, inplace=True)

In [27]:
# mapping gender to numerical one 
combined['Sex'] = combined['Sex'].map({'male':1, 'female':0})

In [28]:
# introducing a new feature : the size of families (including the passenger)
combined['FamilySize'] = combined['Parch'] + combined['SibSp'] + 1

In [29]:
#Prepare the training dataset
df_im_input=combined.iloc[:891]
df_im_output=targets

In [30]:
df_im_input

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,FamilySize
0,3,1,22.000000,1,0,7.2500,0,0,1,2
1,1,0,38.000000,1,0,71.2833,1,0,0,2
2,3,0,26.000000,0,0,7.9250,0,0,1,1
3,1,0,35.000000,1,0,53.1000,0,0,1,2
4,3,1,35.000000,0,0,8.0500,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,13.0000,0,0,1,1
887,1,0,19.000000,0,0,30.0000,0,0,1,1
888,3,0,17.360874,1,2,23.4500,0,0,1,4
889,1,1,26.000000,0,0,30.0000,1,0,0,1


In [31]:
df_im_output

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [32]:
combined

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,FamilySize
0,3,1,22.00000,1,0,7.2500,0,0,1,2
1,1,0,38.00000,1,0,71.2833,1,0,0,2
2,3,0,26.00000,0,0,7.9250,0,0,1,1
3,1,0,35.00000,1,0,53.1000,0,0,1,2
4,3,1,35.00000,0,0,8.0500,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...
1304,3,1,28.31891,0,0,8.0500,0,0,1,1
1305,1,0,39.00000,0,0,108.9000,1,0,0,1
1306,3,1,38.50000,0,0,7.2500,0,0,1,1
1307,3,1,28.31891,0,0,8.0500,0,0,1,1


In [33]:
test_data = combined.iloc[891:]
test_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,FamilySize
891,3,1,34.50000,0,0,7.8292,0,1,0,1
892,3,0,47.00000,1,0,7.0000,0,0,1,2
893,2,1,62.00000,0,0,9.6875,0,1,0,1
894,3,1,27.00000,0,0,8.6625,0,0,1,1
895,3,0,22.00000,1,1,12.2875,0,0,1,3
...,...,...,...,...,...,...,...,...,...,...
1304,3,1,28.31891,0,0,8.0500,0,0,1,1
1305,1,0,39.00000,0,0,108.9000,1,0,0,1
1306,3,1,38.50000,0,0,7.2500,0,0,1,1
1307,3,1,28.31891,0,0,8.0500,0,0,1,1


In [34]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [35]:
data_dmatrix = xgb.DMatrix(data=df_im_input,label=df_im_output)

In [21]:
#from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [34]:
#xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [35]:
#xg_reg.fit(X_train,y_train)

#preds = xg_reg.predict(X_test)



In [24]:
#rmse = np.sqrt(mean_squared_error(y_test, preds))
#print("RMSE: %f" % (rmse))

RMSE: 10.517005


In [56]:
# evaluate a logistic regression model using k-fold cross-validation
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier as rtree

In [49]:
param_grid = {"objective":["binary:logistic"],
              'colsample_bytree':[0.2,0.4,0.6,0.8],
              'learning_rate':[0.01,0.05,0.1,0.2],
              'max_depth': [3,5,7,9]
    
}

In [50]:
#make item combination for param_grid dict
import itertools as it
s = sorted(param_grid)
print(s)
combination=it.product(*(param_grid[Name] for Name in s))

['colsample_bytree', 'learning_rate', 'max_depth', 'objective']


In [51]:
combination_list = list(combination)
print(combination_list)

[(0.2, 0.01, 3, 'binary:logistic'), (0.2, 0.01, 5, 'binary:logistic'), (0.2, 0.01, 7, 'binary:logistic'), (0.2, 0.01, 9, 'binary:logistic'), (0.2, 0.05, 3, 'binary:logistic'), (0.2, 0.05, 5, 'binary:logistic'), (0.2, 0.05, 7, 'binary:logistic'), (0.2, 0.05, 9, 'binary:logistic'), (0.2, 0.1, 3, 'binary:logistic'), (0.2, 0.1, 5, 'binary:logistic'), (0.2, 0.1, 7, 'binary:logistic'), (0.2, 0.1, 9, 'binary:logistic'), (0.2, 0.2, 3, 'binary:logistic'), (0.2, 0.2, 5, 'binary:logistic'), (0.2, 0.2, 7, 'binary:logistic'), (0.2, 0.2, 9, 'binary:logistic'), (0.4, 0.01, 3, 'binary:logistic'), (0.4, 0.01, 5, 'binary:logistic'), (0.4, 0.01, 7, 'binary:logistic'), (0.4, 0.01, 9, 'binary:logistic'), (0.4, 0.05, 3, 'binary:logistic'), (0.4, 0.05, 5, 'binary:logistic'), (0.4, 0.05, 7, 'binary:logistic'), (0.4, 0.05, 9, 'binary:logistic'), (0.4, 0.1, 3, 'binary:logistic'), (0.4, 0.1, 5, 'binary:logistic'), (0.4, 0.1, 7, 'binary:logistic'), (0.4, 0.1, 9, 'binary:logistic'), (0.4, 0.2, 3, 'binary:logistic'

In [47]:
# k-fold with index pair return
def kfold_index(x_train,k):
    if k<1:
        print('K cannot smaller than 1')
        return
    else:
        l=x_train.shape[0]#22 k=5
        tail=l%k#2
        q=l//k#4
        index=[0]
        output_index=[]
        #first n_examples % k sub_example have n_examples//k+1 elements
        for i in range(1,tail):
            index.append((i*q)+1)
        for j in range(tail,k+1):
            index.append(j*q+tail)
        for m in range(len(index)-1):
            output_index.append([index[m],index[m+1]])
        return output_index

In [61]:
def cv_xgboost(x_train, y_train, param, num_boosting_round, early_stoping_rounds, kfold):
    import xgboost as xgb
    from sklearn import metrics
    #get kfold index
    index = kfold_index(x_train, kfold)
    #train each sub-combination
    auc=[]
    br=[]
    for i in range(kfold):
        cv_xtest = x_train.iloc[index[i][0]:index[i][1],:]
        cv_ytest = y_train.iloc[index[i][0]:index[i][1],]
        
        #drop the test sub-set
        cv_xtrain=x_train.drop(x_train.index[index[i][0]:index[i][1]])
        cv_ytrain=y_train.drop(y_train.index[index[i][0]:index[i][1]])
        
        #make dmatrix for xgboost
        d_cvtrain=xgb.DMatrix(data=cv_xtrain, label=cv_ytrain)
        d_cvtest=xgb.DMatrix(data=cv_xtest, label=cv_ytest)
        
        #use xgb.train to train the model
        xg_l=xgb.train(params = params,
                       dtrain = d_cvtrain,
                       num_boost_round = num_boosting_round,
                       early_stopping_rounds = early_stoping_rounds,
                       evals = [(d_cvtest,'test_evalset')],
                       verbose_eval=False)
        #calculate the auc as the evalvation metric
        fpr, tpr, threshold = roc_curve(cv_ytest, xg_l.predict(d_cvtest))
        aa = metrics.auc(fpr,tpr)
        auc.append(aa)
        
        #get the number of boosting round
        
        bb=xg_l.best_iteration
        br.append(bb)
    return np.mean(auc),np.mean(br)

In [63]:
#params = {"objective":"binary:logistic",'colsample_bytree': 0.3,'learning_rate': 0.1,
               # 'max_depth': 5, 'alpha': 10}
#['colsample_bytree', 'learning_rate', 'max_depth', 'objective']

#cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    #num_boost_round=3000,early_stopping_rounds=50,metrics="rmse", as_pandas=True, seed=123)
params = {'colsample_bytree':0.2,'learning_rate': 0.01,'max_depth': 3,'objective': "binary:logistic"}

In [64]:
score=cv_xgboost(df_im_input,df_im_output,param=params,num_boosting_round=1000,early_stoping_rounds=50,kfold=10)
#{'colsample_bytree':0.2,'learning_rate': 0.01,'max_depth': 3,
                #'objective': "binary:logistic"}




In [66]:
print(score[0])

0.8685076006794183


In [70]:
x=0
n=0
for i in list(combination_list):
   
    scc=cv_xgboost(df_im_input,df_im_output,param={'colsample_bytree':i[0],'learning_rate': i[1],'max_depth': i[2],
                'objective':"binary:logistic"},num_boosting_round=1000,early_stoping_rounds=50,kfold=10)

    if scc[0] >= score[0]:
        x = scc[0]
        opt = i
        n=scc[1]
        print(x)
print('Best Estimators :')
print(i)
print('Best num_boosting_round :')
print(n)


0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
0.8685076006794183
0.8685076006794183


0.8685076006794183
Best Estimators :
(0.8, 0.2, 9, 'binary:logistic')
Best num_boosting_round :
928.0


In [75]:
test_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,FamilySize
891,3,1,34.50000,0,0,7.8292,0,1,0,1
892,3,0,47.00000,1,0,7.0000,0,0,1,2
893,2,1,62.00000,0,0,9.6875,0,1,0,1
894,3,1,27.00000,0,0,8.6625,0,0,1,1
895,3,0,22.00000,1,1,12.2875,0,0,1,3
...,...,...,...,...,...,...,...,...,...,...
1304,3,1,28.31891,0,0,8.0500,0,0,1,1
1305,1,0,39.00000,0,0,108.9000,1,0,0,1
1306,3,1,38.50000,0,0,7.2500,0,0,1,1
1307,3,1,28.31891,0,0,8.0500,0,0,1,1


In [77]:
data_dmatrix

<xgboost.core.DMatrix at 0x1a1f4ddb10>

In [80]:
#use opt parameters to train 
params = {'colsample_bytree':0.8,'learning_rate': 0.2,'max_depth': 9,'objective': "binary:logistic"}
data_dmatrix=xgb.DMatrix(data=df_im_input, label=df_im_output)
opt_model=xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=928,early_stopping_rounds=50)

#y_pred = opt_model.predict(test_data)



AssertionError: Must have at least 1 validation dataset for early stopping.