Based on train data, determine if each entry on test dataframe survived or not to Titanic sink

In [1]:
import pandas as pd

X_train_raw = pd.read_csv('titanic/train.csv')
X_test_raw = pd.read_csv('titanic/test.csv')

In [2]:
X_train_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
X_train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
X_test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
#Survived is our target
y=X_train_raw.Survived
X=X_train_raw.copy()
X.drop(['Survived'],axis=1, inplace=True)
print(X.head())

#Passenger ID and Ticket are aleatory identifier - will be removed
X.drop(['PassengerId','Ticket'],axis=1,inplace=True)

#Cabin has too many null observations - will be removed
X.drop(['Cabin'],axis=1,inplace=True)

X.head()

   PassengerId  Pclass                                               Name  \
0            1       3                            Braund, Mr. Owen Harris   
1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3       3                             Heikkinen, Miss. Laina   
3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5       3                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  
0    male  22.0      1      0         A/5 21171   7.2500   NaN        S  
1  female  38.0      1      0          PC 17599  71.2833   C85        C  
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S  
3  female  35.0      1      0            113803  53.1000  C123        S  
4    male  35.0      0      0            373450   8.0500   NaN        S  


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer




#Age and Embarked will have data imputed, as well as Fare only in Test Data
age_imputer = SimpleImputer(strategy="mean")
embarked_imputer = SimpleImputer(strategy='most_frequent')
fare_imputer = SimpleImputer(strategy='mean')

imputer = ColumnTransformer(transformers=[
    ('age',SimpleImputer(strategy="mean"),['Age','Fare']),
    ('embarked',embarked_imputer,['Embarked']),
    #('fare',fare_imputer,X['Fare'])
])

X_imputed=pd.DataFrame(imputer.fit_transform(X))
X_test_imputed=pd.DataFrame(imputer.fit_transform(X_test_raw))
X.head()


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [7]:
X_imputed.head()

Unnamed: 0,0,1,2
0,22,7.25,S
1,38,71.2833,C
2,26,7.925,S
3,35,53.1,S
4,35,8.05,S


In [8]:
X_test_imputed.head()

Unnamed: 0,0,1,2
0,34.5,7.8292,Q
1,47.0,7.0,S
2,62.0,9.6875,Q
3,27.0,8.6625,S
4,22.0,12.2875,S


In [9]:
#substitute columns
X_imputed.columns=['Age','Fare','Embarked']
X_test_imputed.columns=['Age','Fare','Embarked']

X.drop(['Age','Fare','Embarked'],axis=1,inplace=True)

X.head()

Unnamed: 0,Pclass,Name,Sex,SibSp,Parch
0,3,"Braund, Mr. Owen Harris",male,1,0
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0
2,3,"Heikkinen, Miss. Laina",female,0,0
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0
4,3,"Allen, Mr. William Henry",male,0,0


In [10]:
X=pd.concat([X,X_imputed],axis=1,join='inner')

In [11]:
X_test=X_test_raw.copy()
X_test.drop(['Age','Fare','Embarked'],axis=1,inplace=True)
X_test=pd.concat([X_test,X_test_imputed],axis=1,join='inner')

In [12]:
X_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Age,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,0,0,330911,,34.5,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,1,0,363272,,47.0,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,0,0,240276,,62.0,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,0,0,315154,,27.0,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,1,1,3101298,,22.0,12.2875,S


In [13]:
#Create title register from Name
#Create identifications for passengers alone

datacleaner = [X,X_test]

for dataset in datacleaner:
    dataset['FamilySize'] = dataset['SibSp']+dataset['Parch']+1
    
    dataset['IsAlone']=1
    dataset['IsAlone'].loc[dataset['FamilySize']>1]=0
    
    dataset['Title']=dataset['Name'].str.split(", ",expand=True)[1].str.split(".",expand=True)[0]
    
    dataset['FareBin']=pd.qcut(dataset['Fare'],4)
    
    dataset['AgeBin']=pd.cut(dataset['Age'].astype(int),5)
    
stat_min =10
title_names=(X['Title'].value_counts()<stat_min)
X['Title']=X['Title'].apply(lambda x: 'Misc' if title_names.loc[x]==True else x)
print(X['Title'].value_counts())
print(X_test['Title'].value_counts())
X.info()

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64
Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Dr          1
Dona        1
Ms          1
Name: Title, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Pclass      891 non-null    int64   
 1   Name        891 non-null    object  
 2   Sex         891 non-null    object  
 3   SibSp       891 non-null    int64   
 4   Parch       891 non-null    int64   
 5   Age         891 non-null    object  
 6   Fare        891 non-null    object  
 7   Embarked    891 non-null    object  
 8   FamilySize  891 non-null    int64   
 9   IsAlone     891 non-null    int64   
 10  Title       891 non-null    object  
 11  FareBin     891 non-null    category
 12  AgeBin      891 non-null    category
dtypes: category(

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [14]:
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
for dataset in datacleaner:
    dataset['Sex_Code']=label.fit_transform(dataset['Sex'])
    dataset['AgeBin_Code']=label.fit_transform(dataset['AgeBin'])
    dataset['FareBin_Code']=label.fit_transform(dataset['FareBin'])
    dataset['Embarked_Code']=label.fit_transform(dataset['Embarked'])
    dataset['Title_Code']=label.fit_transform(dataset['Title'])
    


In [15]:
X_dummie=pd.get_dummies(X)



In [16]:
columns_toModel = ['Pclass','SibSp','Parch','FamilySize','IsAlone','Sex_Code','AgeBin_Code','FareBin_Code','Embarked_Code','Title_Code']
X_train_cleaned = X[columns_toModel]
X_test_cleaned=X_test[columns_toModel]

print(X_train_cleaned.info())
print(X_test_cleaned.info())

print(X_train_cleaned.head())
print(X_test_cleaned.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Pclass         891 non-null    int64
 1   SibSp          891 non-null    int64
 2   Parch          891 non-null    int64
 3   FamilySize     891 non-null    int64
 4   IsAlone        891 non-null    int64
 5   Sex_Code       891 non-null    int32
 6   AgeBin_Code    891 non-null    int32
 7   FareBin_Code   891 non-null    int32
 8   Embarked_Code  891 non-null    int32
 9   Title_Code     891 non-null    int32
dtypes: int32(5), int64(5)
memory usage: 52.3 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Pclass         418 non-null    int64
 1   SibSp          418 non-null    int64
 2   Parch          418 non-null    int64
 3   FamilySize     41

In [17]:
X_train,X_valid,y_train,y_valid = train_test_split(X_train_cleaned,y,train_size=0.75,test_size=0.25,random_state=0)

In [18]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import mean_absolute_error

model=XGBClassifier(n_estimators=100, learning_rate=0.05)

score = -1 * cross_val_score(model,X_train,y_train,
                            cv=5,
                            scoring='neg_mean_absolute_error')
print(score.mean())

0.16468409830546515


In [19]:
model.fit(X_train,y_train)
train_preds = model.predict(X_valid)
mae=mean_absolute_error(y_valid,train_preds)
print(mae)
final_preds=model.predict(X_test_cleaned)
print(final_preds.shape)
print(X_test_cleaned.shape)
print(final_preds)

0.17488789237668162
(418,)
(418, 10)
[0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0 1
 1 0 0 0 1 0 1 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 1 1 1 0 1 1
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0
 1 0 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1
 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 1 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0
 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0
 0 1 0 1 1 1 0 1 0 0 0]


In [20]:
output = pd.DataFrame({'PassengerId':X_test_raw['PassengerId'],
                      'Survived':final_preds})
output.to_csv('titanic_model_3.csv',index=False)

In [21]:
from sklearn.ensemble import RandomForestClassifier

def score_XGB(n_estimators,learning_rate,X_data,y_data):
    model = XGBClassifier(n_estimators=n_estimators,learning_rate=learning_rate)
    score = -1 * cross_val_score(model,X_data,y_data,
                                cv=5,
                                scoring = 'neg_mean_absolute_error')
    return score.mean()

def score_RandomForest(max_leaf_nodes,X_data,y_data):
    model = RandomForestClassifier(max_leaf_nodes=max_leaf_nodes,random_state=0)
    score = -1*cross_val_score(model,X_data,y_data,
                              cv=5,
                              scoring = 'neg_mean_absolute_error')
    return score.mean()

In [23]:
import operator

results = {}
for i in range(1,9):
    for j in range(1,20):
        results[('XGB',i*50,j*0.01)]=score_XGB(i*50,j*0.01,X_train,y_train)

for i in range(1,9):
        results[('RF',i*50)]=score_RandomForest(i*50,X_train,y_train)

sorted_results=sorted(results.items(),key=operator.itemgetter(1))
print(sorted_results[0:5])

[(('XGB', 100, 0.05), 0.16468409830546515), (('XGB', 50, 0.1), 0.16617663561889798), (('XGB', 50, 0.11), 0.16617663561889798), (('XGB', 50, 0.09), 0.16766917293233083), (('XGB', 200, 0.03), 0.16768039501739423)]


In [24]:
print(sorted_results)

[(('XGB', 100, 0.05), 0.16468409830546515), (('XGB', 50, 0.1), 0.16617663561889798), (('XGB', 50, 0.11), 0.16617663561889798), (('XGB', 50, 0.09), 0.16766917293233083), (('XGB', 200, 0.03), 0.16768039501739423), (('XGB', 50, 0.12), 0.16916171024576365), (('XGB', 50, 0.14), 0.16916171024576365), (('XGB', 50, 0.13), 0.16917293233082706), (('XGB', 150, 0.04), 0.16917293233082706), (('XGB', 300, 0.02), 0.16917293233082706), (('XGB', 150, 0.03), 0.1706542475591965), (('XGB', 100, 0.06), 0.17066546964425988), (('XGB', 100, 0.07), 0.17066546964425988), (('XGB', 250, 0.02), 0.17066546964425988), (('XGB', 100, 0.04), 0.1706654696442599), (('XGB', 150, 0.05), 0.1706654696442599), (('XGB', 250, 0.03), 0.1706654696442599), (('XGB', 350, 0.02), 0.1706654696442599), (('XGB', 400, 0.01), 0.1706654696442599), (('XGB', 50, 0.15), 0.17067669172932332), (('XGB', 350, 0.01), 0.17067669172932332), (('XGB', 100, 0.09), 0.1721692290427561), (('XGB', 50, 0.08), 0.17216922904275614), (('XGB', 100, 0.08), 0.172

In [25]:
modelRF = RandomForestClassifier(max_leaf_nodes=100,random_state=0)
modelRF.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=100, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [26]:
from sklearn.feature_selection import RFECV
from sklearn import feature_selection

model_rfe = feature_selection.RFECV(modelRF,step=1,scoring='accuracy',cv = 5)
model_rfe.fit(X_train,y_train)

X_train_rfe = X_train.columns.values[model_rfe.get_support()]
print(X_train_rfe)

['Pclass' 'FamilySize' 'Sex_Code' 'Title_Code']


In [27]:
X_train_features = X_train_cleaned[X_train_rfe]
X_valid_features = X_valid[X_train_rfe]
X_test_cleaned_features = X_test_cleaned[X_train_rfe]

results = {}
for i in range(1,9):
    for j in range(1,20):
        results[('XGB',i*50,j*0.01)]=score_XGB(i*50,j*0.01,X_train_features,y)

for i in range(1,9):
        results[('RF',i*50)]=score_RandomForest(i*50,X_train_features,y)

sorted_results=sorted(results.items(),key=operator.itemgetter(1))
print(sorted_results[0:5])


[(('XGB', 50, 0.01), 0.18739564371351453), (('XGB', 50, 0.02), 0.18739564371351453), (('XGB', 50, 0.03), 0.18739564371351453), (('XGB', 50, 0.04), 0.18739564371351453), (('XGB', 100, 0.01), 0.18739564371351453)]


In [None]:
#from sklearn.preprocessing import StandardScaler

#model=XGBClassifier()
#model.fit(X_train,y_train)

#scaler = StandardScaler()
#scaler.fit(X_train)
#X_train_scaled = scaler.transform(X_train)

#model_rfe = feature_selection.RFECV(model,step=1,min_features_to_select=1, cv=10, scoring='neg_mean_absolute_error')
#model_rfe.fit(X_train_scaled,y_train.values.flatten())

#X_train_rfe = X_train.columns.values[model_rfe.get_support()]
#print(X_train_rfe)

In [34]:
print(model.get_params())

{'objective': 'binary:logistic', 'base_score': 0.5, 'booster': None, 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 0, 'gpu_id': -1, 'importance_type': 'gain', 'interaction_constraints': None, 'learning_rate': 0.05, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 100, 'n_jobs': 0, 'num_parallel_tree': 1, 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'subsample': 1, 'tree_method': None, 'validate_parameters': False, 'verbosity': None}


In [45]:
from sklearn import model_selection
from scipy import stats

param_grid = {'n_estimators': [50,100,150,200,250,300],
              'learning_rate': [0.03,0.04,0.05,0.06,0.07,0.08],
              'subsample': [0.3, 0.4,0.5,0.6,0.7,0.8, 0.9],
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': [0.5,0.6,0.7,0.8,0.9],
              'min_child_weight': [1, 2, 3, 4]
             }

tune_param_model = model_selection.GridSearchCV(model,param_grid=param_grid,scoring='roc_auc',cv=5)
tune_param_model.fit(X_train,y_train)
print(tune_param_model.best_params_)


KeyboardInterrupt: 

In [33]:

train_preds=tune_param_model.predict(X_valid)
mae2=mean_absolute_error(y_valid,train_preds)
#score = -1 * cross_val_score(tune_param_model,X_train,y_train,
#                            cv=5,
#                            scoring='neg_mean_absolute_error')
#print(score.mean())

{'criterion': 'gini', 'max_depth': 2, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 0, 'splitter': 'best'}


Hyper parameters optimization approach

In [49]:
def score_XGB_hyper(n_estimators):
    model = XGBClassifier(n_estimators=n_estimators,learning_rate=0.1,max_depth=5,min_child_weight=1,gamma=0,subsample=0.8,colsample_bytree=0.8,nthread=4,scale_pos_weight=1,seed=27)
    score = -1 * cross_val_score(model,X_train,y_train,
                                cv=5,
                                scoring = 'neg_mean_absolute_error')
    return score.mean()

In [51]:
results = {}
for i in range(50,300,5):
        results[(i)]=score_XGB_hyper(i)
sorted_results=sorted(results.items(),key=operator.itemgetter(1))
print(sorted_results[0:5])

[(70, 0.16018404219503984), (65, 0.16019526428010325), (60, 0.16020648636516663), (75, 0.16469532039052853), (80, 0.16470654247559197)]


In [59]:
#Tune max-depth and min_child_weight
param_test_1={
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}

gsearch1 = model_selection.GridSearchCV(XGBClassifier(n_estimators=70,learning_rate=0.1,max_depth=5,min_child_weight=1,gamma=0,subsample=0.8,colsample_bytree=0.8,nthread=4,scale_pos_weight=1,seed=27),param_grid=param_test_1,scoring='roc_auc',cv=5,n_jobs=4,iid=False)
gsearch1.fit(X_train,y_train)
print(gsearch1.best_params_)

{'max_depth': 5, 'min_child_weight': 3}




In [60]:
#optimal value of max-depth and min_child_weight
param_test_2={
    'max_depth':[4,5,6],
    'min_child_weight':[2,3,4]
}
gsearch2 = model_selection.GridSearchCV(XGBClassifier(n_estimators=70,learning_rate=0.1,max_depth=5,min_child_weight=1,gamma=0,subsample=0.8,colsample_bytree=0.8,nthread=4,scale_pos_weight=1,seed=27),param_grid=param_test_2,scoring='roc_auc',cv=5,n_jobs=4,iid=False)
gsearch2.fit(X_train,y_train)
print(gsearch2.best_params_)

{'max_depth': 4, 'min_child_weight': 2}




In [61]:
#tune gamma
param_test_3={
    'gamma':[i/10.0 for i in range (0,5)]
}

gsearch3 = model_selection.GridSearchCV(XGBClassifier(n_estimators=70,learning_rate=0.1,max_depth=4,min_child_weight=2,gamma=0,subsample=0.8,colsample_bytree=0.8,nthread=4,scale_pos_weight=1,seed=27),param_grid=param_test_3,scoring='roc_auc',cv=5,n_jobs=4,iid=False)
gsearch3.fit(X_train,y_train)
print(gsearch3.best_params_)


{'gamma': 0.0}




In [65]:
def score_XGB_hyper2(n_estimators):
    model = XGBClassifier(n_estimators=n_estimators,learning_rate=0.1,max_depth=4,min_child_weight=2,gamma=0,subsample=0.8,colsample_bytree=0.8,nthread=4,scale_pos_weight=1,seed=27)
    score = -1 * cross_val_score(model,X_train,y_train,
                                cv=5,
                                scoring = 'neg_mean_absolute_error')
    return score.mean()

In [66]:
#re-balancing n_estimators
results = {}
for i in range(50,300,5):
        results[(i)]=score_XGB_hyper2(i)
sorted_results=sorted(results.items(),key=operator.itemgetter(1))
print(sorted_results[0:5])

[(105, 0.16616541353383457), (85, 0.1676579508472674), (80, 0.16768039501739423), (100, 0.16917293233082709), (95, 0.1706542475591965)]


In [68]:
#tune subsample and colsample_bytree
param_test_4={
    'subsample':[i/10 for i in range(6,10)],
    'colsample_bytree':[i/10 for i in range(6,10)]
}

gsearch4 = model_selection.GridSearchCV(XGBClassifier(n_estimators=105,learning_rate=0.1,max_depth=4,min_child_weight=2,gamma=0,subsample=0.8,colsample_bytree=0.8,nthread=4,scale_pos_weight=1,seed=27),param_grid=param_test_4,scoring='roc_auc',cv=5,n_jobs=4,iid=False)
gsearch4.fit(X_train,y_train)
print(gsearch4.best_params_)

{'colsample_bytree': 0.7, 'subsample': 0.7}




In [69]:
#optimizing subsample and colsample_bytree
param_test_5={
    'subsample':[i/100 for i in range(65,80,5)],
    'colsample_bytree':[i/100 for i in range(65,80,5)]
}

gsearch5 = model_selection.GridSearchCV(XGBClassifier(n_estimators=105,learning_rate=0.1,max_depth=4,min_child_weight=2,gamma=0,subsample=0.8,colsample_bytree=0.8,nthread=4,scale_pos_weight=1,seed=27),param_grid=param_test_5,scoring='roc_auc',cv=5,n_jobs=4,iid=False)
gsearch5.fit(X_train,y_train)
print(gsearch5.best_params_)

{'colsample_bytree': 0.7, 'subsample': 0.7}




In [70]:
#tuning regularization parameters
param_test_6={
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}

gsearch6 = model_selection.GridSearchCV(XGBClassifier(n_estimators=105,learning_rate=0.1,max_depth=4,min_child_weight=2,gamma=0,subsample=0.7,colsample_bytree=0.7,nthread=4,scale_pos_weight=1,seed=27),param_grid=param_test_6,scoring='roc_auc',cv=5,n_jobs=4,iid=False)
gsearch6.fit(X_train,y_train)
print(gsearch6.best_params_)

{'reg_alpha': 1}




In [72]:
#optimizing reg alpha
param_test_7={
    'reg_alpha':[i/100 for i in range(90,120,2)]
}

gsearch7 = model_selection.GridSearchCV(XGBClassifier(n_estimators=105,learning_rate=0.1,max_depth=4,min_child_weight=2,gamma=0,subsample=0.7,colsample_bytree=0.7,nthread=4,scale_pos_weight=1,seed=27),param_grid=param_test_7,scoring='roc_auc',cv=5,n_jobs=4,iid=False)
gsearch7.fit(X_train,y_train)
print(gsearch7.best_params_)

{'reg_alpha': 1.08}




In [73]:
def score_XGB_hyper3(n_estimators):
    model = XGBClassifier(n_estimators=n_estimators,learning_rate=0.1,max_depth=4,min_child_weight=2,gamma=0,subsample=0.7,colsample_bytree=0.7,nthread=4,scale_pos_weight=1,seed=27,reg_alpha=1.08)
    score = -1 * cross_val_score(model,X_train,y_train,
                                cv=5,
                                scoring = 'neg_mean_absolute_error')
    return score.mean()

In [75]:
#re-balancing n_estimators
results = {}
for i in range(151,159):
        results[(i)]=score_XGB_hyper3(i)
sorted_results=sorted(results.items(),key=operator.itemgetter(1))
print(sorted_results[0:5])

[(155, 0.16766917293233083), (154, 0.16916171024576365), (156, 0.16916171024576365), (157, 0.16916171024576365), (158, 0.16916171024576365)]


In [76]:
def score_XGB_hyper4(learning_rate):
    model = XGBClassifier(n_estimators=155,learning_rate=learning_rate,max_depth=4,min_child_weight=2,gamma=0,subsample=0.7,colsample_bytree=0.7,nthread=4,scale_pos_weight=1,seed=27,reg_alpha=1.08)
    score = -1 * cross_val_score(model,X_train,y_train,
                                cv=5,
                                scoring = 'neg_mean_absolute_error')
    return score.mean()

In [78]:
results = {}
for i in range(1,10):
        results[(i*0.001)]=score_XGB_hyper4(i*0.001)
sorted_results=sorted(results.items(),key=operator.itemgetter(1))
print(sorted_results[0:5])

[(0.005, 0.16313545056671527), (0.006, 0.16313545056671527), (0.007, 0.16313545056671527), (0.008, 0.16313545056671527), (0.003, 0.16462798788014812)]


In [79]:
model_hyper_f=XGBClassifier(n_estimators=155,learning_rate=0.005,max_depth=4,min_child_weight=2,gamma=0,subsample=0.7,colsample_bytree=0.7,nthread=4,scale_pos_weight=1,seed=27,reg_alpha=1.08)
model_hyper_f.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.005, max_delta_step=0, max_depth=4,
              min_child_weight=2, missing=nan, monotone_constraints=None,
              n_estimators=155, n_jobs=4, nthread=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=27, reg_alpha=1.08,
              reg_lambda=1, scale_pos_weight=1, seed=27, subsample=0.7,
              tree_method=None, validate_parameters=False, verbosity=None)

In [82]:
final_preds_hyper=model_hyper_f.predict(X_test_cleaned)
print(final_preds_hyper)

[0 1 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 1 0 1 1 1 0 1 0 1 0 0 0 0 0 1 0 1
 1 0 0 0 0 0 1 1 0 1 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 1 1 1 0 1 1
 1 0 0 1 0 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0
 0 0 1 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 1 1 0 1 0 0 0 0 1 0 0 1 1 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 1 0 1 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 1 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0
 1 1 1 1 1 1 0 1 0 0 0]


In [83]:
output = pd.DataFrame({'PassengerId':X_test_raw['PassengerId'],
                      'Survived':final_preds_hyper})
output.to_csv('titanic_model_4.csv',index=False)

In [85]:
train_preds_hyper = model_hyper_f.predict(X_valid)
mae3=mean_absolute_error(y_valid,train_preds_hyper)
print(mae3)

0.18834080717488788


In [134]:
features=[]
results={}
j=0
for i in X_train.columns:
    features.append(i)

i=0
j=i+1

while (i<=len(features)-1):
    test_features=[]
    test_features.append(features[i])
    for j in range(i+1,len(features)-1):
     
        test_features.append(features[j])
        X_features = X_train[test_features]
        model = XGBClassifier(n_estimators=155,learning_rate=0.005,max_depth=4,min_child_weight=2,gamma=0,subsample=0.7,colsample_bytree=0.7,nthread=4,scale_pos_weight=1,seed=27,reg_alpha=1.08)
        score = -1 * cross_val_score(model,X_features,y_train,
                                    cv=5,
                                    scoring = 'neg_mean_absolute_error')
        results[(i,j)]=score.mean()
    
    i+=1

sorted_results=sorted(results.items(),key=operator.itemgetter(1))
print(sorted_results[0:5])

[((1, 7), 0.184087083380092), ((0, 6), 0.18409830546515543), ((0, 7), 0.18410952755021884), ((1, 6), 0.18560206486365166), ((0, 8), 0.18561328694871507)]


In [136]:
print(features[1:7+1])
X_train_final_features = X_train[features[1:7+1]]
X_test_final_features=X_test_cleaned[features[1:7+1]]

['SibSp', 'Parch', 'FamilySize', 'IsAlone', 'Sex_Code', 'AgeBin_Code', 'FareBin_Code']


In [139]:
model_hyper_f.fit(X_train_final_features,y_train)
final_preds_hyper_features=model_hyper_f.predict(X_test_final_features)
print(final_preds_hyper_features)
output = pd.DataFrame({'PassengerId':X_test_raw['PassengerId'],
                      'Survived':final_preds_hyper})
output.to_csv('titanic_model_5.csv',index=False)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]
