Based on train data, determine if each entry on test dataframe survived or not to Titanic sink

In [145]:
import pandas as pd

X_train_raw = pd.read_csv('titanic/train.csv')
X_test_raw = pd.read_csv('titanic/test.csv')

In [146]:
X_train_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [147]:
X_train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [148]:
X_test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [149]:
#Survived is our target
y=X_train_raw.Survived
X=X_train_raw.copy()
X.drop(['Survived'],axis=1, inplace=True)
print(X.head())

#Passenger ID and Ticket are aleatory identifier - will be removed
X.drop(['PassengerId','Ticket'],axis=1,inplace=True)

#Cabin has too many null observations - will be removed
X.drop(['Cabin'],axis=1,inplace=True)

X.head()

   PassengerId  Pclass                                               Name  \
0            1       3                            Braund, Mr. Owen Harris   
1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3       3                             Heikkinen, Miss. Laina   
3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5       3                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  
0    male  22.0      1      0         A/5 21171   7.2500   NaN        S  
1  female  38.0      1      0          PC 17599  71.2833   C85        C  
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S  
3  female  35.0      1      0            113803  53.1000  C123        S  
4    male  35.0      0      0            373450   8.0500   NaN        S  


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [150]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer




#Age and Embarked will have data imputed, as well as Fare only in Test Data
age_imputer = SimpleImputer(strategy="mean")
embarked_imputer = SimpleImputer(strategy='most_frequent')
fare_imputer = SimpleImputer(strategy='mean')

imputer = ColumnTransformer(transformers=[
    ('age',SimpleImputer(strategy="mean"),['Age','Fare']),
    ('embarked',embarked_imputer,['Embarked']),
    #('fare',fare_imputer,X['Fare'])
])

X_imputed=pd.DataFrame(imputer.fit_transform(X))
X_test_imputed=pd.DataFrame(imputer.fit_transform(X_test_raw))
X.head()


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [151]:
X_imputed.head()

Unnamed: 0,0,1,2
0,22,7.25,S
1,38,71.2833,C
2,26,7.925,S
3,35,53.1,S
4,35,8.05,S


In [152]:
X_test_imputed.head()

Unnamed: 0,0,1,2
0,34.5,7.8292,Q
1,47.0,7.0,S
2,62.0,9.6875,Q
3,27.0,8.6625,S
4,22.0,12.2875,S


In [153]:
#substitute columns
X_imputed.columns=['Age','Fare','Embarked']
X_test_imputed.columns=['Age','Fare','Embarked']

X.drop(['Age','Fare','Embarked'],axis=1,inplace=True)

X.head()

Unnamed: 0,Pclass,Name,Sex,SibSp,Parch
0,3,"Braund, Mr. Owen Harris",male,1,0
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0
2,3,"Heikkinen, Miss. Laina",female,0,0
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0
4,3,"Allen, Mr. William Henry",male,0,0


In [154]:
X=pd.concat([X,X_imputed],axis=1,join='inner')

In [155]:
X_test=X_test_raw.copy()
X_test.drop(['Age','Fare','Embarked'],axis=1,inplace=True)
X_test=pd.concat([X_test,X_test_imputed],axis=1,join='inner')

In [156]:
X_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,SibSp,Parch,Ticket,Cabin,Age,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,0,0,330911,,34.5,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,1,0,363272,,47.0,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,0,0,240276,,62.0,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,0,0,315154,,27.0,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,1,1,3101298,,22.0,12.2875,S


In [157]:
#Create title register from Name
#Create identifications for passengers alone

datacleaner = [X,X_test]

for dataset in datacleaner:
    dataset['FamilySize'] = dataset['SibSp']+dataset['Parch']+1
    
    dataset['IsAlone']=1
    dataset['IsAlone'].loc[dataset['FamilySize']>1]=0
    
    dataset['Title']=dataset['Name'].str.split(", ",expand=True)[1].str.split(".",expand=True)[0]
    
    dataset['FareBin']=pd.qcut(dataset['Fare'],4)
    
    dataset['AgeBin']=pd.cut(dataset['Age'].astype(int),5)
    
stat_min =10
title_names=(X['Title'].value_counts()<stat_min)
X['Title']=X['Title'].apply(lambda x: 'Misc' if title_names.loc[x]==True else x)
print(X['Title'].value_counts())
print(X_test['Title'].value_counts())
X.info()

Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: Title, dtype: int64
Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Dr          1
Ms          1
Dona        1
Name: Title, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Pclass      891 non-null    int64   
 1   Name        891 non-null    object  
 2   Sex         891 non-null    object  
 3   SibSp       891 non-null    int64   
 4   Parch       891 non-null    int64   
 5   Age         891 non-null    object  
 6   Fare        891 non-null    object  
 7   Embarked    891 non-null    object  
 8   FamilySize  891 non-null    int64   
 9   IsAlone     891 non-null    int64   
 10  Title       891 non-null    object  
 11  FareBin     891 non-null    category
 12  AgeBin      891 non-null    category
dtypes: category(

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [158]:
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
for dataset in datacleaner:
    dataset['Sex_Code']=label.fit_transform(dataset['Sex'])
    dataset['AgeBin_Code']=label.fit_transform(dataset['AgeBin'])
    dataset['FareBin_Code']=label.fit_transform(dataset['FareBin'])
    dataset['Embarked_Code']=label.fit_transform(dataset['Embarked'])
    dataset['Title_Code']=label.fit_transform(dataset['Title'])
    


In [159]:
X_dummie=pd.get_dummies(X)



In [160]:
columns_toModel = ['Pclass','SibSp','Parch','FamilySize','IsAlone','Sex_Code','AgeBin_Code','FareBin_Code','Embarked_Code','Title_Code']
X_train_cleaned = X[columns_toModel]
X_test_cleaned=X_test[columns_toModel]

print(X_train_cleaned.info())
print(X_test_cleaned.info())

print(X_train_cleaned.head())
print(X_test_cleaned.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Pclass         891 non-null    int64
 1   SibSp          891 non-null    int64
 2   Parch          891 non-null    int64
 3   FamilySize     891 non-null    int64
 4   IsAlone        891 non-null    int64
 5   Sex_Code       891 non-null    int32
 6   AgeBin_Code    891 non-null    int32
 7   FareBin_Code   891 non-null    int32
 8   Embarked_Code  891 non-null    int32
 9   Title_Code     891 non-null    int32
dtypes: int32(5), int64(5)
memory usage: 52.3 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Pclass         418 non-null    int64
 1   SibSp          418 non-null    int64
 2   Parch          418 non-null    int64
 3   FamilySize     41

In [161]:
X_train,X_valid,y_train,y_valid = train_test_split(X_train_cleaned,y,train_size=0.75,test_size=0.25,random_state=0)

In [162]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import mean_absolute_error

model=XGBClassifier(n_estimators=100, learning_rate=0.05)

score = -1 * cross_val_score(model,X_train,y_train,
                            cv=5,
                            scoring='neg_mean_absolute_error')
print(score.mean())

0.16468409830546515


In [166]:
model.fit(X_train,y_train)
train_preds = model.predict(X_valid)
mae=mean_absolute_error(y_valid,train_preds)
print(mae)
final_preds=model.predict(X_test_cleaned)
print(final_preds.shape)
print(X_test_cleaned.shape)
print(final_preds)

0.17488789237668162
(418,)
(418, 10)
[0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0 1
 1 0 0 0 1 0 1 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 1 1 1 0 1 1
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0
 1 0 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1
 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 1 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0
 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0
 0 1 0 1 1 1 0 1 0 0 0]


In [170]:
output = pd.DataFrame({'PassengerId':X_test_raw['PassengerId'],
                      'Survived':final_preds})
output.to_csv('titanic_model_3.csv',index=False)

In [169]:
print(X_test_raw)

     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Cabin Embarked  
0      male  34.5      0      0 