In [1]:
#imports
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

In [2]:
t_data = pd.read_csv('train.csv', index_col = 'PassengerId')
t_test = pd.read_csv('test.csv', index_col = 'PassengerId')
combined = t_data.append(t_test)
combined

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


### Check out column info

In [3]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    1309 non-null   int64  
 2   Name      1309 non-null   object 
 3   Sex       1309 non-null   object 
 4   Age       1046 non-null   float64
 5   SibSp     1309 non-null   int64  
 6   Parch     1309 non-null   int64  
 7   Ticket    1309 non-null   object 
 8   Fare      1308 non-null   float64
 9   Cabin     295 non-null    object 
 10  Embarked  1307 non-null   object 
dtypes: float64(3), int64(3), object(5)
memory usage: 122.7+ KB


1. Cabin - majority nulls
2. Age - some nulls
3. Embarked - 2 nulls
4. Fare - 1 null

### Create X and features

Drop Cabin (77% Null) , Ticket (unique objects), and Name (unique objects)

In [4]:
features = ['Pclass','Age','SibSp','Parch','Fare']

### Change Sex to Boolean

In [5]:
combined.loc[combined.Sex == 'male', 'Sex'] = False
combined.loc[combined.Sex == 'female', 'Sex'] = True
combined.rename(columns = {'Sex':'Is_Female'}, inplace = True)
combined['Is_Female'] = combined['Is_Female'].apply(bool)

# add IsFemale to Features
features.append('Is_Female')
combined

Unnamed: 0_level_0,Survived,Pclass,Name,Is_Female,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",False,22.0,1,0,A/5 21171,7.2500,,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",True,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",True,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",True,35.0,1,0,113803,53.1000,C123,S
5,0.0,3,"Allen, Mr. William Henry",False,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",False,,0,0,A.5. 3236,8.0500,,S
1306,,1,"Oliva y Ocana, Dona. Fermina",True,39.0,0,0,PC 17758,108.9000,C105,C
1307,,3,"Saether, Mr. Simon Sivertsen",False,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1308,,3,"Ware, Mr. Frederick",False,,0,0,359309,8.0500,,S


### Deal with Missing Values

In [6]:
combined.Cabin.fillna('U', inplace=True)
combined.Embarked.fillna('U',inplace=True)
combined.Age.fillna(combined.iloc[:891].Age.median())
combined.Fare.fillna(combined.iloc[:891].Fare.mean())

PassengerId
1         7.2500
2        71.2833
3         7.9250
4        53.1000
5         8.0500
          ...   
1305      8.0500
1306    108.9000
1307      7.2500
1308      8.0500
1309     22.3583
Name: Fare, Length: 1309, dtype: float64

In [7]:
combined

Unnamed: 0_level_0,Survived,Pclass,Name,Is_Female,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.0,3,"Braund, Mr. Owen Harris",False,22.0,1,0,A/5 21171,7.2500,U,S
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",True,38.0,1,0,PC 17599,71.2833,C85,C
3,1.0,3,"Heikkinen, Miss. Laina",True,26.0,0,0,STON/O2. 3101282,7.9250,U,S
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",True,35.0,1,0,113803,53.1000,C123,S
5,0.0,3,"Allen, Mr. William Henry",False,35.0,0,0,373450,8.0500,U,S
...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",False,,0,0,A.5. 3236,8.0500,U,S
1306,,1,"Oliva y Ocana, Dona. Fermina",True,39.0,0,0,PC 17758,108.9000,C105,C
1307,,3,"Saether, Mr. Simon Sivertsen",False,38.5,0,0,SOTON/O.Q. 3101262,7.2500,U,S
1308,,3,"Ware, Mr. Frederick",False,,0,0,359309,8.0500,U,S


### Create Cabin Unknown Col

In [8]:
combined.loc[combined.Cabin != 'U', 'Cabin'] = False
combined.loc[combined.Cabin == 'U', 'Cabin'] = True
combined.rename(columns = {'Cabin':'Cabin_Unknown'}, inplace = True)
combined['Cabin_Unknown'] = combined['Cabin_Unknown'].apply(bool)
combined

features.append('Cabin_Unknown')

### One-Hot Encoding for Embarked

In [9]:
# Apply one-hot encoder to Embarked
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_encoded = pd.DataFrame(OH_encoder.fit_transform(combined['Embarked'].to_frame()))

# name encoded columns
X_encoded.columns = OH_encoder.get_feature_names(['Embarked'])

#create list of encoded columns
X_encoded_col = X_encoded.columns.tolist()

# re index
X_encoded.index = combined.index

# add to main dataframe
combined = pd.concat([combined, X_encoded], axis= 'columns')

combined

Unnamed: 0_level_0,Survived,Pclass,Name,Is_Female,Age,SibSp,Parch,Ticket,Fare,Cabin_Unknown,Embarked,Embarked_C,Embarked_Q,Embarked_S,Embarked_U
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0.0,3,"Braund, Mr. Owen Harris",False,22.0,1,0,A/5 21171,7.2500,True,S,0.0,0.0,1.0,0.0
2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",True,38.0,1,0,PC 17599,71.2833,False,C,1.0,0.0,0.0,0.0
3,1.0,3,"Heikkinen, Miss. Laina",True,26.0,0,0,STON/O2. 3101282,7.9250,True,S,0.0,0.0,1.0,0.0
4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",True,35.0,1,0,113803,53.1000,False,S,0.0,0.0,1.0,0.0
5,0.0,3,"Allen, Mr. William Henry",False,35.0,0,0,373450,8.0500,True,S,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,"Spector, Mr. Woolf",False,,0,0,A.5. 3236,8.0500,True,S,0.0,0.0,1.0,0.0
1306,,1,"Oliva y Ocana, Dona. Fermina",True,39.0,0,0,PC 17758,108.9000,False,C,1.0,0.0,0.0,0.0
1307,,3,"Saether, Mr. Simon Sivertsen",False,38.5,0,0,SOTON/O.Q. 3101262,7.2500,True,S,0.0,0.0,1.0,0.0
1308,,3,"Ware, Mr. Frederick",False,,0,0,359309,8.0500,True,S,0.0,0.0,1.0,0.0


### Cross Validation

In [10]:
data = combined.iloc[:891]
test = combined.iloc[891:]

features.extend(X_encoded_col)
X = data[features]
y = data.Survived 
features

['Pclass',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Is_Female',
 'Cabin_Unknown',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S',
 'Embarked_U']

In [11]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pclass         891 non-null    int64  
 1   Age            714 non-null    float64
 2   SibSp          891 non-null    int64  
 3   Parch          891 non-null    int64  
 4   Fare           891 non-null    float64
 5   Is_Female      891 non-null    bool   
 6   Cabin_Unknown  891 non-null    bool   
 7   Embarked_C     891 non-null    float64
 8   Embarked_Q     891 non-null    float64
 9   Embarked_S     891 non-null    float64
 10  Embarked_U     891 non-null    float64
dtypes: bool(2), float64(6), int64(3)
memory usage: 71.3 KB


In [12]:
model = XGBClassifier(n_estimators=500)
cv = cross_val_score(model, X, y, cv = 5)
cv.mean()



















0.8002573598644153

### XGBoost

In [13]:
model.fit(X,y)
preds = model.predict(test[features]).astype(int)





In [14]:
output = pd.DataFrame({'PassengerId': test.index,
                       'Survived': preds})
output.to_csv('submission4.csv', index=False)