In [1]:
#imports
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [2]:
t_data = pd.read_csv('train.csv', index_col = 'PassengerId')
t_test = pd.read_csv('test.csv', index_col = 'PassengerId')
t_data

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Check out column info

In [3]:
t_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


### Create X and Y 

Drop Cabin (77% Null) , Ticket (unique objects), and Name (unique objects)

In [4]:
features = ['Pclass','Age','SibSp','Parch','Fare']
X = t_data[features]
y = t_data.Survived 

### Change Sex to Boolean

In [5]:
t_data.loc[t_data.Sex == 'male', 'Sex'] = False
t_data.loc[t_data.Sex == 'female', 'Sex'] = True
t_data.rename(columns = {'Sex':'IsFemale'}, inplace = True)
#t_test
t_test.loc[t_test.Sex == 'male', 'Sex'] = False
t_test.loc[t_test.Sex == 'female', 'Sex'] = True
t_test.rename(columns = {'Sex':'IsFemale'}, inplace = True)
# add IsFemale to Features
features.append('IsFemale')

### Drop all NaN -- fix this in next version.  Age is 20% NaN so dont drop that.  Decide whether to drop the 3 rows with embarked NaN

In [6]:
t_data = t_data.fillna(0)

t_test = t_test.fillna(0)

In [7]:
t_data

Unnamed: 0_level_0,Survived,Pclass,Name,IsFemale,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",False,22.0,1,0,A/5 21171,7.2500,0,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",True,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",True,26.0,0,0,STON/O2. 3101282,7.9250,0,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",True,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",False,35.0,0,0,373450,8.0500,0,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",False,27.0,0,0,211536,13.0000,0,S
888,1,1,"Graham, Miss. Margaret Edith",True,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",True,0.0,1,2,W./C. 6607,23.4500,0,S
890,1,1,"Behr, Mr. Karl Howell",False,26.0,0,0,111369,30.0000,C148,C


### One-Hot Encoding for Embarked (Next Version)

In [17]:
# drop 0
t_data = t_data.loc[t_data['Embarked'] != 0]

# Apply one-hot encoder to Embarked
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
X_encoded = pd.DataFrame(OH_encoder.fit_transform(t_data['Embarked'].to_frame()))

#create list of encoded columns
X_encoded.columns = OH_encoder.get_feature_names(['Embarked'])
embarked_cols = X_encoded.columns

# add to main dataframe
t_data2 = pd.concat([t_data, X_encoded], axis= 'columns')
t_data2

Unnamed: 0,Survived,Pclass,Name,IsFemale,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Embarked_C,Embarked_Q,Embarked_S
0,,,,,,,,,,,,0.0,0.0,1.0
1,0.0,3.0,"Braund, Mr. Owen Harris",False,22.0,1.0,0.0,A/5 21171,7.2500,0,S,1.0,0.0,0.0
2,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",True,38.0,1.0,0.0,PC 17599,71.2833,C85,C,0.0,0.0,1.0
3,1.0,3.0,"Heikkinen, Miss. Laina",True,26.0,0.0,0.0,STON/O2. 3101282,7.9250,0,S,0.0,0.0,1.0
4,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",True,35.0,1.0,0.0,113803,53.1000,C123,S,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0.0,2.0,"Montvila, Rev. Juozas",False,27.0,0.0,0.0,211536,13.0000,0,S,1.0,0.0,0.0
888,1.0,1.0,"Graham, Miss. Margaret Edith",True,19.0,0.0,0.0,112053,30.0000,B42,S,0.0,1.0,0.0
889,0.0,3.0,"Johnston, Miss. Catherine Helen ""Carrie""",True,0.0,1.0,2.0,W./C. 6607,23.4500,0,S,,,
890,1.0,1.0,"Behr, Mr. Karl Howell",False,26.0,0.0,0.0,111369,30.0000,C148,C,,,


### Cross Validation

In [16]:
model = XGBRegressor(n_estimators=500)
cv = cross_val_score(model, X, y, cv = 5)
cv.mean()

-0.14934109521858532

### XGBoost

In [None]:
model.fit(X,y)
preds = model.predict(t_test[features])
preds = preds.astype(int)

In [None]:
output = pd.DataFrame({'PassengerId': t_test.index,
                       'Survived': preds})
output.to_csv('submission2.csv', index=False)

In [None]:
sub = pd.read_csv('submission2.csv')
sub
sub.Survived.value_counts()