# Submission 1: XGBoost with new features

| Variable | Definition                      | Key                                         |
|----------|---------------------------------|---------------------------------------------|
| survival | Survival                        | 0 = No, 1 = Yes                             |
| pclass   | Ticket class                    | 1 = 1st, 2 = 2nd, 3 = 3rd                   |
| sex      | Sex                             |                                             |
| Age      | Age in years                    |                                             |
| sibsp    | # of siblings / spouses aboard the Titanic |                               |
| parch    | # of parents / children aboard the Titanic  |                               |
| ticket   | Ticket number                   |                                             |
| fare     | Passenger fare                  |                                             |
| cabin    | Cabin number                    |                                             |
| embarked | Port of Embarkation             | C = Cherbourg, Q = Queenstown, S = Southampton |

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [12]:
# Load train.csv and test.csv (paths relative to the working directory)
# into two DataFrames, in that order.
df_original, df_submission_original = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Preprocessing

### Filling Missing values

In [13]:
# Count the missing entries in every column of the training frame
df_original.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [14]:
# Most frequent value of each training column. mode() can return several
# rows when values tie; only its first row is kept.
column_modes = df_original.mode()
modes = column_modes.iloc[0]
modes

PassengerId                      1
Survived                       0.0
Pclass                         3.0
Name           Abbing, Mr. Anthony
Sex                           male
Age                           24.0
SibSp                          0.0
Parch                          0.0
Ticket                        1601
Fare                          8.05
Cabin                      B96 B98
Embarked                         S
Name: 0, dtype: object

In [15]:
# Impute every missing value with the training-set mode of its column.
# The submission frame receives the SAME training modes: otherwise its missing
# values stay NaN and are treated differently from train (e.g. a missing Cabin
# becomes the mode cabin's deck in train but ends up as `Deck_nan` in test).
# Labels in `modes` that a frame does not contain (such as Survived in the
# submission frame) are simply skipped by fillna.
df_original.fillna(modes, inplace=True)
df_submission_original.fillna(modes, inplace=True)

In [16]:
# Confirm the training frame has no missing values left after imputation
df_original.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [17]:
def add_features(df):
    """Add the engineered feature columns to ``df`` in place.

    Reads the columns ``Fare``, ``Cabin``, ``SibSp``, ``Parch``, ``Ticket``
    and ``Name`` and writes, in this order:

    - ``LogFare``: ``log(1 + Fare)``.
    - ``Deck``: first character of ``Cabin`` grouped into ``"ABC"``, ``"DE"``
      or ``"FG"``; any other value becomes NaN.
    - ``Family``: ``SibSp + Parch``.
    - ``Alone``: True where ``Family`` is 0.
    - ``TicketFreq``: number of rows of ``df`` sharing the same ``Ticket``.
    - ``Title``: the text between ``", "`` and the first ``"."`` in ``Name``,
      kept only when it is Mr, Miss, Mrs or Master; any other title becomes NaN.

    Returns ``None``; the frame passed in is modified.
    """
    deck_groups = {
        "A": "ABC", "B": "ABC", "C": "ABC",
        "D": "DE", "E": "DE",
        "F": "FG", "G": "FG",
    }
    kept_titles = {"Mr": "Mr", "Miss": "Miss", "Mrs": "Mrs", "Master": "Master"}

    df['LogFare'] = np.log1p(df['Fare'])
    df['Deck'] = df['Cabin'].str[0].map(deck_groups)
    df['Family'] = df['SibSp'] + df['Parch']
    df['Alone'] = df['Family'] == 0
    df['TicketFreq'] = df.groupby('Ticket')['Ticket'].transform('count')

    # "Surname, Title. Given names" -> "Title"
    after_surname = df['Name'].str.split(', ', expand=True)[1]
    raw_title = after_surname.str.split('.', expand=True)[0]
    df['Title'] = raw_title.map(kept_titles)

# Engineer the same features on the training and the submission frames
for frame in (df_original, df_submission_original):
    add_features(frame)

In [18]:
# Columns that get one-hot encoded
categorical_cols = [
    'Sex',
    'Pclass',
    'Embarked',
    'Deck',
    'Title',
]

# Columns that get standardised
numerical_cols = [
    'Age',
    'SibSp',
    'Parch',
    'LogFare',
    'Alone',
    'TicketFreq',
    'Family',
]

# Prediction target
y_cols = ['Survived']

In [32]:
# Model inputs: the same feature columns, in the same order, for both frames.
# Copies are taken so later preprocessing does not touch the source frames.
feature_cols = categorical_cols + numerical_cols
df = df_original[feature_cols].copy()
df_submission = df_submission_original[feature_cols].copy()

# Target, kept as a one-column DataFrame because it is selected with the
# list `y_cols`.
y = df_original[y_cols].copy()


### Preprocessing Categorical Variables

In [33]:
# Number of submission rows per ticket class, ordered by class label
pclass_counts = df_submission['Pclass'].value_counts()
pclass_counts.sort_index()

Pclass
1    107
2     93
3    218
Name: count, dtype: int64

In [34]:
## One-hot encode the categorical variables
# drop='first' drops the first category of every feature, so a feature with
# k categories yields k-1 indicator columns.
encoder = OneHotEncoder(drop='first')

# Learn the categories from the training frame only, then reuse them for the
# submission frame so both get identical columns.
encoded_train = encoder.fit_transform(df[categorical_cols])
encoded_test = encoder.transform(df_submission[categorical_cols])

# Wrap the encoded matrices in DataFrames that carry the encoder's column
# names AND the index of the frame they came from. concat(axis=1) aligns on
# the index, so a fresh 0..n-1 index would silently misalign rows (and create
# NaN rows) whenever the source frame's index is not the default one.
encoded_names = encoder.get_feature_names_out(categorical_cols)
df_train_encoded = pd.DataFrame(
    encoded_train.toarray(), columns=encoded_names, index=df.index
)
df_test_encoded = pd.DataFrame(
    encoded_test.toarray(), columns=encoded_names, index=df_submission.index
)

# Replace the raw categorical columns with their indicator columns
df = pd.concat([df.drop(columns=categorical_cols), df_train_encoded], axis=1)
df_submission = pd.concat(
    [df_submission.drop(columns=categorical_cols), df_test_encoded], axis=1
)
df

Unnamed: 0,Age,SibSp,Parch,LogFare,Alone,TicketFreq,Family,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S,Deck_DE,Deck_FG,Deck_nan,Title_Miss,Title_Mr,Title_Mrs,Title_nan
0,22.0,1,0,2.110213,False,1,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,38.0,1,0,4.280593,False,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,26.0,0,0,2.188856,True,1,0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,35.0,1,0,3.990834,False,2,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,35.0,0,0,2.202765,True,1,0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,2.639057,True,1,0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
887,19.0,0,0,3.433987,True,1,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
888,24.0,1,2,3.196630,False,2,3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
889,26.0,0,0,3.433987,True,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Standardizing the data

In [37]:
from sklearn.preprocessing import StandardScaler

# Create an instance of the StandardScaler
scaler = StandardScaler()

# Standardise the numerical features. The scaler is fitted on the training
# frame only and its statistics are reused for the submission frame.
# Each scaled frame carries the index of the frame it was computed from:
# column assignment from a DataFrame aligns on the index, so a fresh 0..n-1
# index would misalign rows if the target frame's index were anything else.
df[numerical_cols] = pd.DataFrame(
    scaler.fit_transform(df[numerical_cols]),
    columns=numerical_cols,
    index=df.index,
)
df_submission[numerical_cols] = pd.DataFrame(
    scaler.transform(df_submission[numerical_cols]),
    columns=numerical_cols,
    index=df_submission.index,
)
df[numerical_cols].head()

Unnamed: 0,Age,SibSp,Parch,LogFare,Alone,TicketFreq,Family
0,-0.497793,0.432793,-0.473674,-0.879741,-1.231645,-0.579162,0.05916
1,0.715048,0.432793,-0.473674,1.36122,-1.231645,-0.579162,0.05916
2,-0.194583,-0.474545,-0.473674,-0.79854,0.811922,-0.579162,-0.560975
3,0.48764,0.432793,-0.473674,1.062038,-1.231645,0.155928,0.05916
4,0.48764,-0.474545,-0.473674,-0.784179,0.811922,-0.579162,-0.560975


# Model Building with XGBoost

In [38]:
import xgboost as xgb

# Base classifier handed to the hyperparameter search; apart from
# enable_categorical every setting is left at the library default.
xgb_model = xgb.XGBClassifier(enable_categorical=True)

## Hyperparameter tuning with Randomized Search

In [39]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the randomized search samples from, per hyperparameter
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 4, 5],
    min_child_weight=[1, 2, 3],
    gamma=[0, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# 100 sampled configurations, each scored by 5-fold cross-validated accuracy
# with all available cores; random_state fixes which configurations are drawn.
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_grid,
    n_iter=100,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    random_state=42,
)

# Run the search on the training features and target
random_search.fit(df, y)

# Keep the winning configuration, its cross-validation score and the
# corresponding estimator
best_params = random_search.best_params_
best_score = random_search.best_score_
model = random_search.best_estimator_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'subsample': 0.6, 'n_estimators': 200, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 1.0}
Best Score: 0.854108342225849


In [40]:
# `model` is `random_search.best_estimator_`. With RandomizedSearchCV's default
# refit=True it has already been refitted on the whole training set using the
# best parameters, so it is used directly instead of being fitted a second time.

# Make predictions on the submission (test) data
y_pred_test = model.predict(df_submission)

In [41]:
# Submission file: passenger ids paired with the predicted Survived labels
ds_submission = df_submission_original[['PassengerId']].assign(Survived=y_pred_test)
ds_submission.to_csv('submission1.1.csv', index=False)

In [42]:
# Upload submission1.1.csv to the Kaggle "titanic" competition through the kaggle CLI.
# NOTE(review): this executes on every "Run All" and presumably needs Kaggle API
# credentials configured on the machine; confirm before re-running the notebook.
!kaggle competitions submit -c titanic -f submission1.1.csv -m "XGBoostClassifier with RandomizedSearchCV and feature engineering from FastAI course."

100%|██████████████████████████████████████| 2.77k/2.77k [00:00<00:00, 3.65kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster