# Imports

In [179]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import math
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

## Importing the data

In [180]:
train_df = pd.read_csv("./../data/train_data.csv")
test_df = pd.read_csv("./../data/test_data.csv")

X
1. PassengerId: Unique id for each passenger
2. Survived: 0 = No, 1 = Yes
3. Pclass: Ticket class
4. Sex: sex of the passenger
5. SibSp: # of siblings / spouses aboard the Titanic
6. Parch: # of parents / children aboard the Titanic
7. Ticket: Ticket number
8. Fare: Passenger fare
9. Cabin: Cabin number (-1 means no data on cabin)
10. Embarked: Port of Embarkation

Y
1. Age: Age in years

In [181]:
# Get familiar with data
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age
0,329,1,3,0,1,1,346,20.525,-1,2,31.0
1,74,0,3,1,1,0,166,14.4542,-1,0,26.0
2,254,0,3,1,1,0,419,16.1,-1,2,30.0
3,720,0,3,1,0,0,260,7.775,-1,2,33.0
4,667,0,2,1,0,0,104,13.0,-1,2,25.0


# Data cleaning

As we can see from results .info() there are "Fare" and "Ticket" have Dtype object. But using .head() we see a numeric values. Probably there is NaN and we need to drop or transform them.

In [182]:
# Let's transform our non-alfanumeric values to NaN
train_df['Ticket'] = pd.to_numeric(train_df['Ticket'], errors='coerce')
train_df['Ticket'].isna().sum()

2

We can see that "Ticket" contains 2 NaN

In [183]:
# Let's transform our non-alfanumeric values to NaN in column "Fare"
train_df['Fare'] = pd.to_numeric(train_df['Fare'], errors='coerce')
train_df['Fare'].isna().sum()

4

"Fare" has 4 NaN

In [184]:
# It's nice to have some value there istead of NaN. Let's fill them with mean value
train_df[['Ticket', 'Fare']] = train_df[['Ticket', 'Fare']].fillna(train_df[['Ticket', 'Fare']].mean())

In [185]:
# Lets check dtypes again
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1493 entries, 0 to 1492
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1493 non-null   int64  
 1   Survived     1493 non-null   int64  
 2   Pclass       1493 non-null   int64  
 3   Sex          1493 non-null   int64  
 4   SibSp        1493 non-null   int64  
 5   Parch        1493 non-null   int64  
 6   Ticket       1493 non-null   float64
 7   Fare         1493 non-null   float64
 8   Cabin        1493 non-null   int64  
 9   Embarked     1493 non-null   int64  
 10  Age          1488 non-null   float64
dtypes: float64(3), int64(8)
memory usage: 128.4 KB


We can see that no more objects in our dataset. But Ticket represented as int, but we have it as float now. Let's make it int.

In [186]:
train_df['Ticket'] = train_df['Ticket'].astype('int64')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1493 entries, 0 to 1492
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1493 non-null   int64  
 1   Survived     1493 non-null   int64  
 2   Pclass       1493 non-null   int64  
 3   Sex          1493 non-null   int64  
 4   SibSp        1493 non-null   int64  
 5   Parch        1493 non-null   int64  
 6   Ticket       1493 non-null   int64  
 7   Fare         1493 non-null   float64
 8   Cabin        1493 non-null   int64  
 9   Embarked     1493 non-null   int64  
 10  Age          1488 non-null   float64
dtypes: float64(2), int64(9)
memory usage: 128.4 KB


Let's check descriptive statistics

In [187]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age
count,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1488.0
mean,597.697254,0.327528,2.261219,0.690556,0.184863,0.178835,249.171467,19.367148,5.994642,1.834561,215117.4
std,251.365652,0.469469,0.628908,0.462419,0.594308,0.592203,145.543518,33.055481,24.214966,0.538178,8295612.0
min,1.0,0.0,1.0,0.0,0.0,0.0,-496.0,0.0,-1.0,-1.0,-32000.0
25%,377.0,0.0,2.0,0.0,0.0,0.0,181.0,7.925,-1.0,2.0,-35.0
50%,758.0,0.0,2.0,1.0,0.0,0.0,181.0,10.5,-1.0,2.0,16.0
75%,813.0,1.0,3.0,1.0,0.0,0.0,426.0,11.1333,-1.0,2.0,24.0
max,891.0,1.0,3.0,1.0,5.0,6.0,541.0,512.3292,133.0,2.0,320000000.0


## Handle outliers

In [188]:
#There are many outliers in "Age" column. We need to handle them. 
#Setting our ages up to 100 as max
train_df = train_df[train_df['Age'] > 0]
train_df = train_df[train_df['Age'] <= 100] 

## Remove duplicates

In [189]:
train_df = train_df.drop_duplicates()
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 541 entries, 0 to 1492
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  541 non-null    int64  
 1   Survived     541 non-null    int64  
 2   Pclass       541 non-null    int64  
 3   Sex          541 non-null    int64  
 4   SibSp        541 non-null    int64  
 5   Parch        541 non-null    int64  
 6   Ticket       541 non-null    int64  
 7   Fare         541 non-null    float64
 8   Cabin        541 non-null    int64  
 9   Embarked     541 non-null    int64  
 10  Age          541 non-null    float64
dtypes: float64(2), int64(9)
memory usage: 50.7 KB


Finally we can commence our model creation!

# Model creation

In [190]:
# let y be the target column, and X be the rest of the df
model1 = xgb.XGBRegressor()
X = train_df.drop('Age', axis=1).copy()
y = train_df['Age'].copy()

As per Oblig_2 task - we need to use test_data.csv for evaluation the model, but let's fisrt make evaluation using train_test_split() from sklearn.model.selection. After that we'll make evaluation using test_data.csv and comparison of results.

In [191]:
# Split the data into train and test sets with the function train_test_split from sklearn. Use test_size=0.2 and random_state=42
# We use train_test_split to split the data into train and test sets. We will use the train set to train the model, and the test set 
# to evaluate the model.
# The reason we need a test set is to be able to evaluate the model. If we train the model on the whole dataset, 
# it will learn the dataset perfectly, but we will not know how it performs on unseen data.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [192]:
# use the training set (X_train, y_train) to train the model by calling the .fit() method
model1.fit(X_train, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [193]:
# Use the model to predict the target values for the test set (X_test)
preds = model1.predict(X_test)

In [194]:
# find the mean squared error for the predictions (a value to see the value of the predictions, lower is better)
# find the error between the y_test and the preds
mse1 = mean_squared_error(preds, y_test)
mse1

204.1140071310041

In [195]:
# Compute the root-mean-square
rms1 = np.sqrt(mse1)
rms1

14.28684734750827

Now let's check our test_df dataset for evaluation and compare results

In [196]:
# Use the model to predict the target values for the test set (X_test) from test_df (test_data.csv)
model2 = xgb.XGBRegressor();
X_test_df = test_df.drop('Age', axis=1).copy() # and made a copy
Y_test_df = test_df['Age'].copy()

In [197]:
model2.fit(X, y)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [198]:
preds2 = model2.predict(X_test_df)

In [199]:
# find the mean squared error for the predictions (a value to see the value of the predictions, lower is better)
# find the error between the y_test and the preds
mse2 = mean_squared_error(preds2, Y_test_df)
mse2

151.0929148403456

In [200]:
# Compute the root-mean-square
rms2 = np.sqrt(mse2)
rms2

12.291985797272368

Now we can see that our model2 which is based on test_data.csv has a better (less) root mean squared error than using train_test_split function from sklearn.

# Hyperparameter tuning

Let's try to improve our results using XGBoost hyperparameters tuning

In [201]:
# These are some of the hyperparameters you can tune for XGBoost. 
# A hyperparameter is a parameter that is not learned by the model, but is set by the user.
# The parameters that are learned by the model are called model parameters.
# The model starts off with some default values for the hyperparameters, but you can change them to get potentially better results.
# This process is called hyperparameter tuning.

# If you want, you can adjust the hyperparameters and see if you can get a better result. You can also add more hyperparameters to the 
# dictionary.
# List of hyperparameters: https://xgboost.readthedocs.io/en/latest/parameter.html
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
    "n_estimators": [100, 200, 300, 400, 500, 900, 1100, 1500],
}

In [202]:
# Use RandomizedSearchCV to find the best hyperparameters for the model. There are other ways to do this, but random search will work for this
# purpose.
# Random search is a method for hyperparameter tuning that will try a given number of random combinations of hyperparameters.
# Use the training set (X_train, y_train) to instantiate the random search by calling the .fit() method with the test set
# HINT: n_iter is the number of iterations to run the random search, if this number is too high, it will take a long time to run, 
# but if it's too low, it will not find the best hyperparameters. You should try to find a happy medium.

# First, create a new, similar model, but with the default hyperparameters. Do not fit this model with the training set.
random_search1 = RandomizedSearchCV(model1, param_distributions=params, n_iter=250, scoring="neg_mean_squared_error", n_jobs=-1, cv=5)
random_search2 = RandomizedSearchCV(model2, param_distributions=params, n_iter=250, scoring="neg_mean_squared_error", n_jobs=-1, cv=5)
# Fit the model with x and y train sets as results using train_test_split function from sklearn.
random_search1.fit(X_train, y_train) # Using test_train_split() function from sklearn
random_search2.fit(X, y) # Using test_data.csv

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [209]:
# Retrieve the best model/estimator from the random search
model_new1 = random_search1.best_params_
model_new2 = random_search2.best_params_
# We can compare both parameters sets for our models
print(f'new_model1 = {model_new1}')
print(f'new_model1 = {model_new2}')

new_model1 = {'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.4}
new_model1 = {'n_estimators': 100, 'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.4, 'colsample_bytree': 0.7}


In [204]:
model_new1 = random_search1.best_estimator_
model_new2 = random_search2.best_estimator_
print(type(model_new1))
print(type(model_new2))

<class 'xgboost.sklearn.XGBRegressor'>
<class 'xgboost.sklearn.XGBRegressor'>


In [205]:
# Create new predictions with the new model
preds1 = model_new1.predict(X_test)
preds2 = model_new2.predict(X_test_df)

In [206]:
# Get the new mean square error
new_mse1 = mean_squared_error(preds1, y_test)
new_mse2 = mean_squared_error(preds2, Y_test_df)
# Comparing both new mse
print(new_mse1)
print(new_mse2)

136.22276700923595
129.995984840456


In [207]:
# Compute the root-mean-square
new_rms1 = np.sqrt(new_mse1).round(1)
new_rms2 = np.sqrt(new_mse2).round(1)
print(f'After hyperparameters tuning first new model has rms = {new_rms1}')
print(f'After hyperparameters tuning second new model has rms = {new_rms2}')

After hyperparameters tuning first new model has rms = 11.7
After hyperparameters tuning second new model has rms = 11.4


As per results: 
1. After hyperparameters tuning new_model1 has rms = 11.7
2. After hyperparameters tuning new_model2 has rms = 11.4
3. We can observe that our results are improved and best hyperparameters for models are:
4. new_model1 = {'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.4}
5. new_model2 ={'n_estimators': 100, 'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.4, 'colsample_bytree': 0.7}

In [208]:
print(f"Relation between better error1 on the new_model1 and the old error1: {(new_rms1 / rms1)}")
print(f"Relation between better error2 on the new_model2 and the old error2: {(new_rms2 / rms2)}")

# If the new model did not perform better, this means that the default hyperparameters were better, but it is highly likely that even better ones exist.
# You can try to run the random search again, but with more iterations, or you can try to use GridSearchCV instead of RandomizedSearchCV ot test _every_ combination of hyperparameters.
# You can also edit the hyperparameters in the dictionary to see if you can get better results.

Relation between better error1 on the new model1 and the old error1: 0.8189350467190766
Relation between better error2 on the new model2 and the old error2: 0.9274335480057012


1. Relation between better error1 on the new model1 and the old error1: 0.8189350467190766
2. Relation between better error2 on the new model2 and the old error2: 0.9274335480057012