# Imports

In [102]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error
import optuna as opt
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV



# Importing Data

In [103]:
# Read the train and test CSV files into DataFrames.
# The paths are relative, so they resolve against the current working directory.
df = pd.read_csv("./../data/train_data.csv")
df_test = pd.read_csv("./../data/test_data.csv")


# Data Exploration


In [104]:
# Peek at the first rows and at the column dtypes / non-null counts.
# Only the last expression of a cell is rendered automatically, so the
# preview has to be shown explicitly with display().
display(df.head(2))
df.info()

# Checking NaN for Ticket
ticketNa = df["Ticket"].isnull().sum()
print(f"Ammount of Nan Values in Ticket Series: {ticketNa}")

# Checking NaN for Fare
fareNa = df["Fare"].isnull().sum()
print(f"Ammount of Nan Values in Fare Series: {fareNa}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1493 entries, 0 to 1492
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1493 non-null   int64  
 1   Survived     1493 non-null   int64  
 2   Pclass       1493 non-null   int64  
 3   Sex          1493 non-null   int64  
 4   SibSp        1493 non-null   int64  
 5   Parch        1493 non-null   int64  
 6   Ticket       1493 non-null   object 
 7   Fare         1493 non-null   object 
 8   Cabin        1493 non-null   int64  
 9   Embarked     1493 non-null   int64  
 10  Age          1488 non-null   float64
dtypes: float64(1), int64(8), object(2)
memory usage: 128.4+ KB
Ammount of Nan Values in Ticket Series: 0
Ammount of Nan Values in Fare Series: 0


# Data Cleaning

## Remove duplicates

In [105]:
# Drop rows that are exact duplicates of an earlier row (all columns equal).
df = df.drop_duplicates()



## Remove NaN values

In [106]:
# Drop every row that has a missing value in any column.
df = df.dropna()


## Cast Fare and Ticket

In [107]:

 # Ticket and Fare are loaded as object columns because some cells hold
 # placeholder text instead of numbers. Rather than listing every known junk
 # string, coerce anything that is not numeric to NaN and then replace it
 # with 0 - the same value the junk strings were previously mapped to.
 # (IPython strips the uniform leading indent of this cell before running it.)
 for column, dtype in (("Ticket", int), ("Fare", float)):
     numeric = pd.to_numeric(df[column], errors="coerce")
     # Casting type: Ticket becomes int, Fare becomes float.
     df[column] = numeric.fillna(0).astype(dtype)

## Clean Age

In [108]:
# Keep only rows whose Age lies strictly between 0 and 120.
age_in_range = (df["Age"] > 0) & (df["Age"] < 120)
df = df[age_in_range]

# Model creation

## Define X and y

In [109]:
# Summary statistics of the cleaned frame, as a sanity check before modelling.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age
count,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0
mean,454.826248,0.401109,2.232902,0.643253,0.491682,0.438078,246.210721,34.536498,17.046211,1.57671,29.925453
std,258.874908,0.490577,0.835169,0.479483,0.897747,0.864073,191.300859,49.352427,36.320279,0.79357,14.778928
min,1.0,0.0,1.0,0.0,0.0,0.0,-496.0,0.0,-1.0,-1.0,0.42
25%,231.0,0.0,1.0,0.0,0.0,0.0,109.0,8.05,-1.0,2.0,21.0
50%,461.0,0.0,2.0,1.0,0.0,0.0,266.0,15.75,-1.0,2.0,28.0
75%,680.0,1.0,3.0,1.0,1.0,1.0,403.0,33.0,-1.0,2.0,39.0
max,891.0,1.0,3.0,1.0,5.0,6.0,541.0,512.3292,133.0,2.0,80.0


In [110]:
# Age is the regression target; every remaining column is used as a feature.
X = df.drop(columns=["Age"])
y = df["Age"]

# NOTE(review): df_test is used as loaded - none of the cleaning steps applied
# to df above are applied to it. Confirm the test file is already clean.
X_test = df_test.drop(columns=["Age"])
y_test = df_test["Age"]

In [111]:
# Create the XGBoost regression model. XGBoost stands for: eXtreme Gradient Boosting.
# This is a very popular algorithm, used in machine learning competitions and in the industry.
# We will use it for regression, but it can also be used for classification.
# No arguments are passed, so the library's default hyperparameters are used.

model = xgb.XGBRegressor()

In [112]:
# Train the model on the training features X and target y by calling the .fit() method
model.fit(X, y)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [113]:
# Use the fitted model to predict the target values for the test set (X_test)
preds = model.predict(X_test)

In [114]:
# Mean squared error between the true test targets and the predictions
# (lower is better). scikit-learn's signature is
# mean_squared_error(y_true, y_pred), so the true values go first.
mse = mean_squared_error(y_test, preds)

In [115]:
# Take the square root of the MSE to get the RMSE. Unlike the MSE, the RMSE is
# expressed in the same units as the target, which makes it easier to interpret.

rmse = math.sqrt(mse)
rmse

12.46803431929343

# Hyperparameter tuning

In [116]:
# Candidate values for a handful of XGBoost hyperparameters.
# Hyperparameters are chosen by the user rather than learned during training
# (the learned values are the model parameters). XGBoost ships with defaults,
# and searching over alternatives to find a better setting is called
# hyperparameter tuning.
#
# Feel free to change the candidate values or add further hyperparameters.
# Full list: https://xgboost.readthedocs.io/en/latest/parameter.html
#
# The lists below span 6 * 8 * 4 * 5 * 4 * 8 = 30720 possible combinations.
params = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
    n_estimators=[100, 200, 300, 400, 500, 900, 1100, 1500],
)

In [117]:
# Use RandomizedSearchCV to find good hyperparameters for the model. There are
# other ways to do this, but random search will work for this purpose.
# Random search tries a given number (n_iter) of randomly sampled combinations
# from `params` and scores each one with 5-fold cross-validation.
# HINT: if n_iter is too high the search takes a long time to run; if it is too
# low it is unlikely to find a good combination. Try to find a happy medium.

# First, create a new, unfitted model with the default hyperparameters.
model2 = xgb.XGBRegressor()

# Hand the fresh estimator (model2) to the search, not the already-fitted baseline.
# random_state fixes which combinations are sampled, so re-running the
# notebook reproduces the same search.
random_search = RandomizedSearchCV(
    model2,
    param_distributions=params,
    n_iter=10,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    cv=5,
    random_state=42,
)

# Run the search on the training data (X, y); the test set is not used here.
random_search.fit(X, y)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.co

In [None]:
# Show the best hyperparameter combination found by the random search.
# Note: despite the variable name, this is a dict of parameters, not a model.
model_new = random_search.best_params_
model_new

{'n_estimators': 200,
 'min_child_weight': 5,
 'max_depth': 3,
 'learning_rate': 0.1,
 'gamma': 0.0,
 'colsample_bytree': 0.3}

In [None]:
# Retrieve the best model/estimator from the random search.
model_new = random_search.best_estimator_

In [None]:
# Create new predictions with the new model (this overwrites the earlier `preds`)
preds = model_new.predict(X_test)

In [None]:
# Get the new root mean squared error for the tuned model on the test set.
# scikit-learn's signature is mean_squared_error(y_true, y_pred), so the true
# values go first.
mse_new = mean_squared_error(y_test, preds)

rmse = math.sqrt(mse_new)
rmse

11.827914146314786