In [24]:
import pandas as pd
import seaborn as sns


In [25]:
df = sns.load_dataset("titanic")
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [26]:
from sklearn.model_selection import  KFold, RepeatedKFold,  train_test_split,cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

**Scikit learn random forest only takes numbers as input,
types need to be converted into numbers somehow.**

In [27]:
df.dtypes

Unnamed: 0,0
survived,int64
pclass,int64
sex,object
age,float64
sibsp,int64
parch,int64
fare,float64
embarked,object
class,category
who,object


**1.) Boolean can be converted to 0 and 1 automaticaly**

In [28]:
df['adult_male'].head()

Unnamed: 0,adult_male
0,True
1,False
2,False
3,False
4,True


This can not handle NULL values

In [29]:
df['adult_male'].astype(int).head()


Unnamed: 0,adult_male
0,1
1,0
2,0
3,0
4,1


While this can hangle NULL values

In [30]:
df['adult_male'].astype('Int64').head()

Unnamed: 0,adult_male
0,1
1,0
2,0
3,0
4,1


In [31]:
df_temp = pd.DataFrame({'alive': [True, False, None, True]})
df_temp['alive_mapped'] = df_temp['alive'].astype('Int64')
df_temp['alive_mapped']

Unnamed: 0,alive_mapped
0,1.0
1,0.0
2,
3,1.0


**2.) Labeled data that is boolean in nature is coverted via dictionary mapping**

In [32]:
df['alive'].head()

Unnamed: 0,alive
0,no
1,yes
2,yes
3,yes
4,no


In [33]:
df['alive'].map({'yes': 1, 'no': 0}).head()


Unnamed: 0,alive
0,0
1,1
2,1
3,1
4,0


This operation leaves NaN values as is

In [34]:
df_temp = pd.DataFrame({'alive': ['yes', 'no', None, 'yes']})
df_temp['alive_mapped'] = df_temp['alive'].map({'yes': 1, 'no': 0})
df_temp['alive_mapped']

Unnamed: 0,alive_mapped
0,1.0
1,0.0
2,
3,1.0


**3.) Nominal categorical vairables, have no inherent order or ranking. These can be converted via get_dummies,
that creates boolean variable, that show whether something is present or no**

In [35]:
df['embark_town'].unique()

array(['Southampton', 'Cherbourg', 'Queenstown', nan], dtype=object)

pandas.get_dummies():

drops the original categorical columns and replaces them by the new dummy indicator columns.

In [36]:
pd.get_dummies(df, columns=['who', 'embarked', 'embark_town'] ).dtypes

Unnamed: 0,0
survived,int64
pclass,int64
sex,object
age,float64
sibsp,int64
parch,int64
fare,float64
class,category
adult_male,bool
deck,category


Parameter, dummy_na, specifies how Nan columns are handled, False by default, columns, then NaN will have all columns set as 0.

In [37]:
df_test = pd.DataFrame({'color': ['red', 'blue', None]})
pd.get_dummies(df_test['color'], dummy_na=False)

Unnamed: 0,blue,red
0,False,True
1,True,False
2,False,False


In [38]:
df_test = pd.DataFrame({'color': ['red', 'blue', None]})
pd.get_dummies(df_test['color'], dummy_na=True)

Unnamed: 0,blue,red,NaN
0,False,True,False
1,True,False,False
2,False,False,True


In [39]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


**4.) Ordinal categorical vairables, are variables,
that have - natural order or ranking, but without fixed numeric distances. Listed bellow are ways to achieve this.**

In [40]:
df['class']

Unnamed: 0,class
0,Third
1,First
2,Third
3,First
4,Third
...,...
886,Second
887,First
888,Third
889,First


1.) Using pandas native categorical type.

In [41]:
from pandas.api.types import CategoricalDtype

order = ["First", "Second", "Third"]
cat_type = CategoricalDtype(categories=order, ordered=True)

df["class_mapping"] = df["class"].astype(cat_type)
df[["class", "class_mapping"]]

Unnamed: 0,class,class_mapping
0,Third,Third
1,First,First
2,Third,Third
3,First,First
4,Third,Third
...,...,...
886,Second,Second
887,First,First
888,Third,Third
889,First,First


This creates a pandas native category type

In [42]:
df['class_mapping']

Unnamed: 0,class_mapping
0,Third
1,First
2,Third
3,First
4,Third
...,...
886,Second
887,First
888,Third
889,First


Which cab be mapped like this to integers

In [43]:
df["class_mapping"] = df["class"].astype(cat_type).cat.codes
df['class_mapping']

Unnamed: 0,class_mapping
0,2
1,0
2,2
3,0
4,2
...,...
886,1
887,0
888,2
889,0


2.) Mapping via dictionary of ints

In [44]:
mapping = {"First": 1, "Second": 2, "Third": 3}
df["class_mapping"] =  df['class'].map({'First': 1, 'Second': 2, 'Third': 3})
df[["class", "class_mapping"]]

Unnamed: 0,class,class_mapping
0,Third,3
1,First,1
2,Third,3
3,First,1
4,Third,3
...,...,...
886,Second,2
887,First,1
888,Third,3
889,First,1


3.) Using ordinal encoder

In [45]:
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(categories=[["First", "Second", "Third"]])
df["class_mapping"] = encoder.fit_transform(df[["class"]])
df[["class", "class_mapping"]]

Unnamed: 0,class,class_mapping
0,Third,2.0
1,First,0.0
2,Third,2.0
3,First,0.0
4,Third,2.0
...,...,...
886,Second,1.0
887,First,0.0
888,Third,2.0
889,First,0.0


# Random forest

Random needs all input features to be numberic somehown(integer or float) and to lack missing values.

What is needed for preprocesing and how to train and validate on random forest classifier.

In [46]:
df = sns.load_dataset("titanic")
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [47]:
df = sns.load_dataset("titanic")

bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)

df = pd.get_dummies(df, columns= ['deck', 'sex', 'embarked', 'who', 'embark_town'])

df['alive'] = df['alive'].map({'yes': 1, 'no': 0})
df['class'] = df['class'].map({'First': 1, 'Second': 2, 'Third': 3})


Random forest needs to work on non null columnns

In [48]:
columns_only_null = df.columns[df.isnull().sum()  > 0]
columns_only_null

Index(['age'], dtype='object')

In [49]:
df['age'] = df['age'].fillna(df['age'].mean())

In [50]:
df

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,class,adult_male,alive,alone,...,sex_male,embarked_C,embarked_Q,embarked_S,who_child,who_man,who_woman,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,3,22.000000,1,0,7.2500,3,1,0,0,...,True,False,False,True,False,True,False,False,False,True
1,1,1,38.000000,1,0,71.2833,1,0,1,0,...,False,True,False,False,False,False,True,True,False,False
2,1,3,26.000000,0,0,7.9250,3,0,1,1,...,False,False,False,True,False,False,True,False,False,True
3,1,1,35.000000,1,0,53.1000,1,0,1,0,...,False,False,False,True,False,False,True,False,False,True
4,0,3,35.000000,0,0,8.0500,3,1,0,1,...,True,False,False,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,2,1,0,1,...,True,False,False,True,False,True,False,False,False,True
887,1,1,19.000000,0,0,30.0000,1,0,1,1,...,False,False,False,True,False,False,True,False,False,True
888,0,3,29.699118,1,2,23.4500,3,0,0,0,...,False,False,False,True,False,False,True,False,False,True
889,1,1,26.000000,0,0,30.0000,1,1,1,1,...,True,True,False,False,False,True,False,True,False,False


In [51]:
X = df.drop("survived", axis=1)
y = df["survived"]

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [53]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

In [54]:
accuracy_score(clf.predict(X_test), y_test)

0.9776536312849162

**K-Fold Cross-Validation**

Split dataset into k folds;
train on k-1 folds,
validate on the remaining; repeat k times


In [55]:

# K-Fold CV (5 folds)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(clf, X, y, cv=kf)

print("K-Fold scores:", scores)
print("Mean accuracy:", scores.mean())


K-Fold scores: [0.97765363 0.98876404 0.97752809 0.9494382  0.97191011]
Mean accuracy: 0.9730588161446236


**RepeatedKFold Cross-Validation**

In [56]:

# Repeated K-Fold (5 folds, 3 repeats)
rkf = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)
scores = cross_val_score(clf, X, y, cv=rkf)

print("Repeated K-Fold scores:", scores)
print("Mean accuracy:", scores.mean())


Repeated K-Fold scores: [0.97765363 0.98876404 0.97752809 0.9494382  0.97191011 0.94413408
 0.95505618 0.95505618 0.96629213 0.94382022 0.99441341 0.96629213
 0.96629213 0.95505618 0.95505618]
Mean accuracy: 0.9644508610047497


**Out-of-Bag (OOB) Validation**



Random Forest bootstrap sampling leaves ~1/3 of data unused per tree; these OOB samples act as validation. This is built in random forest, requires no additional data.


In [57]:
clf_oob = RandomForestClassifier(oob_score=True, random_state=42)
clf_oob.fit(X, y)

print("OOB score:", clf_oob.oob_score_)


OOB score: 1.0


# Gradient boosting implementations

â€¢ 	If you want maximum performance and flexibility â†’ go with XGBoost.Â¸

â€¢ 	If you need speed on massive datasets â†’ choose LightGBM.

â€¢ 	If your data has lots of categorical variables â†’ CatBoost is the best fit

**XGBoost**



In [58]:
import xgboost as xgb


In [None]:
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_preds))


**LightGBM**


In [None]:
import lightgbm as lgb


In [None]:
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train, y_train)
lgb_preds = lgb_model.predict(X_test)
print("LightGBM Accuracy:", accuracy_score(y_test, lgb_preds))

**CatBoost**

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
cat_model = CatBoostClassifier(verbose=0)
cat_model.fit(X_train, y_train)
cat_preds = cat_model.predict(X_test)
print("CatBoost Accuracy:", accuracy_score(y_test, cat_preds))
