### Decision Trees (CLASSIFIER REGRESSION)

### \DECISION TREE REGRESSION 

In [2]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
data = pd.read_csv('insurance.csv')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# preprocess the data

x = data.drop(columns=['charges'])
y = data['charges']

In [8]:
# encode the data

encoder = LabelEncoder()
cat_cols = list(data.select_dtypes(include="object").columns)
for col in cat_cols:
    x[col] = encoder.fit_transform(x[col])

x.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,3
1,18,1,33.77,1,0,2
2,28,1,33.0,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.88,0,0,1


In [9]:
#split the data 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state= 23)

# scale both train and test set 

scaler = StandardScaler()
scaler.fit(x_test)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)


In [15]:
# train the model 

model = DecisionTreeRegressor(random_state=23)
model.fit(x_train, y_train)
train_preds = model.predict(x_train)
test_preds = model.predict(x_test)

# evaluate the performance of the base model
train_rmse = root_mean_squared_error(y_train, train_preds)
test_rmse = root_mean_squared_error(y_test, test_preds)

print(f'Train score: {train_rmse}')
print(f'Test score: {test_rmse}')

Train score: 413.77689999135333
Test score: 6140.195210303281


### A slightly optimized Model

In [19]:
model = DecisionTreeRegressor(random_state=23, max_depth=4)
model.fit(x_train, y_train)
train_preds = model.predict(x_train)
test_preds = model.predict(x_test)

# evaluate the performance of the base model 
train_rmse = root_mean_squared_error(y_train, train_preds)
test_rmse = root_mean_squared_error(y_test, test_preds)

print(f'Train score: {train_rmse}')
print(f'Test score: {test_rmse}')

Train score: 4387.8194415568905
Test score: 4519.112521297888


In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [22]:
data = pd.read_csv('heart_disease.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,age,sex,chest pain type,resting bps,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,0,40,1,2,140,289.0,0,0,172,0,0.0,1,0
1,1,49,0,3,160,180.0,0,0,156,0,1.0,2,1
2,2,37,1,2,130,283.0,0,1,98,0,0.0,1,0
3,3,48,0,4,138,214.0,0,0,108,1,1.5,2,1
4,4,54,1,3,150,195.0,0,0,122,0,0.0,1,0


In [None]:
# preprocess the data

x = data.drop(columns=['target'])
y = data['target']

In [24]:
# split the data 
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size= 0.15, random_state=23, stratify=y)

# scale both the train and test 
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)


In [29]:
# Train the base model
model = DecisionTreeClassifier(random_state=23)
model.fit(x_train, y_train)
train_preds = model.predict(x_train)
test_preds = model.predict(x_test)

# evaluate the performance of the base model
print(f'train: {classification_report(y_train, train_preds)}')
print("*"*60)
print(f'test: {classification_report(y_test, test_preds)}')

train:               precision    recall  f1-score   support

           0       1.00      1.00      1.00        79
           1       1.00      1.00      1.00        78

    accuracy                           1.00       157
   macro avg       1.00      1.00      1.00       157
weighted avg       1.00      1.00      1.00       157

************************************************************
test:               precision    recall  f1-score   support

           0       0.74      0.70      0.72       449
           1       0.71      0.74      0.73       442

    accuracy                           0.72       891
   macro avg       0.72      0.72      0.72       891
weighted avg       0.72      0.72      0.72       891



In [31]:
# Applying early stopping 

model = DecisionTreeClassifier(random_state=23, max_depth=4)
model.fit(x_train, y_train)
train_preds = model.predict(x_train)
test_preds = model.predict(x_test)

print(f'train: {classification_report(y_train, train_preds)}')
print('*'*65)
print(f'test: {classification_report(y_test, test_preds)}')

train:               precision    recall  f1-score   support

           0       0.88      0.91      0.89        79
           1       0.91      0.87      0.89        78

    accuracy                           0.89       157
   macro avg       0.89      0.89      0.89       157
weighted avg       0.89      0.89      0.89       157

*****************************************************************
test:               precision    recall  f1-score   support

           0       0.73      0.71      0.72       449
           1       0.72      0.74      0.73       442

    accuracy                           0.73       891
   macro avg       0.73      0.73      0.73       891
weighted avg       0.73      0.73      0.73       891

