## Decision tree Model

In [None]:
# A Decision Tree is a supervised ML algorithm used for both classification and regression tasks. 
# It has 3 types of nodes:
# 1 --> Root Node
# 2 --> Internal Node  
# 3 --> Leaf Node

# 1) Root Node: 
#    - Represents the first feature selected for splitting the data.

# 2) Internal Nodes:
#    - Represent decision conditions (feature-based questions) that further split the data.

# 3) Leaf Node:
#    - Represents the final output (class/label/value).

# ---> How it works?

# 1) Select the best feature to split the data (using metrics like Gini or Information Gain).
# 2) Split the dataset based on that feature.
# 3) Repeat the process recursively for each branch.
# 4) Stop when one of these conditions is met:
#    - All data in a node belongs to the same class.
#    - Maximum depth is reached.
#    - No significant gain from further splitting (e.g., purity can't be improved much).

# ---> Key concepts used for splitting:

# 1) Gini Index:
#    - Measures the impurity of a dataset.
#    - Tells how often a randomly chosen element would be incorrectly labeled if randomly labeled 
#      according to the class distribution in the node.
#    - Formula:
#          Gini(D) = 1 - Σ (pi)^2
#        where:
#          - pi is the proportion of instances belonging to class i
#          - Σ means "sum over all classes"
#    - A Gini Index of 0 means the node is pure (only one class present).
#    - The higher the Gini, the more impurity.

#    Example:
#    Suppose we have 10 samples:
#       - Red = 4
#       - Green = 6
#    Proportion of Red = 4/10 = 0.4
#    Proportion of Green = 6/10 = 0.6
#    Gini = 1 - (0.4)^2 - (0.6)^2 
#         = 1 - 0.16 - 0.36 
#         = 0.48

# 2) Information Gain:
#    - Measures the reduction in entropy (uncertainty) after splitting the dataset on an attribute.
#    - Formula:
#         Information Gain = Entropy(parent) - Weighted avg. entropy(children)

#    Entropy Formula:
#         H(S) = - Σ pi * log2(pi)

#    Example:
#    Suppose we have 10 samples:
#       - 4 Green, 6 Red

#    Initial Entropy:
#       p_green = 4/10 = 0.4
#       p_red = 6/10 = 0.6
#       Entropy = -0.4*log2(0.4) - 0.6*log2(0.6) ≈ 0.971

#    After splitting into 2 groups:
#    Group 1 --> 3 Green, 1 Red
#       - p_green = 3/4 = 0.75
#       - p_red = 1/4 = 0.25
#       - Entropy ≈ -0.75*log2(0.75) - 0.25*log2(0.25) ≈ 0.811

#    Group 2 --> 5 Red, 1 Green
#       - p_red = 5/6 ≈ 0.833
#       - p_green = 1/6 ≈ 0.167
#       - Entropy ≈ -0.833*log2(0.833) - 0.167*log2(0.167) ≈ 0.650

#    Total Information Gain:
#        IG = 0.971 - [(4/10 * 0.811) + (6/10 * 0.650)]
#           = 0.971 - (0.324 + 0.390)
#           = 0.971 - 0.714
#           = 0.257 (Answer)

# ----->   Extra :-
# 3) Mean Squared Error (MSE): [Used for Regression Trees]
#    - Measures the average squared difference between actual and predicted values.
#    - Formula:
#         MSE = (1/n) * Σ (yi - ŷi)^2
#      where:
#         - yi is the actual value
#         - ŷi is the predicted value
#         - n is the number of data points


## DecisionTreeRegressor

In [None]:
# it is mainly used for numerical data

In [152]:
import numpy as np
import pandas as pd

In [154]:
df = pd.read_csv("C:\\Users\\hardi\\OneDrive - wipro\\Desktop\\datasets\\insurance - insurance.csv")

In [156]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [158]:
df = pd.get_dummies(df, columns = ['sex' , 'smoker' , 'region'])

In [160]:
df = df.astype(int)

In [162]:
df.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27,0,16884,1,0,0,1,0,0,0,1
1,18,33,1,1725,0,1,1,0,0,0,1,0
2,28,33,3,4449,0,1,1,0,0,0,1,0
3,33,22,0,21984,0,1,1,0,0,1,0,0
4,32,28,0,3866,0,1,1,0,0,1,0,0


In [164]:
x = df.drop(columns = ['charges'])
y = df['charges']

In [166]:
from sklearn.model_selection import train_test_split

In [168]:
x_train  , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state =42)

In [170]:
from sklearn.tree import DecisionTreeRegressor

In [172]:
dt = DecisionTreeRegressor()

In [174]:
dt.fit(x_train , y_train)

In [176]:
y_pred = dt.predict(x_test)

In [178]:
from sklearn.metrics import r2_score

In [180]:
r2_score(y_test , y_pred)

0.74644913968167

##  DecisionTreeClassifier

In [183]:
# Used for classification tasks — i.e., predicting categories or class labels.
# --> it is mainly used for categorical data.

In [185]:
df = pd.read_csv("C:\\Users\\hardi\\OneDrive - wipro\\Desktop\\datasets\\Social_Network_Ads - Social_Network_Ads.csv")

In [187]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [189]:
df = pd.get_dummies(df, columns = ['Gender'])


In [191]:
x = df.drop(columns = ['Purchased'])
y = df['Purchased']

In [193]:
from sklearn.model_selection import train_test_split

In [195]:
x_train , x_test, y_train ,y_test = train_test_split(x,y,test_size=0.2,random_state =42)

In [197]:
from sklearn.tree import DecisionTreeClassifier

In [199]:
dt = DecisionTreeClassifier()

In [201]:
dt.fit(x_train , y_train)

In [211]:
y_pred = dt.predict(x_test)

In [213]:
from sklearn.metrics import accuracy_score

In [215]:
accuracy_score(y_test , y_pred)

0.9

In [209]:
## Feature  	   DecisionTreeClassifier	     DecisionTreeRegressor

# --> Output Type	     Discrete class labels	     Continuous numeric values

# --> Objective	       Classification	                 Regression

# --> Default Criterion	Gini Impurity	          Mean Squared Error (MSE)

In [150]:
# Task Type	           Metric to Use
# Regression	          ✅ r2_score, mean_squared_error, etc.
# Classification	      ✅ accuracy_score, f1_score, precision, recall