# DECISION TREE IMPLEMENTATION FROM SCRATCH ON ORIGINAL DATASET

In [1]:
import numpy as np
import pandas as pd 
import random
from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
        
    def is_leaf_node(self):
        return self.value is not None

In [3]:
class DecisionTree:
    # The decision tree in the definition uses a default value of the hyperparameters
    def __init__(self, min_samples_split=2, max_depth=5, n_features=None):
        self.min_samples_split=min_samples_split
        self.max_depth=max_depth
        self.n_features=n_features
        self.root=None

    # The build tree function takes the data and the target numpy arrays as inputs.
    def build_tree(self, X, y):
        self.n_features = X.shape[1] if not self.n_features else min(X.shape[1],self.n_features)
        self.root = self._grow_tree(X, y)

    # Recursively create the tree and then return the node to the build_tree function.
    def _grow_tree(self, X, y, depth=0):
        n_samples, n_feats = X.shape
        n_labels = len(np.unique(y))

        # Stopping Criteria: 
        # The hyperparameter 'max_depth, min_samples for split' will be passed and hence if depth >= max_depth or
        # samples in a node is lesser than the required or if the node is pure, it will be a leaf node.
        if (depth>=self.max_depth or n_labels==1 or n_samples<self.min_samples_split):
            leaf_value = self._most_common_label(y)                      #The value of the node is given to the class with most number.
            return Node(value=leaf_value)

        feat_idxs = np.random.choice(n_feats, self.n_features, replace=False)

        # find the best split among all features. 
        best_feature, best_thresh = self._best_split(X, y, feat_idxs)

        # create both left and right children
        left_idxs, right_idxs = self._split(X[:, best_feature], best_thresh)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth+1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth+1)
        return Node(best_feature, best_thresh, left, right)


    #To find the best split in a node, we use the Information Gain Criteria.
    def _best_split(self, X, y, feat_idxs):
        best_gain = -1
        split_idx, split_threshold = None, None

        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]
            thresholds = np.unique(X_column)

            # Split on all possible thresholds.
            for thr in thresholds:
                gain = self._information_gain(y, X_column, thr)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_threshold = thr

        return split_idx, split_threshold

    
    # A node is split according to the Information Gain gained from splitting using that feature.
    # IG = Entropy[parent] - weighted average * Entropy[children]
    def _information_gain(self, y, X_column, threshold):
        parent_entropy = self._entropy(y)

        left_idxs, right_idxs = self._split(X_column, threshold)

        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        
        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
        child_entropy = (n_l/n) * e_l + (n_r/n) * e_r

        information_gain = parent_entropy - child_entropy
        return information_gain

    def _split(self, X_column, split_thresh):
        left_idxs = np.where(X_column <= split_thresh)[0]
        right_idxs = np.where(X_column > split_thresh)[0]
        return left_idxs, right_idxs

    
    # The measure of impurity in a node is calculated using entropy. Entropy = - Sigma(p * log(p)) over all classes present in the node.
    def _entropy(self, y):
        hist = np.bincount(y)
        ps = hist / len(y)
        return -np.sum([p * np.log(p) for p in ps if p>0])

    def _most_common_label(self, y):
        counter = Counter(y)
        value = counter.most_common(1)[0][0]
        return value

    def classify(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.is_leaf_node():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)
      



In [2]:
dataset_df1 = pd.read_csv(r"./premonsoon/ground_water_quality_2018_pre.csv")
dataset_df2 = pd.read_csv(r"./premonsoon/ground_water_quality_2019_pre.csv")
dataset_df3 = pd.read_csv(r"./premonsoon/ground_water_quality_2020_pre .csv")
dataset_df4 = pd.read_csv(r"./premonsoon/ground_water_quality_2021_pre.csv")
dataset_df5 = pd.read_csv(r"./premonsoon/ground_water_quality_2022_pre.csv")
dataset_df6 = pd.read_csv(r"./postmonsoon/ground_water_quality_2018_post.csv")
dataset_df7 = pd.read_csv(r"./postmonsoon/ground_water_quality_2019_post.csv")
dataset_df8 = pd.read_csv(r"./postmonsoon/ground_water_quality_2020_post.csv")
dataset_df9 = pd.read_csv(r"./postmonsoon/ground_water_quality_2021_post.csv")
dataset_df = pd.concat([dataset_df1,dataset_df2,dataset_df3,dataset_df4,dataset_df5,dataset_df6,dataset_df7,dataset_df8,dataset_df9],join='outer',ignore_index=True)
dataset_df.drop(columns=['RL_GIS','sno','village','mandal','district','gwl','Classification.1'],inplace=True)
dataset_df['season']=dataset_df['season'].astype(str).apply(lambda x: 0 if 'pre' in x.lower() else 1)
dataset_df.rename(columns={'RSC  meq  / L': 'RSC_meq/L'}, inplace=True)

In [3]:
dataset_df['pH'] = pd.to_numeric(dataset_df['pH'], errors='coerce')

# Now convert to float
dataset_df['pH'] = dataset_df['pH'].astype(float)
dataset_df['RSC_meq/L'] = pd.to_numeric(dataset_df['RSC_meq/L'], errors='coerce')
# dataset_df['season'] = pd.to_numeric(dataset_df['season'], errors='coerce')

# Now convert to float
dataset_df['RSC_meq/L'] = dataset_df['RSC_meq/L'].astype(float)

dataset_df['season'] = dataset_df['season'].astype(float)

In [4]:
#To remove rows containing null values
for index, row in dataset_df.iterrows():
    if 'NA' in row.values:
        dataset_df.drop(index, inplace=True)
    elif(row['Classification']=='OG')or(row['Classification']=='O.G')or(row['Classification']=='BELOW THE GRAPH')or(row['Classification']=='OUT OF SAR GRAPH')or(row['Classification']=='BG'):
        dataset_df.drop(index, inplace=True)
dataset_df.reset_index(drop=True, inplace=True)

In [5]:
null_threshold = len(dataset_df) * 0.25
for column in dataset_df.columns:
    null_count = dataset_df[column].isnull().sum()
    if null_count > null_threshold:
        dataset_df.drop(column, axis=1, inplace=True)
    elif null_count > 0:
        dataset_df.dropna(subset=[column], inplace=True)
dataset_df.reset_index(drop=True, inplace=True)

In [6]:
dataset_df.head()

Unnamed: 0,long_gis,lat_gis,season,pH,E.C,TDS,CO3,HCO3,Cl,F,NO3,SO4,Na,K,Ca,Mg,T.H,SAR,Classification,RSC_meq/L
0,78.5247,19.6683,0.0,8.21,1407.0,900.48,0.0,240.0,190.0,0.34,94.3377,129.0,95.0,5.0,48.0,111.826,579.810855,1.715349,C3S1,-6.796217
1,78.350833,19.458888,0.0,8.29,1620.0,1036.8,0.0,360.0,150.0,0.34,173.1739,115.0,105.0,19.0,104.0,87.516,619.851974,1.833654,C3S1,-5.197039
2,78.512222,19.525555,0.0,8.34,959.0,613.76,50.0,260.0,70.0,0.62,41.23399,56.0,55.0,2.0,56.0,72.93,439.876645,1.140169,C3S1,-2.597533
3,78.64,19.730555,0.0,8.22,509.0,325.76,0.0,230.0,30.0,0.91,19.26615,30.0,63.0,2.0,40.0,19.448,179.967105,2.041814,C2S1,1.000658
4,78.852654,19.495665,0.0,8.29,550.0,352.0,0.0,220.0,20.0,1.23,32.42028,25.0,49.0,7.0,32.0,24.31,179.958882,1.588114,C2S1,0.800822


In [7]:
column_to_shift = 'Classification'
dataset_df[column_to_shift] = dataset_df.pop(column_to_shift)

In [8]:
dataset_df.head()

Unnamed: 0,long_gis,lat_gis,season,pH,E.C,TDS,CO3,HCO3,Cl,F,NO3,SO4,Na,K,Ca,Mg,T.H,SAR,RSC_meq/L,Classification
0,78.5247,19.6683,0.0,8.21,1407.0,900.48,0.0,240.0,190.0,0.34,94.3377,129.0,95.0,5.0,48.0,111.826,579.810855,1.715349,-6.796217,C3S1
1,78.350833,19.458888,0.0,8.29,1620.0,1036.8,0.0,360.0,150.0,0.34,173.1739,115.0,105.0,19.0,104.0,87.516,619.851974,1.833654,-5.197039,C3S1
2,78.512222,19.525555,0.0,8.34,959.0,613.76,50.0,260.0,70.0,0.62,41.23399,56.0,55.0,2.0,56.0,72.93,439.876645,1.140169,-2.597533,C3S1
3,78.64,19.730555,0.0,8.22,509.0,325.76,0.0,230.0,30.0,0.91,19.26615,30.0,63.0,2.0,40.0,19.448,179.967105,2.041814,1.000658,C2S1
4,78.852654,19.495665,0.0,8.29,550.0,352.0,0.0,220.0,20.0,1.23,32.42028,25.0,49.0,7.0,32.0,24.31,179.958882,1.588114,0.800822,C2S1


In [11]:
#encoding/labelling the target column
label_mapping = {
    label: index for index, label in enumerate(dataset_df['Classification'].unique())
}
# Map labels to numerical values
dataset_df['Classification'] = dataset_df['Classification'].map(label_mapping)
dataset_df['Classification'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [12]:
dataset_df.head()

Unnamed: 0,long_gis,lat_gis,season,pH,E.C,TDS,CO3,HCO3,Cl,F,NO3,SO4,Na,K,Ca,Mg,T.H,SAR,RSC_meq/L,Classification
0,78.5247,19.6683,0.0,8.21,1407.0,900.48,0.0,240.0,190.0,0.34,94.3377,129.0,95.0,5.0,48.0,111.826,579.810855,1.715349,-6.796217,0
1,78.350833,19.458888,0.0,8.29,1620.0,1036.8,0.0,360.0,150.0,0.34,173.1739,115.0,105.0,19.0,104.0,87.516,619.851974,1.833654,-5.197039,0
2,78.512222,19.525555,0.0,8.34,959.0,613.76,50.0,260.0,70.0,0.62,41.23399,56.0,55.0,2.0,56.0,72.93,439.876645,1.140169,-2.597533,0
3,78.64,19.730555,0.0,8.22,509.0,325.76,0.0,230.0,30.0,0.91,19.26615,30.0,63.0,2.0,40.0,19.448,179.967105,2.041814,1.000658,1
4,78.852654,19.495665,0.0,8.29,550.0,352.0,0.0,220.0,20.0,1.23,32.42028,25.0,49.0,7.0,32.0,24.31,179.958882,1.588114,0.800822,1


In [13]:
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4907 entries, 0 to 4906
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   long_gis        4907 non-null   float64
 1   lat_gis         4907 non-null   float64
 2   season          4907 non-null   float64
 3   pH              4907 non-null   float64
 4   E.C             4907 non-null   float64
 5   TDS             4907 non-null   float64
 6   CO3             4907 non-null   float64
 7   HCO3            4907 non-null   float64
 8   Cl              4907 non-null   float64
 9   F               4907 non-null   float64
 10  NO3             4907 non-null   float64
 11  SO4             4907 non-null   float64
 12  Na              4907 non-null   float64
 13  K               4907 non-null   float64
 14  Ca              4907 non-null   float64
 15  Mg              4907 non-null   float64
 16  T.H             4907 non-null   float64
 17  SAR             4907 non-null   f

In [19]:
X=dataset_df.iloc[:,0:19].values
y=dataset_df.iloc[:,19].values

train_df, test_df, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

train_df, val_df, y_train, y_val = train_test_split(
    train_df, y_train, test_size=0.1, random_state=0
)

In [20]:
train_df[:5]

array([[ 7.85600000e+01,  1.86600000e+01,  0.00000000e+00,
         7.43000000e+00,  4.63000000e+02,  2.96000000e+02,
         0.00000000e+00,  1.30000000e+02,  5.00000000e+01,
         5.40000000e-01,  2.60000000e+01,  9.00000000e+00,
         2.50000000e+01,  3.00000000e+00,  4.00000000e+01,
         1.90000000e+01,  1.80000000e+02,  8.10000000e-01,
        -1.00000000e+00],
       [ 7.88200000e+01,  1.90300000e+01,  0.00000000e+00,
         7.79000000e+00,  1.22800000e+03,  7.86000000e+02,
         0.00000000e+00,  3.30000000e+02,  1.50000000e+02,
         8.00000000e-01,  5.03000000e+01,  1.20000000e+01,
         1.02000000e+02,  5.00000000e+00,  9.60000000e+01,
         3.40300000e+01,  3.79940000e+02,  2.28000000e+00,
        -1.00000000e+00],
       [ 7.94600000e+01,  1.84528000e+01,  1.00000000e+00,
         7.92000000e+00,  6.92000000e+02,  4.42880000e+02,
         7.00000000e+01,  1.70000000e+02,  1.00000000e+02,
         1.90000000e-01,  4.22768182e+01,  2.47500000e+01,
    

In [21]:
test_df[:5]

array([[ 8.06200000e+01,  1.72300000e+01,  0.00000000e+00,
         8.20000000e+00,  7.69000000e+02,  4.92000000e+02,
         0.00000000e+00,  1.20000000e+02,  1.60000000e+02,
         5.80000000e-01,  1.98000000e+01,  2.40000000e+01,
         8.50000000e+01,  3.00000000e+00,  4.00000000e+01,
         2.40000000e+01,  2.00000000e+02,  2.61000000e+00,
        -1.60000000e+00],
       [ 7.96228420e+01,  1.73165490e+01,  0.00000000e+00,
         7.80000000e+00,  8.65000000e+02,  5.53600000e+02,
         0.00000000e+00,  3.02725802e+02,  4.00000000e+01,
         1.20000000e+00,  7.52930000e+01,  8.00000000e+00,
         8.93932926e+01,  2.00000000e+00,  3.20000000e+01,
         3.88960000e+01,  2.39934211e+02,  2.50917534e+00,
         1.25583182e+00],
       [ 7.89810330e+01,  1.67293920e+01,  1.00000000e+00,
         7.49000000e+00,  2.74000000e+03,  1.75360000e+03,
         0.00000000e+00,  2.44215962e+02,  4.40000000e+02,
         7.80000000e-01,  5.58054000e+02,  4.00000000e+01,
    

In [22]:
clf = DecisionTree(max_depth=5)
clf.build_tree(train_df, y_train)
classification = clf.classify(test_df)

def accuracy(y_test, y_pred):
    return np.sum(y_test == y_pred) / len(y_test)

acc = accuracy(y_test, classification)
print(acc)

0.9429735234215886


In [23]:
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4907 entries, 0 to 4906
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   long_gis        4907 non-null   float64
 1   lat_gis         4907 non-null   float64
 2   season          4907 non-null   float64
 3   pH              4907 non-null   float64
 4   E.C             4907 non-null   float64
 5   TDS             4907 non-null   float64
 6   CO3             4907 non-null   float64
 7   HCO3            4907 non-null   float64
 8   Cl              4907 non-null   float64
 9   F               4907 non-null   float64
 10  NO3             4907 non-null   float64
 11  SO4             4907 non-null   float64
 12  Na              4907 non-null   float64
 13  K               4907 non-null   float64
 14  Ca              4907 non-null   float64
 15  Mg              4907 non-null   float64
 16  T.H             4907 non-null   float64
 17  SAR             4907 non-null   f

## Using the validation set for Hyperparameter tuning

In [24]:
def hyperparameter_tuning(train_df, y_train, val_df, y_val, max_depth):
    best_accuracy = 0
    best_params = {}

    for depth in max_depth:
        # Initialize and train decision tree classifier
        clf = DecisionTree(max_depth=depth)
        clf.build_tree(train_df, y_train)

        # Make classification on validation set
        classification = clf.classify(val_df)

        # Calculate accuracy
        acc = accuracy(y_val, classification)

        # Check if current model is the best so far
        if acc > best_accuracy:
            best_accuracy = acc
            best_params['max_depth'] = depth

    return best_params, best_accuracy

In [25]:
import matplotlib.pyplot as plt

# Define your range of hyperparameters to tune
max_depth = range(1, 51)  # Range from 1 to 50 for max_depth

# Initialize lists to store accuracy values
accuracies = []

best_params = 0
best_accuracy = 0
# Iterate over max_depth values and perform hyperparameter tuning
for depth in max_depth:
    # Perform hyperparameter tuning for current max_depth
    best_params, best_accuracy = hyperparameter_tuning(train_df, y_train, val_df, y_val, [depth])
    accuracies.append(best_accuracy)

print("Best hyperparameters:", best_params)
print("Best accuracy:", best_accuracy)

Best hyperparameters: {'max_depth': 50}
Best accuracy: 0.9363867684478372


In [32]:
import matplotlib.pyplot as plt

# Define your range of hyperparameters to tune
max_depth = range(1,20)  # Range from 1 to 50 for max_depth

# Initialize lists to store accuracy values
accuracies = []

best_params = 0
best_accuracy = 0
# Iterate over max_depth values and perform hyperparameter tuning
for depth in max_depth:
    # Perform hyperparameter tuning for current max_depth
    best_params, best_accuracy = hyperparameter_tuning(train_df, y_train, val_df, y_val, [depth])
    accuracies.append(best_accuracy)

print("Best hyperparameters:", best_params)
print("Best accuracy:", best_accuracy)

Best hyperparameters: {'max_depth': 19}
Best accuracy: 0.9440203562340967


In [33]:
clf = DecisionTree(max_depth=19)
clf.build_tree(train_df, y_train)
classification = clf.classify(test_df)

def accuracy(y_test, y_pred):
    return np.sum(y_test == y_pred) / len(y_test)

acc = accuracy(y_test, classification)
print(acc)

0.9378818737270875


## SKLEARN

### Remember to revert the train_df and test_df to pandas dataframe from numpy arrays

### Here we have restarted the kernel to get back the pandas dataframe.

In [10]:
from sklearn.model_selection import train_test_split

# Assuming dataset_df is your original DataFrame

# Step 1: Split the dataset into features (X) and target variable (y)
X = dataset_df.drop(columns=['Classification']).values
y = dataset_df['Classification'].values

# Step 2: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Optionally, create DataFrames for train and test sets
train_df = pd.DataFrame(X_train, columns=dataset_df.drop(columns=['Classification']).columns)
train_df['Classification'] = y_train

test_df = pd.DataFrame(X_test, columns=dataset_df.drop(columns=['Classification']).columns)
test_df['Classification'] = y_test

# Now you have train_df and test_df ready for further processing


In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Step 1: Prepare your data
X_train = train_df.drop(columns=['Classification'])
y_train = train_df['Classification']
X_test = test_df.drop(columns=['Classification'])
y_test = test_df['Classification']

# Step 2: Create a decision tree classifier
clf = DecisionTreeClassifier()

# Step 3: Train the model on the training data
clf.fit(X_train, y_train)

# Step 4: Make predictions on the testing data
y_pred = clf.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9327902240325866
