In [6]:
#Answer 3(a) Decision tree implementation
import numpy as np
import pandas as pd
import random
from pprint import pprint

df=pd.read_csv('J:\MTech Notes\FOML\wine-dataset.csv')
df=df.rename(columns={'quality':'label','fixed acidity':'fixed_acidity','volatile acidity':'volatile_acidity',
                      'citric acid':'citric_acid','residual sugar':'residual_sugar','free sulfur dioxide':'free_sulfur_dioxide',
                       'total sulfur dioxide':'total_sulfur_dioxide'})

#Functions used throughout


def split_tr_te(df,test_size):
    
    if isinstance(test_size,float):
        test_size=round(test_size*len(df))
    
    indices=df.index.tolist()
    test_indices=random.sample(population=indices,k=test_size)
    
    test_df=df.loc[test_indices]
    train_df=df.drop(test_indices)
    return train_df,test_df

random.seed(0)
train_df, test_df = split_tr_te(df, test_size=0.2)
data=df.values


def purity(data):
    label_column=data[:,-1]
    unique_classes=np.unique(label_column)
    
    if len(unique_classes)==1:
        return True
    else:
        return False
    
def classify_data(data):
    label_column=data[:,-1]
    unique_classes,counts_unique_classes=np.unique(label_column,return_counts=True)
    
    index=counts_unique_classes.argmax()
    classification=unique_classes[index]
    
    return classification


def pot_splits(data):
    
    potential_splits={}
    _, n_columns=data.shape
    for column_index in range(n_columns-1):
        potential_splits[column_index]=[]
        values=data[:,column_index]
        unique_values=np.unique(values)
        
        for index in range(len(unique_values)):
            if index != 0:
                current_value=unique_values[index]
                previous_value=unique_values[index-1]
                potential_split=(current_value+previous_value)/2
                potential_splits[column_index].append(potential_split)
    return potential_splits


def data_split(data,split_column,split_value):
    
    split_column_values=data[:,split_column]
    data_below=data[split_column_values<=split_value]
    data_above=data[split_column_values > split_value]
    
    return data_below,data_above

#calculating entropy and info gain

def cal_entropy(data):
    label_column=data[:,-1]
    _, counts=np.unique(label_column,return_counts=True)
    
    probabilities=counts/counts.sum()
    entropy=sum(probabilities* -np.log2(probabilities))
    return entropy

def cal_overall_entropy(data_below,data_above):
    n=len(data_below)+ len(data_above)
    p_data_below=len(data_below)/n
    p_data_above=len(data_above)/n
    
    overall_entropy= (p_data_below*cal_entropy(data_below)
                     + p_data_above*cal_entropy(data_above))
    info_gain=(1-overall_entropy)
    return overall_entropy

def best_split(data,potential_splits):
    
    overall_entropy=99999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below,data_above=data_split(data, split_column=column_index, split_value=value)
            current_overall_entropy=cal_overall_entropy(data_below,data_above)
            
            if current_overall_entropy <= overall_entropy:
                overall_entropy=current_overall_entropy
                best_split_column=column_index
                best_split_value=value
                
    return best_split_column,best_split_value


#main algorithm for decision tree implementation

def main_algorithm(df,counter=0,min_samples=2,max_depth=5):
    
    if counter==0:
        global column_headers
        column_headers=df.columns
        data=df.values
    else:
        data=df
    
    
    if (purity(data))or (len(data)<min_samples) or (counter==max_depth):
        classification=classify_data(data)
        
        return classification

    else:
        counter +=1
        
        potential_splits=pot_splits(data)
        split_column,split_value=best_split(data,potential_splits)
        data_below,data_above= data_split(data,split_column,split_value)
        
        
        feature_name=column_headers[split_column]
        question="{} <= {}".format(feature_name,split_value)
        sub_tree={question: []}
        
        
        yes_answer=main_algorithm(data_below,counter,min_samples,max_depth)
        no_answer=main_algorithm(data_above,counter,min_samples,max_depth)
        
        
        if yes_answer==no_answer:
            sub_tree=yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
            
        return sub_tree
    

def classify_example(example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split()

    # ask question
    if example[feature_name] <= float(value):
        answer = tree[question][0]
    else:
        answer = tree[question][1]

    # base case
    if not isinstance(answer, dict):
        return answer
    
    # recursive part
    else:
        residual_tree = answer
        return classify_example(example, residual_tree)
    
def calculate_accuracy_of_tree(df, tree):

    df["classification"] = df.apply(classify_example, axis=1, args=(tree,))
    df["classification_correct"] = df["classification"] == df["label"]
    
    accuracy = df["classification_correct"].mean()
    
    return accuracy


tree=main_algorithm(train_df,max_depth=13)
pprint(tree)
accuracy = calculate_accuracy_of_tree(test_df, tree)
print("The overall accuracy of decision tree is :", accuracy)

{'alcohol <= 10.625': [{'volatile_acidity <= 0.2025': [{'density <= 0.99788': [{'total_sulfur_dioxide <= 141.5': [{'free_sulfur_dioxide <= 27.5': [{'citric_acid <= 0.45': [{'alcohol <= 10.149999999999999': [{'citric_acid <= 0.43': [{'fixed_acidity <= 5.25': [1.0,
                                                                                                                                                                                                                                                                   {'fixed_acidity <= 7.35': [0.0,
                                                                                                                                                                                                                                                                                              {'chlorides <= 0.0495': [0.0,
                                                                                                                                 

In [4]:
#Answer 3 (b): Using 10-fold validation

import numpy as np
import pandas as pd
import random
from pprint import pprint

df=pd.read_csv('J:\MTech Notes\FOML\wine-dataset.csv')
df=df.rename(columns={'quality':'label','fixed acidity':'fixed_acidity','volatile acidity':'volatile_acidity',
                      'citric acid':'citric_acid','residual sugar':'residual_sugar','free sulfur dioxide':'free_sulfur_dioxide',
                       'total sulfur dioxide':'total_sulfur_dioxide'})

#Functions used throughout


def split_tr_te(df,test_size):
    
    if isinstance(test_size,float):
        test_size=round(test_size*len(df))
    
    indices=df.index.tolist()
    test_indices=random.sample(population=indices,k=test_size)
    
    test_df=df.loc[test_indices]
    train_df=df.drop(test_indices)
    return train_df,test_df

random.seed(0)
train_df, test_df = split_tr_te(df, test_size=0.2)
data=df.values


def purity(data):
    label_column=data[:,-1]
    unique_classes=np.unique(label_column)
    
    if len(unique_classes)==1:
        return True
    else:
        return False
    
def classify_data(data):
    label_column=data[:,-1]
    unique_classes,counts_unique_classes=np.unique(label_column,return_counts=True)
    
    index=counts_unique_classes.argmax()
    classification=unique_classes[index]
    
    return classification


def pot_splits(data):
    
    potential_splits={}
    _, n_columns=data.shape
    for column_index in range(n_columns-1):
        potential_splits[column_index]=[]
        values=data[:,column_index]
        unique_values=np.unique(values)
        
        for index in range(len(unique_values)):
            if index != 0:
                current_value=unique_values[index]
                previous_value=unique_values[index-1]
                potential_split=(current_value+previous_value)/2
                potential_splits[column_index].append(potential_split)
    return potential_splits


def data_split(data,split_column,split_value):
    
    split_column_values=data[:,split_column]
    data_below=data[split_column_values<=split_value]
    data_above=data[split_column_values > split_value]
    
    return data_below,data_above


def cal_entropy(data):
    label_column=data[:,-1]
    _, counts=np.unique(label_column,return_counts=True)
    
    probabilities=counts/counts.sum()
    entropy=sum(probabilities* -np.log2(probabilities))
    return entropy

def cal_overall_entropy(data_below,data_above):
    n=len(data_below)+ len(data_above)
    p_data_below=len(data_below)/n
    p_data_above=len(data_above)/n
    
    overall_entropy= (p_data_below*cal_entropy(data_below)
                     + p_data_above*cal_entropy(data_above))
    return overall_entropy

def best_split(data,potential_splits):
    
    overall_entropy=9999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below,data_above=data_split(data, split_column=column_index, split_value=value)
            current_overall_entropy=cal_overall_entropy(data_below,data_above)
            
            if current_overall_entropy <= overall_entropy:
                overall_entropy=current_overall_entropy
                best_split_column=column_index
                best_split_value=value
                
    return best_split_column,best_split_value


#main algorithm for decision tree implementation

def main_algorithm(df,counter=0,min_samples=2,max_depth=5):
    
    if counter==0:
        global column_headers
        column_headers=df.columns
        data=df.values
    else:
        data=df
    
    
    if (purity(data))or (len(data)<min_samples) or (counter==max_depth):
        classification=classify_data(data)
        
        return classification

    else:
        counter +=1
        
        potential_splits=pot_splits(data)
        split_column,split_value=best_split(data,potential_splits)
        data_below,data_above= data_split(data,split_column,split_value)
        
        
        feature_name=column_headers[split_column]
        question="{} <= {}".format(feature_name,split_value)
        sub_tree={question: []}
        
        
        yes_answer=main_algorithm(data_below,counter,min_samples,max_depth)
        no_answer=main_algorithm(data_above,counter,min_samples,max_depth)
        
        
        if yes_answer==no_answer:
            sub_tree=yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
            
        return sub_tree
    

def classify_example(example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split()

    # ask question
    if example[feature_name] <= float(value):
        answer = tree[question][0]
    else:
        answer = tree[question][1]

    # base case
    if not isinstance(answer, dict):
        return answer
    
    # recursive part
    else:
        residual_tree = answer
        return classify_example(example, residual_tree)
    
def calculate_accuracy_of_tree(df, tree):

    df["classification"] = df.apply(classify_example, axis=1, args=(tree,))
    df["classification_correct"] = df["classification"] == df["label"]
    
    accuracy = df["classification_correct"].mean()
    
    return accuracy

##10-fold-validation

final_accuracy=0
for i in range(10):
    
    indices=df.index.tolist()
    test_indices=random.sample(population=indices,k=len(df))
    train_df_1,train_df_2,train_df_3,train_df_4,train_df_5,train_df_6,train_df_7,train_df_8,train_df_9,train_df_10=np.array_split(df.loc[test_indices],10)
    random.seed(0)
    combined=[train_df_1,train_df_2,train_df_3,train_df_4,train_df_5,train_df_6,train_df_7,train_df_8,train_df_9,train_df_10]
    test_df=combined[i]
    combined.pop(i)
    combined_df=pd.concat(combined)
    train_df=combined_df
    tree=main_algorithm(train_df,max_depth=13)
    accuracy = calculate_accuracy_of_tree(test_df, tree)
    final_accuracy += accuracy
    #print(final_accuracy)
    
accuracy_model=final_accuracy/10
print("The final accuracy of k-fold validation is: ", accuracy_model )



The final accuracy of k-fold validation is:  0.8327941237844829


In [1]:
#Answer 3(c) Improvement strategies 
#Part 1: Using Ginni Index

import numpy as np
import pandas as pd
import random
from pprint import pprint

df=pd.read_csv('J:\MTech Notes\FOML\wine-dataset.csv')
df=df.rename(columns={'quality':'label','fixed acidity':'fixed_acidity','volatile acidity':'volatile_acidity',
                      'citric acid':'citric_acid','residual sugar':'residual_sugar','free sulfur dioxide':'free_sulfur_dioxide',
                       'total sulfur dioxide':'total_sulfur_dioxide'})

#Functions used throughout


def split_tr_te(df,test_size):
    
    if isinstance(test_size,float):
        test_size=round(test_size*len(df))
    
    indices=df.index.tolist()
    test_indices=random.sample(population=indices,k=test_size)
    
    test_df=df.loc[test_indices]
    train_df=df.drop(test_indices)
    return train_df,test_df

random.seed(0)
train_df, test_df = split_tr_te(df, test_size=0.2)
data=df.values


def purity(data):
    label_column=data[:,-1]
    unique_classes=np.unique(label_column)
    
    if len(unique_classes)==1:
        return True
    else:
        return False
    
def classify_data(data):
    label_column=data[:,-1]
    unique_classes,counts_unique_classes=np.unique(label_column,return_counts=True)
    
    index=counts_unique_classes.argmax()
    classification=unique_classes[index]
    
    return classification


def pot_splits(data):
    
    potential_splits={}
    _, n_columns=data.shape
    for column_index in range(n_columns-1):
        potential_splits[column_index]=[]
        values=data[:,column_index]
        unique_values=np.unique(values)
        
        for index in range(len(unique_values)):
            if index != 0:
                current_value=unique_values[index]
                previous_value=unique_values[index-1]
                potential_split=(current_value+previous_value)/2
                potential_splits[column_index].append(potential_split)
    return potential_splits


def data_split(data,split_column,split_value):
    
    split_column_values=data[:,split_column]
    data_below=data[split_column_values<=split_value]
    data_above=data[split_column_values > split_value]
    
    return data_below,data_above


def calculate_ginni(data):
    label_column=data[:,-1]
    _, counts=np.unique(label_column,return_counts=True)
    
    probabilities=counts/counts.sum()
    entropy=sum(1-(probabilities* probabilities))
    return entropy


def cal_gini_entropy(data_below,data_above):
    n=len(data_below)+ len(data_above)
    p_data_below=len(data_below)/n
    p_data_above=len(data_above)/n
    
    overall_entropy= (p_data_below*calculate_ginni(data_below)
                     + p_data_above*calculate_ginni(data_above))
    return overall_entropy

def best_split(data,potential_splits):
    
    overall_entropy=9999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below,data_above=data_split(data, split_column=column_index, split_value=value)
            current_overall_entropy=cal_gini_entropy(data_below,data_above)
            
            if current_overall_entropy <= overall_entropy:
                overall_entropy=current_overall_entropy
                best_split_column=column_index
                best_split_value=value
                
    return best_split_column,best_split_value


#main algorithm for decision tree implementation

def main_algorithm(df,counter=0,min_samples=2,max_depth=5):
    
    if counter==0:
        global column_headers
        column_headers=df.columns
        data=df.values
    else:
        data=df
    
    
    if (purity(data))or (len(data)<min_samples) or (counter==max_depth):
        classification=classify_data(data)
        
        return classification

    else:
        counter +=1
        
        potential_splits=pot_splits(data)
        split_column,split_value=best_split(data,potential_splits)
        data_below,data_above= data_split(data,split_column,split_value)
        
        
        feature_name=column_headers[split_column]
        question="{} <= {}".format(feature_name,split_value)
        sub_tree={question: []}
        
        
        yes_answer=main_algorithm(data_below,counter,min_samples,max_depth)
        no_answer=main_algorithm(data_above,counter,min_samples,max_depth)
        
        
        if yes_answer==no_answer:
            sub_tree=yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
            
        return sub_tree
    

def classify_example(example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split()

    # ask question
    if example[feature_name] <= float(value):
        answer = tree[question][0]
    else:
        answer = tree[question][1]

    # base case
    if not isinstance(answer, dict):
        return answer
    
    # recursive part
    else:
        residual_tree = answer
        return classify_example(example, residual_tree)
    
def calculate_accuracy_of_tree(df, tree):

    df["classification"] = df.apply(classify_example, axis=1, args=(tree,))
    df["classification_correct"] = df["classification"] == df["label"]
    
    accuracy = df["classification_correct"].mean()
    
    return accuracy

ginni_tree=main_algorithm(train_df,max_depth=13)
pprint(ginni_tree)
accuracy = calculate_accuracy_of_tree(test_df, ginni_tree)
print("The overall accuracy after implementing Ginni index is :", accuracy)
    



{'alcohol <= 10.625': [{'volatile_acidity <= 0.46499999999999997': [{'total_sulfur_dioxide <= 229.5': [{'citric_acid <= 0.665': [{'citric_acid <= 0.135': [0.0,
                                                                                                                                                           {'pH <= 2.895': [0.0,
                                                                                                                                                                            {'chlorides <= 0.1355': [{'density <= 0.9917149999999999': [0.0,
                                                                                                                                                                                                                                        {'volatile_acidity <= 0.2025': [{'citric_acid <= 0.235': [0.0,
                                                                                                                                   

In [7]:
#Improvement strategies 
#Part 2: Using post pruning

import numpy as np
import pandas as pd
import random
from pprint import pprint

df=pd.read_csv('J:\MTech Notes\FOML\wine-dataset.csv')
df=df.rename(columns={'quality':'label','fixed acidity':'fixed_acidity','volatile acidity':'volatile_acidity',
                      'citric acid':'citric_acid','residual sugar':'residual_sugar','free sulfur dioxide':'free_sulfur_dioxide',
                       'total sulfur dioxide':'total_sulfur_dioxide'})

#Functions used throughout


def split_tr_te(df,test_size):
    
    if isinstance(test_size,float):
        test_size=round(test_size*len(df))
    
    indices=df.index.tolist()
    test_indices=random.sample(population=indices,k=test_size)
    
    test_df=df.loc[test_indices]
    train_df=df.drop(test_indices)
    return train_df,test_df

random.seed(0)
train_df, test_df = split_tr_te(df, test_size=0.2)
data=df.values


def purity(data):
    label_column=data[:,-1]
    unique_classes=np.unique(label_column)
    
    if len(unique_classes)==1:
        return True
    else:
        return False
    
def classify_data(data):
    label_column=data[:,-1]
    unique_classes,counts_unique_classes=np.unique(label_column,return_counts=True)
    
    index=counts_unique_classes.argmax()
    classification=unique_classes[index]
    
    return classification


def pot_splits(data):
    
    potential_splits={}
    _, n_columns=data.shape
    for column_index in range(n_columns-1):
        potential_splits[column_index]=[]
        values=data[:,column_index]
        unique_values=np.unique(values)
        
        for index in range(len(unique_values)):
            if index != 0:
                current_value=unique_values[index]
                previous_value=unique_values[index-1]
                potential_split=(current_value+previous_value)/2
                potential_splits[column_index].append(potential_split)
    return potential_splits


def data_split(data,split_column,split_value):
    
    split_column_values=data[:,split_column]
    data_below=data[split_column_values<=split_value]
    data_above=data[split_column_values > split_value]
    
    return data_below,data_above


def cal_entropy(data):
    label_column=data[:,-1]
    _, counts=np.unique(label_column,return_counts=True)
    
    probabilities=counts/counts.sum()
    entropy=sum(probabilities* -np.log2(probabilities))
    return entropy

def cal_overall_entropy(data_below,data_above):
    n=len(data_below)+ len(data_above)
    p_data_below=len(data_below)/n
    p_data_above=len(data_above)/n
    
    overall_entropy= (p_data_below*cal_entropy(data_below)
                     + p_data_above*cal_entropy(data_above))
    return overall_entropy

def best_split(data,potential_splits):
    
    overall_entropy=9999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below,data_above=data_split(data, split_column=column_index, split_value=value)
            current_overall_entropy=cal_overall_entropy(data_below,data_above)
            
            if current_overall_entropy <= overall_entropy:
                overall_entropy=current_overall_entropy
                best_split_column=column_index
                best_split_value=value
                
    return best_split_column,best_split_value


#main algorithm for decision tree implementation

def main_algorithm(df,counter=0,min_samples=2,max_depth=5):
    
    if counter==0:
        global column_headers
        column_headers=df.columns
        data=df.values
    else:
        data=df
    
    
    if (purity(data))or (len(data)<min_samples) or (counter==max_depth):
        classification=classify_data(data)
        
        return classification

    else:
        counter +=1
        
        potential_splits=pot_splits(data)
        split_column,split_value=best_split(data,potential_splits)
        data_below,data_above= data_split(data,split_column,split_value)
        
        
        feature_name=column_headers[split_column]
        question="{} <= {}".format(feature_name,split_value)
        sub_tree={question: []}
        
        
        yes_answer=main_algorithm(data_below,counter,min_samples,max_depth)
        no_answer=main_algorithm(data_above,counter,min_samples,max_depth)
        
        
        if yes_answer==no_answer:
            sub_tree=yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)
            
        return sub_tree
    

def classify_example(example, tree):
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split()

    # ask question
    if example[feature_name] <= float(value):
        answer = tree[question][0]
    else:
        answer = tree[question][1]

    # base case
    if not isinstance(answer, dict):
        return answer
    
    # recursive part
    else:
        residual_tree = answer
        return classify_example(example, residual_tree)
    
def calculate_accuracy_of_tree(df, tree):

    df["classification"] = df.apply(classify_example, axis=1, args=(tree,))
    df["classification_correct"] = df["classification"] == df["label"]
    
    accuracy = df["classification_correct"].mean()
    
    return accuracy

#functions for tree pruning

def df_filter(df, question):
    feature, _, value = question.split()
    df_yes = df[df[feature] <= float(value)]
    df_no  = df[df[feature] >  float(value)]
    
    return df_yes, df_no

def pruning_result(tree, train_df, test_df):

    leaf = df_train.label.value_counts().index[0]
    errors_leaf = sum(df_val.label != leaf)
    errors_decision_node = sum(df_val.label != make_predictions(df_val, tree))

    if errors_leaf <= errors_decision_node:
        return leaf
    else:
        return tree


def predict(example, tree):
    
    if not isinstance(tree, dict):
        return tree
    
    question = list(tree.keys())[0]
    feature_name, comparison_operator, value = question.split(" ")

    
    if comparison_operator == "<=":
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]
   
    else:
        if str(example[feature_name]) == value:
            answer = tree[question][0]
        else:
            answer = tree[question][1]

    if not isinstance(answer, dict):
        return answer
    
    else:
        residual_tree = answer
        return predict(example, residual_tree)


def make_predictions(df, tree):
    
    if len(df) != 0:
        predictions = df.apply(predict, args=(tree,), axis=1)
    else:
        predictions = pd.Series()
        
    return predictions

def post_pruning(tree, train_df, test_df):
    
    question = list(tree.keys())[0]
    yes_answer, no_answer = tree[question]

    
    if not isinstance(yes_answer, dict) and not isinstance(no_answer, dict):
        return pruning_result(tree, train_df, test_df)
        
    
    else:
        df_train_yes, df_train_no = df_filter(df_train, question)
        df_val_yes, df_val_no = df_filter(df_val, question)
        
        if isinstance(yes_answer, dict):
            yes_answer = post_pruning(yes_answer, df_train_yes, df_val_yes)
            
        if isinstance(no_answer, dict):
            no_answer = post_pruning(no_answer, df_train_no, df_val_no)
            
        tree = {question: [yes_answer, no_answer]}
    
        return pruning_result(tree, train_df, test_df)
    
tree=main_algorithm(train_df,max_depth=13)
pprint(tree)
tree_pruned = post_pruning(tree, train_df, test_df)
pprint(tree_pruned)
accuracy_pruned = calculate_accuracy_of_tree(test_df, tree_pruned)
print("The overall accuracy of pruned decision tree is :", accuracy_pruned)


{'alcohol <= 10.625': [{'volatile_acidity <= 0.2025': [{'density <= 0.99788': [{'total_sulfur_dioxide <= 141.5': [{'free_sulfur_dioxide <= 27.5': [{'citric_acid <= 0.45': [{'alcohol <= 10.149999999999999': [{'citric_acid <= 0.43': [{'fixed_acidity <= 5.25': [1.0,
                                                                                                                                                                                                                                                                   {'fixed_acidity <= 7.35': [0.0,
                                                                                                                                                                                                                                                                                              {'chlorides <= 0.0495': [0.0,
                                                                                                                                 

NameError: name 'df_train' is not defined