## Homework 4: Decision Trees
### Distinguishing Rock, Pop, Hip-Hop

    Question 1
        Implement a function called dtree which takes the following parameters:
            • train
            • criterion (gini or entropy)
            • max_depth
            • min_instances=2
            • target_impurity=0.0

In [83]:
#Import libraries
import pandas as pd
import numpy as np
from collections import Counter 
from math import log2

In [84]:
# Load the test and training splits from CSV files in the working directory.
testing = pd.read_csv('spotify_test.csv')
training = pd.read_csv('spotify_train.csv')

COL_TRACK_GENRE = "track_genre"
# Drop rows whose feature values (every column except the genre label) occur
# more than once. keep=False removes ALL copies of a duplicated feature row,
# not just the repeats. Each frame is de-duplicated on its OWN columns; the
# training frame previously borrowed the testing frame's column list.
training.drop_duplicates(keep=False, inplace=True, subset=training.columns.difference([COL_TRACK_GENRE]))
testing.drop_duplicates(keep=False, inplace=True, subset=testing.columns.difference([COL_TRACK_GENRE]))

In [85]:
def total(cnt):
    """Return the total number of observations tallied in ``cnt``.

    ``cnt`` is a mapping from class label to count (for example a
    ``collections.Counter``); the result is the sum of its values.
    """
    return sum(count for count in cnt.values())

def gini(cnt):
    """Return the Gini impurity of the class counts in ``cnt``.

    Gini impurity is how often a randomly chosen element would be mislabelled
    if it were labelled at random according to the class distribution. It is
    computed as ``1 - sum(p_i ** 2)``, where ``p_i`` is each class's share of
    the total count.
    """
    n = total(cnt)
    squared_shares = [(count / n) ** 2 for count in cnt.values()]
    return 1 - sum(squared_shares)

def entropy(cnt):
    """Return the entropy, in bits, of the class counts in ``cnt``.

    Entropy measures how mixed the class labels are. It is computed as
    ``sum(-p_i * log2(p_i))``, where ``p_i`` is each class's share of the
    total count. As with Gini impurity, lower values indicate a purer group,
    so the preferred split is the one with the lowest score.
    """
    n = total(cnt)
    terms = []
    for count in cnt.values():
        terms.append((-count / n) * log2(count / n))
    return sum(terms)
    
def wavg(cnt1, cnt2, measure):
    """Return the size-weighted average of ``measure`` over two groups.

    ``cnt1`` and ``cnt2`` are class-count mappings for the two sides of a
    split; ``measure`` is an impurity function applied to each of them. Each
    side's impurity is weighted by the number of observations on that side.
    """
    size1, size2 = total(cnt1), total(cnt2)
    weighted_sum = measure(cnt1) * size1 + measure(cnt2) * size2
    return weighted_sum / (size1 + size2)

def evaluate_split(df, class_col, split_col, feature_val, measure):
    """Score the binary split ``split_col <= feature_val`` on ``df``.

    Rows are divided into those whose ``split_col`` value is at most
    ``feature_val`` and those whose value is greater. The class labels in
    ``class_col`` are counted on each side, and the weighted average of
    ``measure`` over the two sides is returned.
    """
    goes_left = df[split_col] <= feature_val
    goes_right = df[split_col] > feature_val
    left_counts = Counter(df[goes_left][class_col])
    right_counts = Counter(df[goes_right][class_col])
    return wavg(left_counts, right_counts, measure)

def best_split_for_column(df, class_col, split_col, method):
    """Find the threshold on ``split_col`` with the lowest weighted impurity.

    Every distinct value of ``split_col`` is tried as a threshold and scored
    with ``evaluate_split`` using the impurity function ``method``.

    Returns a ``(threshold, impurity)`` tuple. If the column has no values,
    the result is ``('', inf)``.
    """
    winner = ('', float("inf"))

    for candidate in set(df[split_col]):
        score = evaluate_split(df, class_col, split_col, candidate, method)
        # Strict comparison: on a tie the earliest candidate seen is kept.
        if score < winner[1]:
            winner = (candidate, score)

    return winner

def best_split(df, class_col, method):
    """Find the column and threshold with the lowest weighted impurity.

    Every column of ``df`` other than ``class_col`` is searched with
    ``best_split_for_column`` using the impurity function ``method``.

    Returns a ``(column, threshold, impurity)`` tuple. If no column yields a
    finite score, the result is ``(0, '', inf)``.
    """
    chosen = (0, '', float("inf"))

    feature_cols = (col for col in df.columns if col != class_col)
    for col in feature_cols:
        threshold, score = best_split_for_column(df, class_col, col, method)
        # Strict comparison: on a tie the earliest column seen is kept.
        if score < chosen[2]:
            chosen = (col, threshold, score)

    return chosen

def _leaf_node(rows, class_col, depth):
    """Return a leaf tuple predicting the most frequent ``class_col`` value in ``rows``."""
    majority_class = rows[class_col].value_counts().idxmax()
    return None, None, len(rows), majority_class, 0, depth, None, None


def dtree(train, criterion, max_depth=None, min_instances=2, target_impurity=0.0, depth=0):
    """Recursively build a binary decision tree from ``train``.

    The class label is read from the LAST column of ``train``; every other
    column is treated as a feature and split with ``feature <= threshold``.

    Parameters:
        train: DataFrame of features with the class label as its last column.
        criterion: impurity function (``gini`` or ``entropy``) applied to a
            mapping of class counts.
        max_depth: maximum depth of the tree; ``None`` means no limit.
        min_instances: nodes with fewer rows than this become leaves.
        target_impurity: once the best split's weighted impurity is at or
            below this value, both children of that split become leaves.
        depth: depth of the node being built (0 for the root).

    Returns:
        An 8-tuple ``(split_col, split_value, num_instances, majority_class,
        impurity, depth, left, right)``. For a leaf, ``split_col``,
        ``split_value``, ``left`` and ``right`` are ``None`` and ``impurity``
        is 0.

    Raises:
        ValueError: if ``train`` has no rows.
    """
    num_instances = len(train)
    if num_instances == 0:
        raise ValueError("dtree needs at least one training instance")

    class_col = train.columns[-1]
    num_classes = len(train[class_col].unique())

    # Stopping rules: pure node, depth limit reached, or too few instances.
    # ">=" (rather than "==") also stops when depth starts past the limit.
    if num_classes == 1 or (max_depth is not None and depth >= max_depth) or num_instances < min_instances:
        return _leaf_node(train, class_col, depth)

    best_col, best_v, best_meas = best_split(train, class_col, criterion)

    # No candidate split was scored (e.g. there are no feature columns).
    if best_meas == float("inf"):
        return _leaf_node(train, class_col, depth)

    left_split = train[train[best_col] <= best_v]
    right_split = train[train[best_col] > best_v]

    # The chosen threshold does not separate the rows (it is the column's
    # largest value, or is not comparable). Prediction always routes rows with
    # "<=", so any other partition here would disagree with it, and recursing
    # into an empty side would fail. Stop and predict the majority class.
    if left_split.empty or right_split.empty:
        return _leaf_node(train, class_col, depth)

    majority_class = train[class_col].value_counts().idxmax()

    # Split is pure enough: finish both children as leaves.
    if best_meas <= target_impurity:
        left = _leaf_node(left_split, class_col, depth + 1)
        right = _leaf_node(right_split, class_col, depth + 1)
        return best_col, best_v, num_instances, majority_class, best_meas, depth, left, right

    # Otherwise keep growing both subtrees.
    left = dtree(left_split, criterion, max_depth, min_instances, target_impurity, depth + 1)
    right = dtree(right_split, criterion, max_depth, min_instances, target_impurity, depth + 1)

    return best_col, best_v, num_instances, majority_class, best_meas, depth, left, right


In [86]:
def predict(model, data):
    """Return a list with the predicted class for each row of ``data``.

    ``model`` is a tree as built by ``dtree``; rows are visited in positional
    order and each one is classified with ``predict_row``.
    """
    return [predict_row(model, data.iloc[pos]) for pos in range(len(data))]

def predict_row(model, row):
    """Return the class predicted by the tree ``model`` for a single ``row``.

    Each node is a tuple whose element 0 is the split column (``None`` for a
    leaf), element 1 the threshold, element 3 the majority class, and
    elements 6 and 7 the left and right children. ``row`` is indexed by
    column name.
    """
    current = model
    # Walk down until a leaf (no split column) is reached.
    while current[0] is not None:
        split_col, threshold = current[0], current[1]
        # Values at or below the threshold go left, larger ones go right.
        current = current[6] if row[split_col] <= threshold else current[7]
    return current[3]

def cross_validate(train, criterion, max_depth=None, min_instances=2, target_impurity=0.0, k=10):
    """Estimate validation accuracy of ``dtree`` with k-fold cross-validation.

    ``train`` is cut into ``k`` contiguous folds in row order (no shuffling).
    Each fold is held out once while a tree is built on the remaining rows
    with the given ``criterion``, ``max_depth``, ``min_instances`` and
    ``target_impurity``. Fold sizes differ by at most one row, so every row
    is validated exactly once.

    The class label is read from the last column of ``train``, matching the
    column ``dtree`` trains on.

    Returns:
        The mean of the per-fold accuracies.

    Raises:
        ValueError: if ``k`` is less than 2 or ``train`` has fewer than ``k``
            rows (some fold would be empty).
    """
    num_rows = len(train)
    if k < 2:
        raise ValueError("k must be at least 2")
    if num_rows < k:
        raise ValueError("train must contain at least k rows")

    label_col = train.columns[-1]
    accuracies = []
    for i in range(k):
        # Integer boundaries spread any remainder rows across the folds.
        start = i * num_rows // k
        stop = (i + 1) * num_rows // k

        training_fold = pd.concat([train.iloc[:start], train.iloc[stop:]]).reset_index(drop=True)
        validation_fold = train.iloc[start:stop].reset_index(drop=True)

        model = dtree(training_fold, criterion, max_depth, min_instances, target_impurity)
        predictions = predict(model, validation_fold)
        labels = validation_fold[label_col]

        num_correct = sum(pred == actual for pred, actual in zip(predictions, labels))
        accuracies.append(num_correct / len(labels))
    return np.mean(accuracies)

In [87]:
def hyperparameter_tuning(train):
    """Grid-search the tree hyperparameters for the best validation accuracy.

    Every combination of splitting criterion and stopping criteria is scored
    with 10-fold ``cross_validate`` on ``train``.

    Returns a ``(best_params, best_accuracy)`` pair, where ``best_params`` is
    ``(criterion, max_depth, min_instances, target_impurity)``. If no
    combination scores above 0, ``best_params`` is ``None``.
    """
    # Splitting criterion candidates.
    criterions = [gini, entropy]
    # Stopping-criteria candidates.
    max_depths = np.arange(0, 20, 1)
    min_instances = np.arange(2, 20, 1)
    target_impurities = np.arange(0.0, 0.5, 0.1)

    grid = (
        (c, md, mi, ti)
        for c in criterions
        for md in max_depths
        for mi in min_instances
        for ti in target_impurities
    )

    best_params = None
    best_accuracy = 0
    for params in grid:
        accuracy = cross_validate(train, *params, 10)

        # Strict comparison: on a tie the earliest combination is kept.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
    return best_params, best_accuracy

In [None]:
# Run the full grid search on the training set. hyperparameter_tuning scores
# 2 * 20 * 18 * 5 = 3600 parameter combinations, each with 10-fold
# cross-validation, so this cell builds 36,000 trees.
best_parameters, best_accuracy = hyperparameter_tuning(training)

In [88]:
# The machine blue-screened, so the best parameters and validation accuracy
# are hard-coded here instead of being taken from a live run.
best_parameters = [gini, 14, 4, 0.1]
best_accuracy = 0.7385679514358962
print(best_parameters)
print(best_accuracy)

# Unpack as: criterion, max depth, min instances, target impurity.
c, md, mi, ti = best_parameters

# Fit the final tree on the full training set and classify the test set.
clf = dtree(training, c, max_depth=md, min_instances=mi, target_impurity=ti)

predictions = predict(clf, testing)

[<function gini at 0x00000170B96859D0>, 14, 4, 0.1]
0.7385679514358962


In [89]:
# Compare the test-set predictions with the true genre labels.
y_test = testing[COL_TRACK_GENRE]
num_correct = np.sum(predictions == y_test)

print("Accuracy: ", num_correct / len(predictions))

Accuracy:  0.7021546261089987
