In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import decisiontrees as dt
import randomforest as rf

In [2]:
df = pd.read_csv("toxicity-2/data.csv")

In [3]:
#functions to, using the confusion matrix values, calculate various metrics
def GetAccuracy(tp, fp, tn, fn):
    tot = tp+fp+tn+fn
    truevals = tp+tn
    acc = float(truevals) / float(tot)
    return acc

def GetPrecision(tp, fp, tn, fn):
    pos = float(tp + fp)
    return float(tp) / pos

def GetRecall(tp, fp, tn, fn):
    pos = float(tp + fn)
    return float(tp) / pos

def GetF1Score(tp, fp, tn, fn):
    prec = GetPrecision(tp, fp, tn, fn)
    rec = GetRecall(tp, fp, tn, fn)
    numer = 2.0 * prec * rec
    denom = prec + rec
    return numer / denom

def PrintMetrics(tp, fp, tn, fn, name = "Unnamed Model"):
    """ 
    PrintMetrics() will call all the smaller functions to calculate the
    metrics and print them in a neatly formatted way. You need to give
    it all the metrics in order, and you can also give it the model name
    (optional, set default to 'Unnamed Model')
    """
    print("Accuracy of " + name + ": " + str(GetAccuracy(tp, fp, tn, fn)))
    print("Precision of " + name + ": " + str(GetPrecision(tp, fp, tn, fn)))
    print("Recall of " + name + ": " + str(GetRecall(tp, fp, tn, fn)))
    print("F1-Score of " + name + ": " + str(GetF1Score(tp, fp, tn, fn)))


In [4]:
#split into training/testing dataset
#this function is specific to the toxicity dataset, and it will split to have
#the same fraction of both types in training/testing
def TrainTestSplit(data, frac = 0.7):
    """ 
    TrainTestSplit() takes a pandas dataframe and an optional fraction value
    set to a default 0.7. It returns a training set that is the fraction parameter
    proportion of the dataset, and a testing set that is the 1-fraction of the
    dataset.

    In this case, it will also split to ensure that an equal proportion of the
    outcome variable, preset to "Class" here, is in both sets.
    """

    df_toxic = data[data["Class"] == "Toxic"]
    df_nontoxic = data[data["Class"] == "NonToxic"]
    len_tox = len(df_toxic)
    len_non = len(df_nontoxic)
    tot = len(data)

    train_indices_nontox = GetRandomIndices(n = int(frac * len_non), max = len_non)
    train_indices_tox = GetRandomIndices(n = int(frac * len_tox), max = len_tox)
    test_indices_nontox = GetRemainingIndices(train_indices_nontox, len_non)
    test_indices_tox = GetRemainingIndices(train_indices_tox, len_tox)
    
    train_nontox = df_nontoxic.iloc[train_indices_nontox]
    train_tox = df_toxic.iloc[train_indices_tox]
    test_nontox = df_nontoxic.iloc[test_indices_nontox]
    test_tox = df_toxic.iloc[test_indices_tox]
    
    train = pd.concat([train_nontox, train_tox])
    test = pd.concat([test_nontox, test_tox])

    return train, test

#helper function: get random indices
def GetRandomIndices(n, max, min = 0):
    """ 
    GetRandomIndices() is a function that takes three integers:
    - n, number of indices to draw
    - max, maximum exclusive value to draw
    - min, minimum inclusive value to draw (optional, default 0)
    It returns a list of random indices from the specified range; this
    list is guaranteed to have no repeats.
    """
    indices = []
    while len(indices) < n:
        current_num = np.random.randint(low = min, high = max)
    
        #check if exist
        e = False
        for j in range(len(indices)):
            if current_num == indices[j]:
                e = True

        if e != True: 
            indices.append(current_num)

    return indices

#second helper functions: get the indices we did NOT pick
def GetRemainingIndices(picked, max):
    """ 
    For any list given as the first argument and the maximum value
    given in the second, GetRemainingIndices() will return a list of
    the values less than the maximum not in the parameter list.
    """
    rem_indices = []
    for i in range(max):
        e = False
        for j in picked:
            if i == j:
                e = True
        if e != True:
            rem_indices.append(i)

    return rem_indices


In [5]:
#functions for crossvalidation
def MakeKFolds(data, k = 5):
    """ 
    MakeKFolds() takes in a pandas dataframe and an integer k (default 5), and
    returns a list of pandas dataframes that is k long.
    """
    n = len(data)
    n_sub = int(n / k)
    inds = GetRandomIndices(n, max = n)

    folds = []
    for i in range(k):
        startind = i * n_sub
        endind = (i + 1) * n_sub
        current_indices = inds[startind:endind]
        current_fold = data.iloc[current_indices]
        folds.append(current_fold)

    return folds

def TrainValidationSplit(folds):
    """ 
    TrainValidationSplit() takes the folds given from MakeKFolds() and randomly
    assigns one fold to be a validation set and combines the others to make
    the training set.
    """
    k = len(folds)
    fold_indices = []
    for i in range(k):
        fold_indices.append(i)

    val_index = np.random.randint(k)
    validation = folds[val_index]
    train_indices = GetRemainingIndices([val_index], k)
    
    train = pd.DataFrame()
    for j in train_indices:
        train = pd.concat([train, folds[j]])

    return train, validation

def TuneTree(folds, grid):
    """ 
    TuneTree() takes in datafolds and a tuning grid, made by MakeKFolds() and
    dt.MakeTuningGrid(). It returns the parameters found in the grid that yielded
    the best accuracy.
    """
    k = len(grid)
    accuracies = []
    for i in range(k):
        current_params = grid[i]
        current_train, current_val = TrainValidationSplit(folds)

        #a little QOL addition
        #checks if we have more depth than variables
        dep = 0
        cols = len(current_train.columns)
        if cols < current_params[2]:
            dep = cols
        else:
            dep = current_params[2]

        root = dt.MakeNode(current_train, 1)
        tree = dt.BuildTree(root, current_train, 0, 
                            min=current_params[0], frac = current_params[1],
                            maxDepth=dep)
        preds = dt.Predict(tree, current_val)
        tp, fp, tn, fn = dt.GetConfusionMatrix(current_val, preds)
        
        current_acc = GetAccuracy(tp, fp, tn, fn)
        accuracies.append(current_acc)

    max_index = np.argmax(np.array(accuracies))
    return grid[max_index]

def TuneForest(folds, grid):
    """ 
    TuneForest() takes in datafolds and a tuning grid, made by MakeKFolds() and
    rf.MakeTuningGrid(). It returns the parameters found in the grid that yielded
    the best accuracy.
    """
    k = len(grid)
    accuracies = []
    for i in range(k):
        current_params = grid[i]
        current_train, current_val = TrainValidationSplit(folds)

        #a little QOL addition
        #checks if we have more depth than variables
        dep = 0
        cols = len(current_train.columns)
        if cols < current_params[2]:
            dep = cols - 1
        else:
            dep = current_params[2]

        f = rf.BuildRandomForest(current_train, trees = current_params[4], 
                                 subset_frac = current_params[3], min = current_params[0],
                                 frac = current_params[1], maxDepth = dep)
        preds = rf.ForestPredict(f, current_val)
        tp, fp, tn, fn = dt.GetConfusionMatrix(current_val, preds)
        
        current_acc = GetAccuracy(tp, fp, tn, fn)
        accuracies.append(current_acc)

    max_index = np.argmax(np.array(accuracies))
    return grid[max_index]

In [6]:
#function to save parameters as a file
#so you can build the trees again.
def SaveParams(params, filename = "params.txt"):
    with open(filename, "w") as fw:
        for p in params:
            fw.write(str(p))
            fw.write("\n")

In [9]:
#workflow for doing Decision Trees
#MODIFY THIS - TRY TO NOT MODIFY FUNCTIONS
train, test = TrainTestSplit(df)
folds = MakeKFolds(train)
g = dt.MakeTuningGrid(min_max = 10, frac_min = 0.7)
best_params = TuneTree(folds, g)
SaveParams(best_params, "params_dt_full.txt")
final_root = dt.MakeNode(train, 1)
final_tree = dt.BuildTree(final_root, train, 0, min = best_params[0],
                          frac = best_params[1], maxDepth = best_params[2])
preds = dt.Predict(final_tree, test)
tp, fp, tn, fn = dt.GetConfusionMatrix(test, preds)
PrintMetrics(tp, fp, tn, fn, "Decision Tree")


Accuracy of Decision Tree: 0.7115384615384616
Precision of Decision Tree: 0.4117647058823529
Recall of Decision Tree: 0.5833333333333334
F1-Score of Decision Tree: 0.4827586206896552


In [10]:
#decision tree workflow
#using default parameters
root = dt.MakeNode(train, 1)
tree = dt.BuildTree(root, train, 0, maxDepth=10)
preds = dt.Predict(tree, test)
tp, fp, tn, fn = dt.GetConfusionMatrix(test, preds)
PrintMetrics(tp, fp, tn, fn, name = "Decision Tree")

Accuracy of Decision Tree: 0.6923076923076923
Precision of Decision Tree: 0.4117647058823529
Recall of Decision Tree: 0.5384615384615384
F1-Score of Decision Tree: 0.4666666666666667


In [11]:
class_list_paper = ["MDEC-23", "MATS2v", "ATSC8s", "VE3_Dt", "CrippenMR", "SpMax7_Bhe", 
              "SpMin1_Bhs", "C1SP2", "GATS8e", "SpMax5_Bhv", "VE3_Dzi", "VPC-4", "Class"]
class_list_gb = ["MDEC-23", "GATS8s", "VE3_Dzi", "CrippenMR", "VPC-4", "GATS8e",
                   "ATSC8s", "C1SP2", "SpMax5_Bhv", "MATS2v", "SpMax7_Bhe", "SpMin1_Bhs", 
                   "VE3_Dt", "Class"]
df_paper_select = df[class_list_paper]
df_gb_select = df[class_list_gb]

In [16]:
#workflow for doing Decision Trees
#MODIFY THIS - TRY TO NOT MODIFY FUNCTIONS
#this time, it is with modified features from paper
train, test = TrainTestSplit(df_paper_select)
folds = MakeKFolds(train)
g = dt.MakeTuningGrid(min_max = 10, frac_min = 0.7, depth_min=5, depth_max=10)
best_params = TuneTree(folds, g)
SaveParams(best_params, "params_dt_pap.txt")
final_root = dt.MakeNode(train, 1)
final_tree_pap = dt.BuildTree(final_root, train, 0, min = best_params[0],
                          frac = best_params[1], maxDepth = best_params[2])
preds = dt.Predict(final_tree_pap, test)
tp, fp, tn, fn = dt.GetConfusionMatrix(test, preds)
PrintMetrics(tp, fp, tn, fn, "Decision Tree with Paper Selected Features")

Accuracy of Decision Tree with Paper Selected Features: 0.6538461538461539
Precision of Decision Tree with Paper Selected Features: 0.5294117647058824
Recall of Decision Tree with Paper Selected Features: 0.47368421052631576
F1-Score of Decision Tree with Paper Selected Features: 0.5


In [21]:
#workflow for doing Decision Trees
#MODIFY THIS - TRY TO NOT MODIFY FUNCTIONS
#this time, it is with modified features from Gradient Boost selection
train, test = TrainTestSplit(df_gb_select)
folds = MakeKFolds(train)
g = dt.MakeTuningGrid(min_max = 10, frac_min = 0.7, depth_min=5, depth_max=10)
best_params = TuneTree(folds, g)
SaveParams(best_params, "params_dt_gb.txt")
final_root = dt.MakeNode(train, 1)
final_tree_gb = dt.BuildTree(final_root, train, 0, min = best_params[0],
                          frac = best_params[1], maxDepth = best_params[2])
preds = dt.Predict(final_tree_gb, test)
tp, fp, tn, fn = dt.GetConfusionMatrix(test, preds)
PrintMetrics(tp, fp, tn, fn, "Decision Tree with Gradient Boost Selected Features")

Accuracy of Decision Tree with Gradient Boost Selected Features: 0.6730769230769231
Precision of Decision Tree with Gradient Boost Selected Features: 0.23529411764705882
Recall of Decision Tree with Gradient Boost Selected Features: 0.5
F1-Score of Decision Tree with Gradient Boost Selected Features: 0.31999999999999995


In [25]:
#workflow for doing Random Forest
#MODIFY THIS - TRY TO NOT MODIFY FUNCTIONS
#we are trying this with the modified feature list from paper
train, test = TrainTestSplit(df_paper_select)
folds = MakeKFolds(train)
g = rf.MakeTuningGrid(min_max = 10, frac_min = 0.7, depth_min=4, depth_max=8, n = 100, subs_min=0.7, subs_max=0.9)
best_params = TuneForest(folds, g)
SaveParams(best_params, "params_rf_pap.txt")
final_root = dt.MakeNode(train, 1)
final_forest_pap = rf.BuildRandomForest(train, trees = best_params[4], 
                                 subset_frac = best_params[3], min = best_params[0],
                                 frac = best_params[1], maxDepth = best_params[2])
preds_pap = rf.ForestPredict(final_forest_pap, test)
tp, fp, tn, fn = dt.GetConfusionMatrix(test, preds)
PrintMetrics(tp, fp, tn, fn, "Random Forest with Paper Selected Features")

Accuracy of Random Forest with Paper Selected Features: 0.6730769230769231
Precision of Random Forest with Paper Selected Features: 0.23529411764705882
Recall of Random Forest with Paper Selected Features: 0.5
F1-Score of Random Forest with Paper Selected Features: 0.31999999999999995


In [41]:
#workflow for doing Random Forest
#MODIFY THIS - TRY TO NOT MODIFY FUNCTIONS
#we are trying this with the modified feature list from gradient boositng
train, test = TrainTestSplit(df_gb_select)
folds = MakeKFolds(train)
g = rf.MakeTuningGrid(min_max = 10, frac_min = 0.7, depth_min=4, depth_max=8, n = 100, subs_min=0.7, subs_max=0.9)
best_params = TuneForest(folds, g)
SaveParams(best_params, "params_rf_gb.txt")
final_forest_gb = rf.BuildRandomForest(train, trees = best_params[4], 
                                 subset_frac = best_params[3], min = best_params[0],
                                 frac = best_params[1], maxDepth = best_params[2])
preds_gb = rf.ForestPredict(final_forest_gb, test)
tp, fp, tn, fn = dt.GetConfusionMatrix(test, preds)
PrintMetrics(tp, fp, tn, fn, "Random Forest with Gradient Boost Selected Features")

Accuracy of Random Forest with Gradient Boost Selected Features: 0.6730769230769231
Precision of Random Forest with Gradient Boost Selected Features: 0.23529411764705882
Recall of Random Forest with Gradient Boost Selected Features: 0.5
F1-Score of Random Forest with Gradient Boost Selected Features: 0.31999999999999995


In [42]:
#randomforest workflow
#with default parameters
train, test = TrainTestSplit(df)
f = rf.BuildRandomForest(train, 50)
preds = rf.ForestPredict(f, test)
tp, fp, tn, fn = dt.GetConfusionMatrix(test, preds)
PrintMetrics(tp, fp, tn, fn, name = "Random Forest")


Accuracy of Random Forest: 0.5961538461538461
Precision of Random Forest: 0.23529411764705882
Recall of Random Forest: 0.3333333333333333
F1-Score of Random Forest: 0.27586206896551724


In [43]:
import dill

In [None]:
#save my session!
dill.dump_session("dtrf_models.db")