In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#we are going to import our functions from decisiontrees.py
import decisiontrees as dt

In [2]:
#load the dataset from a path relative to the notebook's working directory
df = pd.read_csv("toxicity-2/data.csv")
#a bare last expression makes the notebook render the dataframe
df

Unnamed: 0,MATS3v,nHBint10,MATS3s,MATS3p,nHBDon_Lipinski,minHBint8,MATS3e,MATS3c,minHBint2,MATS3m,...,WTPT-4,WTPT-5,ETA_EtaP_L,ETA_EtaP_F,ETA_EtaP_B,nT5Ring,SHdNH,ETA_dEpsilon_C,MDEO-22,Class
0,0.0908,0,0.0075,0.0173,0,0.0000,-0.0436,0.0409,0.0000,0.1368,...,0.0000,0.0000,0.1780,1.5488,0.0088,0,0.0,-0.0868,0.00,NonToxic
1,0.0213,0,0.1144,-0.0410,0,0.0000,0.1231,-0.0316,0.0000,0.1318,...,8.8660,19.3525,0.1739,1.3718,0.0048,2,0.0,-0.0810,0.25,NonToxic
2,0.0018,0,-0.0156,-0.0765,2,0.0000,-0.1138,-0.1791,0.0000,0.0615,...,5.2267,27.8796,0.1688,1.4395,0.0116,2,0.0,-0.1004,0.00,NonToxic
3,-0.0251,0,-0.0064,-0.0894,3,0.0000,-0.0747,-0.1151,0.0000,0.0361,...,7.7896,24.7336,0.1702,1.4654,0.0133,2,0.0,-0.1010,0.00,NonToxic
4,0.0135,0,0.0424,-0.0353,0,0.0000,-0.0638,0.0307,0.0000,0.0306,...,12.3240,19.7486,0.1789,1.4495,0.0120,2,0.0,-0.1071,0.00,NonToxic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,-0.0960,0,-0.0478,-0.0840,2,0.0000,-0.0739,-0.2315,1.5660,-0.1133,...,2.5690,12.0174,0.1648,0.9710,0.0049,1,0.0,-0.0952,0.00,NonToxic
167,-0.0064,1,-0.1222,0.0013,1,0.0000,-0.1873,-0.2181,5.5404,-0.0757,...,10.7860,6.4871,0.1805,1.2298,0.0127,1,0.0,-0.0860,0.00,NonToxic
168,0.0096,2,-0.1846,0.0058,1,0.0000,-0.1293,-0.0979,5.3976,0.0409,...,4.9930,19.2864,0.2089,1.1245,0.0093,1,0.0,-0.0927,0.00,NonToxic
169,-0.0736,2,-0.1267,-0.0345,2,0.5346,-0.0361,0.0151,5.5190,-0.1025,...,10.7504,19.4989,0.1944,1.2256,0.0167,1,0.0,-0.1129,0.00,Toxic


In [47]:
#function: build a random forest
def BuildRandomForest(train_data, trees, subset_frac = 0.7, min = 5, frac = 0.9, maxDepth = 20):
    """
    BuildRandomForest() grows a random forest, one tree at a time, using
    GetRandomSubset() to pick the variables and dt.MakeNode() / dt.BuildTree()
    to build each tree.

    Inputs:
    - train_data: the data used to build the trees, as a pandas dataframe
    - trees: an integer, the number of trees to make
    - subset_frac: handed to GetRandomSubset() to decide how many of the
      columns each tree gets to see, default is 0.7
    - min: handed to BuildTree(), the minimum number of datapoints needed to make a leaf, default is 5
    - frac: handed to BuildTree(), the minimum fraction of prediction needed to make a leaf, default is 0.9
    - maxDepth: handed to BuildTree(), the maximum depth of the tree, default is 20

    Returns a list holding whatever BuildTree() returned for each tree, in
    the order the trees were built. A fresh column subset is drawn per tree.
    """
    def grow_one_tree():
        #each tree is trained on its own random selection of columns
        sample = GetRandomSubset(train_data, subset_frac = subset_frac)
        root = dt.MakeNode(sample, 1)
        #note: min here is this function's parameter, not the builtin
        return dt.BuildTree(root, sample, 0, min = min, frac = frac, maxDepth = maxDepth)

    return [grow_one_tree() for _ in range(trees)]

def GetRandomSubset(data, subset_frac):
    """
    GetRandomSubset() takes a pandas dataframe and returns a new pandas
    dataframe holding a random subset of the original variables (columns),
    followed by the last column, which is kept as the class column.

    Inputs:
    - data: a pandas dataframe whose last column is the class
    - subset_frac: the fraction of columns to keep. The number of variables
      drawn is int(subset_frac * number_of_columns), i.e. truncated (not
      rounded), then clamped to between 1 and the number of non-class columns

    The variables are drawn without replacement from numpy's global random
    state, so they come back in random order with the class column last.

    Raises ValueError if data has no variable column besides the class column.
    """
    cols = len(data.columns) - 1 #number of candidate variables, the last column is the class
    if cols < 1:
        raise ValueError("GetRandomSubset() needs at least one variable column plus the class column")

    #clamp the count: asking for more variables than exist (e.g. subset_frac = 1.0)
    #used to spin forever looking for one more unused column, and at least
    #one variable has always been picked
    n = min(max(int(subset_frac * (cols + 1)), 1), cols)

    #draw n distinct column positions in one call instead of retrying on duplicates
    topick = np.random.choice(cols, size = n, replace = False).tolist()

    #we also want to append the last column
    topick.append(cols)

    #select by position so duplicated column names cannot pull in extra columns
    newsubset = data.iloc[:, topick]
    return newsubset
    
def ForestPredict(forest, test_data):
    """
    ForestPredict() takes a forest (a list of trees, such as the one returned
    by BuildRandomForest()) and a pandas dataframe of points to predict.

    Every tree votes on every row through dt.PredictOne(). A vote equal to
    "Toxic" counts as toxic and any other value counts as non-toxic.

    Returns a list with one prediction per row of test_data: "Toxic" when the
    toxic votes are at least as many as the non-toxic ones (so a tie, and an
    empty forest, both give "Toxic"), and "NonToxic" otherwise.
    """
    total_votes = len(forest)
    labels = []
    for row_idx in range(len(test_data)):
        #count how many trees call this row toxic
        toxic_votes = sum(
            1
            for tree in forest
            if dt.PredictOne(tree, test_data.iloc[row_idx]) == "Toxic"
        )
        #everything that is not a toxic vote is a non-toxic vote
        other_votes = total_votes - toxic_votes

        #majority rule, ties go to "Toxic"
        labels.append("Toxic" if toxic_votes >= other_votes else "NonToxic")

    return labels

In [48]:
#quick check: draw one random column subset of df and display it
GetRandomSubset(df, 0.7)

Unnamed: 0,nT5HeteroRing,khs.tsC,IC0,VR1_Dzm,JGI9,SPC-6,maxsssCH,piPC9,MATS6m,VP-1,...,MATS2m,StsC,AATS7i,nHdCH2,ETA_Psi_1,nBondsD,SpMax5_Bhp,SpMax4_Bhp,ATSC6e,Class
0,0,0,1.4282,1167.3161,0.0092,39.5531,0.0,10.6613,-0.0799,14.9333,...,0.0281,0.0000,149.7571,0,0.5949,22,3.5441,3.8195,0.6513,NonToxic
1,2,0,1.5380,572.7661,0.0059,17.1890,0.0,9.2870,-0.0317,13.5639,...,0.0149,0.0000,156.5877,0,0.5862,20,3.5584,3.7870,-1.3050,NonToxic
2,2,2,1.5726,3081.1325,0.0062,22.9308,0.0,9.1944,-0.0648,13.5174,...,0.1502,4.4113,157.7041,0,0.5779,17,3.6447,3.6788,1.7708,NonToxic
3,2,2,1.5909,1605.5284,0.0072,23.9263,0.0,9.2173,-0.0676,13.7878,...,0.1437,4.3790,154.6908,0,0.5730,17,3.6587,3.6788,0.5700,NonToxic
4,2,0,1.6246,481.7694,0.0083,22.6422,0.0,9.4008,-0.0655,13.9373,...,0.0105,0.0000,153.7310,0,0.5554,19,3.5152,3.7784,0.6523,NonToxic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,1,0,1.6172,125.2693,0.0034,4.3826,0.0,7.0513,-0.0478,5.4900,...,-0.0586,0.0000,165.6835,0,0.5635,8,2.8004,2.9603,-0.6539,NonToxic
167,1,0,1.4985,561.8328,0.0080,13.7438,0.0,8.0050,0.0562,10.0722,...,0.0662,0.0000,154.2986,0,0.5761,14,3.2321,3.3510,1.0013,NonToxic
168,1,0,1.6696,842.8180,0.0067,11.8393,0.0,7.6428,-0.0746,11.8650,...,0.0412,0.0000,161.3846,0,0.5857,13,3.2713,3.5394,-0.0158,NonToxic
169,1,0,1.6368,400.1427,0.0074,14.7126,0.0,7.6332,-0.0688,11.5984,...,0.1876,0.0000,161.0539,0,0.5494,14,3.2613,3.4656,-3.0058,Toxic


In [49]:
#build a forest of 10 trees from the full dataframe, all other parameters left at their defaults
f = BuildRandomForest(df, 10)

In [50]:
#predict on the same dataframe the forest was built from (not held-out data)
ForestPredict(f, df)

['NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'Toxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 'NonToxic',
 '