In [34]:
#16/8/24 This script aims to test ways of fixing class imbalance

import pandas as pd
import numpy as np
import os
import math

seed = 81
#Data import
df = pd.read_csv('/Users/james/Documents/Honours/Data/Fingerprints/SR-ATAD5_Morganfp.csv')
toxvals = (df['Toxicity_Value'].values)

In [None]:
#K fold partioning

#Given imbalanced data
#split majority class into K partitions where K is (Number of majority class) / (number of minority class)
#each partition is the size of the minority class
#Each parition is then modelled against minority class so K models are made


In [35]:
#Determine Degree of Class Imbalance
posvals = 0
negvals = 0
loops = 0
neglist = []
poslist = []
for value in toxvals:
    if value == 1:
        posvals += 1
        poslist.append(loops)
    else:
        negvals += 1
        neglist.append(loops)
        
    loops += 1

print('Total samples =', (negvals + posvals))
print('Positives in data:', posvals)
print('Negatives in data:', negvals)
print('Class Imbalance =', (posvals/negvals), ':', (1 - (posvals/negvals)))

Total samples = 6931
Positives in data: 253
Negatives in data: 6678
Class Imbalance = 0.037885594489368075 : 0.9621144055106319


In [23]:
#Find Model Performance on Imbalanced Set
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys
sys.path.insert(1, '/Users/james/Documents/Honours/Python')
from Function_Repo import metriccalc 

df = pd.read_csv('/Users/james/Documents/Honours/Data/Fingerprints/SR-ATAD5_Morganfp.csv')
#df = pd.read_csv('/Users/james/Documents/Honours/Data/Fingerprints/Ames_Mutagenicity_Morganfp.csv')
df.drop(columns=['SMILES'], inplace=True)
df = df.dropna()
dfarray = df.to_numpy()

dfarray, tempset = train_test_split(dfarray, test_size=0.2, 
                                    random_state=seed, stratify = df['Toxicity_Value'])

size = len(dfarray) / 5
splitsize = math.ceil(size)
empty = [[] for _ in range(5)]
splits = []
for list in empty:
    splits.append(list)

loops = 0
currsplit = 0
#Results in a list of 5 lists that each contain 1/5 of the targetdata
for row in dfarray:
    splits[currsplit].append(row)
    if loops == splitsize:
        loops = 0
        currsplit = currsplit + 1
    loops = loops + 1

folds = []
for split in splits:
    temp = pd.DataFrame(split)
    folds.append(temp)

model_list = []
validlist = []
totalmetrics = []
testdat = []
#For loop that uses each fold once for valid/testing and the rest for training
#Each 'split' in the range corresponds to the set used for test/validation with the other 4 for training
iteration = 1
print('========================================')
for split in range(0,5):
    #Set creation
    trainlist = [df for i, df in enumerate(folds) if i != split]
    trainset = pd.concat(trainlist, axis=0)
    testset, validset = train_test_split(folds[split], test_size=0.5, random_state=seed)

    ytrain = trainset[0].values
    xtrain = trainset.iloc[:, 1:]

    ytest = testset[0].values
    xtest = testset.iloc[:, 1:]

    yvalid = validset[0].values
    xvalid = validset.iloc[:, 1:]

    #lists of metric values
    mcclist = []
    bestmetrics = 1
    maxval = 0
    for mtry in range(1, 50):
        #using mtry as the adjusted hyperparameter creates a series of random forests
        rf = RandomForestClassifier(n_estimators=mtry, criterion='entropy', max_depth=None, 
                                min_samples_split=2, min_samples_leaf=1, 
                                min_weight_fraction_leaf=0.0, max_features='sqrt', 
                                max_leaf_nodes=None, min_impurity_decrease=0.0, 
                                bootstrap=True, oob_score=False, n_jobs= 4, random_state=seed, 
                                verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)

        model= rf.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        #calculate metric (mcc)
        mcc = metriccalc(preds, ytest)[12]
        mcclist.append(mcc)
        if mcc > maxval:
            maxval = mcc
            bestmetrics = mtry
    #store best model for the given fold and plot the metric vs mcc value
    rf = RandomForestClassifier(n_estimators=bestmetrics, criterion='entropy', max_depth=None, 
                                min_samples_split=2, min_samples_leaf=1, 
                                min_weight_fraction_leaf=0.0, max_features='sqrt', 
                                max_leaf_nodes=None, min_impurity_decrease=0.0, 
                                bootstrap=True, oob_score=False, n_jobs= 4, random_state=seed, 
                                verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
    model_list.append(rf)
    testdat.append(maxval)
    totalmetrics.append(mcclist)

    #check models onto validation set, printing various metrics
    model= rf.fit(xtrain, ytrain)
    preds = model.predict(xvalid)
    results = metriccalc(preds, yvalid)
    validlist.append(results)
    print('for fold', split + 1, 'test set mcc of', maxval, 'valid set mcc of', results[12])

    iteration = iteration + 1

    #get x and y values for the final validation set

toxvals = []
fingerprints = []
for row in tempset:
    toxvals.append(row[0])
    fingerprints.append(row[1:])

#calculates consensus of models on each fingerprint in the final validation set
consensuslist = []
predictions = []
for fp in fingerprints:
    consensus = -1
    fp = fp.reshape(1, -1)
    #appends each model's prediction to a list
    predictions = []
    predlist = []
    for model in model_list:
        preds = model.predict(fp)
        predictions.append(preds)
        predlist.append(preds)
    predictions.append(predlist)

    #finds number of 0s in the prediction list
    zercount = 0
    for num in predictions:
        if num == 0:
            zercount = zercount + 1
    #as there are 5 models, if there are less than 3 0s predicted, the consensus is 1
    if zercount < 3:
        consensus = 1
    else:
        consensus = 0
    consensuslist.append(consensus)
#calculate and print metrics
results = metriccalc(consensuslist, toxvals)
print('\nvalidation metrics of:')
print('positives in data', results[0])
print('negatives in data', results[1])
print('net accuracy =', results[6])
print('mcc =',results[12])

for fold 1 test set mcc of 0.2639431263837363 valid set mcc of 0.2723211280201637
for fold 2 test set mcc of 0.3179965487615299 valid set mcc of 0.11262150407710413
for fold 3 test set mcc of 0.15403218406501445 valid set mcc of 0.1212662593325755
for fold 4 test set mcc of 0.13414143899278652 valid set mcc of 0.30290273405904894
for fold 5 test set mcc of 0.48983581302779317 valid set mcc of 0.19130468963600855

validation metrics of:
positives in data 0
negatives in data 0
net accuracy = 0
mcc = 0


In [None]:
#as seen the results using the whole dataset is suboptimal at best

In [43]:
#Partioning dominant results into portions of 253 (number of non-dominant samples)
import random
random.seed(seed)
#Start by randomising dominant sample list before partitioning
rneglist = neglist
random.shuffle(rneglist)
#Creates partitions of dominant data each the size of non dominant data
loops = 0
partitions = []
currpart = []
for value in rneglist:
    currpart.append(value)
    loops += 1
    if loops == posvals:
        partitions.append(currpart)
        currpart = []
        loops = 0

In [53]:
#Create models for each partition
datasets = []
for part in partitions:
    newframe = pd.DataFrame()
    for item in poslist:
        newframe = pd.concat([newframe, df.iloc[item]], axis = 1)
    for value in part:
        newframe = pd.concat([newframe, df.iloc[value]], axis = 1)
    newframe = newframe.transpose()
    newframe.drop(columns=['SMILES'], inplace=True)
    
    datasets.append(newframe)

In [54]:
#Model dev on each partition
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys
sys.path.insert(1, '/Users/james/Documents/Honours/Python')
from Function_Repo import metriccalc 
iter = 0
finalresults = []
for df in datasets:
    df = df.dropna()
    dfarray = df.to_numpy()

    dfarray, tempset = train_test_split(dfarray, test_size=0.2, 
                                        random_state=seed, stratify = df['Toxicity_Value'])

    size = len(dfarray) / 5
    splitsize = math.ceil(size)
    empty = [[] for _ in range(5)]
    splits = []
    for list in empty:
        splits.append(list)

    loops = 0
    currsplit = 0
    #Results in a list of 5 lists that each contain 1/5 of the targetdata
    for row in dfarray:
        splits[currsplit].append(row)
        if loops == splitsize:
            loops = 0
            currsplit = currsplit + 1
        loops = loops + 1

    folds = []
    for split in splits:
        temp = pd.DataFrame(split)
        folds.append(temp)

    model_list = []
    validlist = []
    totalmetrics = []
    testdat = []
    #For loop that uses each fold once for valid/testing and the rest for training
    #Each 'split' in the range corresponds to the set used for test/validation with the other 4 for training
    iteration = 1
    print('========================================')
    print('For Partition', iter)
    for split in range(0,5):
        #Set creation
        trainlist = [df for i, df in enumerate(folds) if i != split]
        trainset = pd.concat(trainlist, axis=0)
        testset, validset = train_test_split(folds[split], test_size=0.5, random_state=seed)

        ytrain = trainset[0].values
        xtrain = trainset.iloc[:, 1:]

        ytest = testset[0].values
        xtest = testset.iloc[:, 1:]

        yvalid = validset[0].values
        xvalid = validset.iloc[:, 1:]

        #lists of metric values
        mcclist = []
        bestmetrics = 1
        maxval = 0
        for mtry in range(1, 50):
            #using mtry as the adjusted hyperparameter creates a series of random forests
            rf = RandomForestClassifier(n_estimators=mtry, criterion='entropy', max_depth=None, 
                                    min_samples_split=2, min_samples_leaf=1, 
                                    min_weight_fraction_leaf=0.0, max_features='sqrt', 
                                    max_leaf_nodes=None, min_impurity_decrease=0.0, 
                                    bootstrap=True, oob_score=False, n_jobs= 4, random_state=seed, 
                                    verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)

            model= rf.fit(xtrain, ytrain)
            preds = model.predict(xtest)
            #calculate metric (mcc)
            mcc = metriccalc(preds, ytest)[12]
            mcclist.append(mcc)
            if mcc > maxval:
                maxval = mcc
                bestmetrics = mtry
        #store best model for the given fold and plot the metric vs mcc value
        rf = RandomForestClassifier(n_estimators=bestmetrics, criterion='entropy', max_depth=None, 
                                    min_samples_split=2, min_samples_leaf=1, 
                                    min_weight_fraction_leaf=0.0, max_features='sqrt', 
                                    max_leaf_nodes=None, min_impurity_decrease=0.0, 
                                    bootstrap=True, oob_score=False, n_jobs= 4, random_state=seed, 
                                    verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
        model_list.append(rf)
        testdat.append(maxval)
        totalmetrics.append(mcclist)

        #check models onto validation set, printing various metrics
        model= rf.fit(xtrain, ytrain)
        preds = model.predict(xvalid)
        results = metriccalc(preds, yvalid)
        validlist.append(results)
        print('for fold', split + 1, 'test set mcc of', maxval, 'valid set mcc of', results[12])

        iteration = iteration + 1

        #get x and y values for the final validation set

    toxvals = []
    fingerprints = []
    for row in tempset:
        toxvals.append(row[0])
        fingerprints.append(row[1:])

    #calculates consensus of models on each fingerprint in the final validation set
    consensuslist = []
    predictions = []
    for fp in fingerprints:
        consensus = -1
        fp = fp.reshape(1, -1)
        #appends each model's prediction to a list
        predictions = []
        predlist = []
        for model in model_list:
            preds = model.predict(fp)
            predictions.append(preds)
            predlist.append(preds)
        predictions.append(predlist)

        #finds number of 0s in the prediction list
        zercount = 0
        for num in predictions:
            if num == 0:
                zercount = zercount + 1
        #as there are 5 models, if there are less than 3 0s predicted, the consensus is 1
        if zercount < 3:
            consensus = 1
        else:
            consensus = 0
        consensuslist.append(consensus)
    #calculate and print metrics
    results = metriccalc(consensuslist, toxvals)
    print('\nvalidation metrics of:')
    print('positives in data', results[0])
    print('negatives in data', results[1])
    print('net accuracy =', results[6])
    print('mcc =',results[12])
    
    finalresults.append(results[12])
    iter += 1

For Partition 0
for fold 1 test set mcc of 0.39705882352941174 valid set mcc of -0.023809523809523808
for fold 2 test set mcc of 0.5959595959595959 valid set mcc of 0.5429187467294155
for fold 3 test set mcc of 0.5506887917539348 valid set mcc of 0.24637845104051562
for fold 4 test set mcc of 0.7035264706814485 valid set mcc of 0.4304142310105583
for fold 5 test set mcc of 0.45848623461254817 valid set mcc of 0.447801930486707

validation metrics of:
positives in data 51
negatives in data 51
net accuracy = 0.7549019607843137
mcc = 0.5146749286021822
For Partition 1
for fold 1 test set mcc of 0.49754901960784315 valid set mcc of 0.20558916985752212
for fold 2 test set mcc of 0.4530333378893934 valid set mcc of 0.63802580126271
for fold 3 test set mcc of 0.556293911166591 valid set mcc of 0
for fold 4 test set mcc of 0.6508140266182866 valid set mcc of 0.2805712709727033
for fold 5 test set mcc of 0.5522311464689716 valid set mcc of 0.32824397594488725

validation metrics of:
positives i

For Partition 14
for fold 1 test set mcc of 0.4913530840368859 valid set mcc of 0.07159924926194092
for fold 2 test set mcc of 0.4530333378893934 valid set mcc of 0.41963703136786923
for fold 3 test set mcc of 0.450563556889583 valid set mcc of 0.15472230622130836
for fold 4 test set mcc of 0.502518907629606 valid set mcc of 0.4806307859271526
for fold 5 test set mcc of 0.4791863575034157 valid set mcc of 0.6359727033932191

validation metrics of:
positives in data 51
negatives in data 51
net accuracy = 0.7058823529411765
mcc = 0.4199160251916029
For Partition 15
for fold 1 test set mcc of 0.6077399860725146 valid set mcc of 0.12046772038736683
for fold 2 test set mcc of 0.6030226891555273 valid set mcc of 0.5600378063109737
for fold 3 test set mcc of 0.40201512610368484 valid set mcc of 0.2717914161090704
for fold 4 test set mcc of 0.7509392614826383 valid set mcc of 0.3266084458844825
for fold 5 test set mcc of 0.5000643551261523 valid set mcc of 0.6030226891555273

validation metric

In [57]:
np.mean(finalresults)

0.4421902274859165

In [56]:
#As can be seen this form of class balancing is working well
finalresults

[0.5146749286021822,
 0.5499719409228703,
 0.446108908109597,
 0.4541341396153042,
 0.510688230856951,
 0.4103049699311091,
 0.4169260787524219,
 0.48193159734149926,
 0.37515428924742517,
 0.3563134121092031,
 0.3922322702763681,
 0.3563134121092031,
 0.48193159734149926,
 0.49236596391733095,
 0.4199160251916029,
 0.47508454947893747,
 0.3728358164321559,
 0.48193159734149926,
 0.4354941703556927,
 0.41303706026554643,
 0.3959037912324479,
 0.5505619884431753,
 0.4314554973040049,
 0.5516772843673704,
 0.40555355282690636,
 0.3244428422615251]