In [1]:
import pandas as pd
import numpy as np

In [2]:
def getNumberSamps(dfSamps):
    """Return the number of samples in ``dfSamps``.

    The count is whatever ``len`` reports for the argument, which for a
    DataFrame is its number of rows.
    """
    sampleTotal = len(dfSamps)
    return sampleTotal

def geneCounts(dfUsed):
    """Count patients by class in a Series of patient labels.

    Parameters
    ----------
    dfUsed : pandas.Series of str
        Patient labels. Occurrences of the substring 'NC' are summed over
        the whole Series to obtain the NC count.

    Returns
    -------
    tuple
        ``(total, C count, NC count)`` where the C count is the total minus
        the NC count.
    """
    total = len(dfUsed)
    ncOccurrences = dfUsed.str.count('NC')
    nonCancer = ncOccurrences.sum()
    return total, total - nonCancer, nonCancer

In [3]:
def get2Layers(dataf, geneName):
    """Split ``dataf`` on mutation ``geneName`` and rank the next split for each half.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Patient table; rows where ``dataf[geneName] == True`` form the
        "has mutation" group and rows where it ``== False`` form the other.
    geneName : str
        Column to split on. An empty string means "no split available".

    Returns
    -------
    tuple
        ``(topFA, topFB, dfGroupC, dfGroupNC)``: the first-ranked mutation
        from ``makeTable`` for the group with the mutation and for the group
        without it (``""`` when the corresponding table is empty), followed by
        the two groups themselves. The group with the mutation has the
        ``geneName`` column dropped and its index reset.
        ``(None, None, None, None)`` is returned when ``geneName`` is ``""``
        or when selecting the groups raises any exception.
    """
    noSplit = (None, None, None, None)
    if geneName == "":
        return noSplit

    # Any failure while selecting rows (for example an unusable ``dataf`` or
    # an unknown column) is treated as "this split does not exist".
    try:
        withMutation = dataf[dataf[geneName] == True]
        withoutMutation = dataf[dataf[geneName] == False]
    except Exception:
        return noSplit

    # Remove the split column from the carriers so that the same gene cannot
    # be selected again further down that side.
    withMutation = withMutation.drop(geneName, axis=1).reset_index(drop=True)

    # Rank candidate mutations for each side; the out-of-bag part is not used here.
    rankedWith, _ = makeTable(withMutation)
    rankedWithout, _ = makeTable(withoutMutation)

    topFA = "" if rankedWith.empty else rankedWith["Genetic Mutation"].iloc[0]
    topFB = "" if rankedWithout.empty else rankedWithout["Genetic Mutation"].iloc[0]

    return topFA, topFB, withMutation, withoutMutation

In [4]:
def makeTable(dfD, random_state=None):
    """Rank a random half of the mutations by their Phi split score on a bootstrap sample.

    Steps, in order:
      1. Randomly keep half of the mutation columns (every column except
         'Unnamed: 0'), without replacement.
      2. Draw a bootstrap sample of the rows (same size, with replacement).
      3. Collect the patient labels that were never drawn (out-of-bag).
      4. For every kept mutation compute, on the bootstrap sample,
         Phi(s,t) = 2 * PL * PR * Q, where tL holds the rows whose value
         sums into the column total, tR holds the rows whose value equals 0,
         and Q = |P(C|tL) - P(C|tR)| + |P(NC|tL) - P(NC|tR)|.
         A row counts as NC when its 'Unnamed: 0' label contains "NC".

    Parameters
    ----------
    dfD : pandas.DataFrame
        Must contain an 'Unnamed: 0' column of string patient labels; all
        other columns are treated as mutation indicators. Not modified.
    random_state : None, int, numpy.random.RandomState or numpy.random.Generator
        Optional seed/generator shared by both random draws so that a call
        can be reproduced. ``None`` (the default) keeps the previous
        behaviour of using pandas' default random source.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The up-to-10 rows with the largest 'Φ(s,t)' (one row per mutation,
        including all intermediate counts and probabilities), and the
        out-of-bag patient labels.

    Notes
    -----
    A mutation whose tL or tR side is empty in the bootstrap sample produces
    a 0/0 division and therefore a NaN score.
    """
    # One generator for both draws, so a single seed fixes the whole table.
    if random_state is None or isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Choose 1/2 of the mutations randomly, then put the label column back in front.
    sampled = dfD.drop(columns=['Unnamed: 0']).sample(
        frac=1/2, axis=1, replace=False, random_state=rng
    )
    sampled.insert(0, "Unnamed: 0", dfD["Unnamed: 0"])

    # Keep every patient label so the out-of-bag set can be derived afterwards.
    allPatients = sampled["Unnamed: 0"]

    # Bootstrap: same number of rows, drawn with replacement.
    bootstrap = sampled.sample(n=len(sampled), replace=True, random_state=rng)

    # Out-of-bag dataset: patients that never appear in the bootstrap sample.
    outOfBag = allPatients[~allPatients.isin(bootstrap["Unnamed: 0"])]

    mutations = bootstrap.drop(columns=["Unnamed: 0"])
    # Computed once and reused for both NC counts.
    isNC = bootstrap["Unnamed: 0"].str.contains("NC")
    mutationsNC = mutations.loc[isNC]

    dfValues = pd.DataFrame()
    dfValues["Genetic Mutation"] = mutations.columns

    # Node sizes: tL = column total, tR = rows whose value equals 0.
    dfValues['n(tL)'] = mutations.sum(numeric_only=True, axis=0).values
    dfValues['n(tR)'] = (mutations == 0).astype(int).sum(numeric_only=True, axis=0).values

    # The same counts restricted to NC patients; C counts follow by subtraction.
    dfValues['n(tL, NC)'] = mutationsNC.sum().values
    dfValues['n(tR, NC)'] = (mutationsNC == 0).astype(int).sum().values
    dfValues['n(tL, C)'] = dfValues["n(tL)"] - dfValues['n(tL, NC)']
    dfValues['n(tR, C)'] = dfValues["n(tR)"] - dfValues['n(tR, NC)']

    nodeTotal = dfValues['n(tL)'] + dfValues['n(tR)']
    dfValues['PL'] = dfValues['n(tL)'] / nodeTotal
    dfValues['PR'] = dfValues['n(tR)'] / nodeTotal
    dfValues['P(C | tL)'] = dfValues['n(tL, C)'] / dfValues['n(tL)']
    dfValues['P(NC | tL)'] = dfValues['n(tL, NC)'] / dfValues['n(tL)']
    dfValues['P(C | tR)'] = dfValues['n(tR, C)'] / dfValues['n(tR)']
    dfValues['P(NC | tR)'] = dfValues['n(tR, NC)'] / dfValues['n(tR)']
    dfValues['2PLPR'] = 2 * dfValues['PL'] * dfValues['PR']
    dfValues['Q'] = (
        (dfValues['P(C | tL)'] - dfValues['P(C | tR)']).abs()
        + (dfValues['P(NC | tL)'] - dfValues['P(NC | tR)']).abs()
    )
    dfValues['Φ(s,t)'] = dfValues['2PLPR'] * dfValues['Q']

    # Best candidates first; the first row is used as the split feature by callers.
    dfValues = dfValues.nlargest(10, "Φ(s,t)")

    return dfValues, outOfBag


In [5]:
# Load the patient-by-mutation table; 'Unnamed: 0' holds the patient labels
df = pd.read_csv('../mutations(1).csv')

# Keep only the mutations whose column total is at least 3 (i.e. drop all
# mutations with fewer than 3 patients). The label column is set aside while
# filtering and re-inserted as the first column afterwards.
unnamedReserve = df["Unnamed: 0"]
mutationMatrix = df.drop(columns=["Unnamed: 0"])
patientsPerMutation = mutationMatrix.sum(axis=0, numeric_only=True)
df = mutationMatrix[mutationMatrix.columns[patientsPerMutation >= 3]]
df.insert(0, "Unnamed: 0", unnamedReserve)

# Drop every sample that carries none of the remaining mutations
mutationsPerSample = df.sum(axis=1, numeric_only=True)
df = df.loc[mutationsPerSample >= 1]
display(df)

def generate(nTrees=100):
    """Build a random forest by calling ``makeTable`` repeatedly on the global ``df``.

    Parameters
    ----------
    nTrees : int, optional
        Number of tables (trees) to generate. Defaults to 100, the value that
        was previously hard-coded.

    Returns
    -------
    (list, list)
        ``tableVals`` — one ranked table per tree, and ``outOBs`` — the
        matching out-of-bag patient labels, in the same order.
    """
    tableVals = []
    outOBs = []

    for _ in range(nTrees):
        table, outOB = makeTable(df)
        tableVals.append(table)
        outOBs.append(outOB)

    return tableVals, outOBs

def classify(tableVals, patient, outOB):
    """Let every tree of the forest vote on one patient.

    For each tree the root feature is the first-ranked mutation of its table.
    The tree is then grown to depth 3 with ``get2Layers`` on that tree's
    out-of-bag rows of the global ``df``, and ``testPatient`` turns the
    resulting groups into a vote.

    Parameters
    ----------
    tableVals : list of pandas.DataFrame
        One ranked table per tree (as returned by ``generate``).
    patient : str
        Patient label to classify.
    outOB : list of pandas.Series
        Out-of-bag labels per tree; their index selects the rows of ``df``
        used to grow that tree.

    Returns
    -------
    tuple
        ``(majorityVote, cCount, ncCount, rootNodeList, aNodeList, bNodeList)``:
        the sum of all votes, the number of positive and negative votes, and
        per tree the root feature and the two depth-2 split features.
    """
    majorityVote = 0
    cCount = 0
    ncCount = 0

    rootNodeList = []
    aNodeList = []
    bNodeList = []

    for x in range(len(tableVals)):
        # Root split: first-ranked mutation of this tree's table
        topFeatureOrig = tableVals[x]["Genetic Mutation"].iloc[0]

        # Use this tree's out-of-bag samples to grow the lower levels
        dfUse = df.loc[outOB[x].keys()]

        # Depth 1 -> 2: A is the side with the root mutation, B the side without
        topFeatureA, topFeatureB, dfA, dfB = get2Layers(dfUse, topFeatureOrig)

        topFeatureA1, topFeatureA2, dfA1, dfA2 = get2Layers(dfA, topFeatureA)
        topFeatureB1, topFeatureB2, dfB1, dfB2 = get2Layers(dfB, topFeatureB)

        # Depth 3: every depth-2 group is split on ITS OWN top feature.
        # (Splitting dfA1 on topFeatureA again can never work, because that
        # column has already been dropped from dfA1.)
        _, _, dfA3, dfA33 = get2Layers(dfA1, topFeatureA1)
        _, _, dfA32, dfA332 = get2Layers(dfA2, topFeatureA2)
        _, _, dfB3, dfB33 = get2Layers(dfB1, topFeatureB1)
        _, _, dfB32, dfB332 = get2Layers(dfB2, topFeatureB2)

        # Keyword arguments keep each group attached to the testPatient
        # parameter it belongs to (A2 must be dfA2, B1 must be dfB1).
        result = testPatient(
            df, patient, topFeatureOrig, topFeatureA, topFeatureB,
            A1=dfA1, A2=dfA2, B1=dfB1, B2=dfB2,
            topFAAA=topFeatureA1, topFAA2=topFeatureA2,
            topFBBB=topFeatureB1, topFBB2=topFeatureB2,
            A3=dfA3, A32=dfA33, A33=dfA32, A34=dfA332,
            B3=dfB3, B32=dfB33, B33=dfB32, B34=dfB332,
        )

        # A missing result is simply no vote; it must not reach the comparisons.
        if result is not None:
            majorityVote += result
            if result > 0:
                cCount += 1
            elif result < 0:
                ncCount += 1

        # Record the root, split 1 and split 2 mutations of this tree
        rootNodeList.append(topFeatureOrig)
        aNodeList.append(topFeatureA)
        bNodeList.append(topFeatureB)

    return majorityVote, cCount, ncCount, rootNodeList, aNodeList, bNodeList

Unnamed: 0.1,Unnamed: 0,TP53_GRCh38_17:7675088-7675088_Missense-Mutation_SNP_C-T-T_C-C-T,RFX5_GRCh38_1:151346265-151346265_Frame-Shift-Del_DEL_G----_G-G--,FDPS_GRCh38_1:155317840-155317840_Intron_DEL_A-A--,KRTCAP3_GRCh38_2:27444218-27444218_3'UTR_DEL_T-T--,GHRL_GRCh38_3:10292765-10292765_Intron_DEL_A-A--_A----,LRRC2_GRCh38_3:46518878-46518878_3'UTR_DEL_T-T--,TLR10_GRCh38_4:38773335-38773335_Frame-Shift-Del_DEL_T-T--,SRFBP1_GRCh38_5:122027171-122027171_3'UTR_DEL_T-T--,C5orf24_GRCh38_5:134859364-134859364_3'Flank_DEL_T-T--,...,APC_GRCh38_5:112840063-112840064_Frame-Shift-Ins_INS_----T,ZBTB43_GRCh38_9:126834070-126834070_3'UTR_DEL_A-A--,NRAS_GRCh38_1:114716126-114716126_Missense-Mutation_SNP_C-C-T_C-C-A_C-T-T,HCRTR2_GRCh38_6:55280416-55280416_Silent_SNP_G-G-A,RSPO2_GRCh38_8:107989148-107989148_Missense-Mutation_SNP_C-C-T,MLANA_GRCh38_9:5897631-5897631_Missense-Mutation_SNP_G-G-A,DIS3_GRCh38_13:72763512-72763512_Missense-Mutation_SNP_C-C-T,APC_GRCh38_5:112839574-112839574_Nonsense-Mutation_SNP_C-C-G_C-C-A,GGNBP2_GRCh38_17:36585696-36585696_Intron_DEL_A-A--,TENM3_GRCh38_4:182792316-182792316_Missense-Mutation_SNP_G-G-A_G-A-A
0,C1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,C2,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,C3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,NC1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,C4,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,C108,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
226,NC119,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
227,C109,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
228,C110,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
import random
from IPython.display import display

# Returns a 1 if the patient is predicted to have cancer and a 0 if not
def testPatient(dfTestPatients, patient, topFOrig, topFAA, topFBB, A1, A2, B1, B2, topFAAA, topFAA2, topFBBB, topFBB2, A3, A32, A33, A34, B3, B32, B33, B34):
    # Instance checking used to stop crashes if no gene for a given feature table split exists
    if(isinstance(A1, pd.DataFrame)):
        totalCountA1, CPatientsA1, NCPatientsA1 = geneCounts(A1["Unnamed: 0"])
    if(isinstance(B1, pd.DataFrame)):
        totalCountB1, CPatientsB1, NCPatientsB1 = geneCounts(B1["Unnamed: 0"])
    if(isinstance(A2, pd.DataFrame)):
        totalCountA2, CPatientsA2, NCPatientsA2 = geneCounts(A2["Unnamed: 0"])
    if(isinstance(B2, pd.DataFrame)):
        totalCountB2, CPatientsB2, NCPatientsB2 = geneCounts(B2["Unnamed: 0"])
    if(isinstance(A3, pd.DataFrame)):
        totalCountA3, CPatientsA3, NCPatientsA3 = geneCounts(A3["Unnamed: 0"])
    if(isinstance(A32, pd.DataFrame)):
        totalCountA32, CPatientsA32, NCPatientsA32 = geneCounts(A32["Unnamed: 0"])
    if(isinstance(A33, pd.DataFrame)):
        totalCountA33, CPatientsA33, NCPatientsA33 = geneCounts(A33["Unnamed: 0"])
    if(isinstance(A34, pd.DataFrame)):
        totalCountA34, CPatientsA34, NCPatientsA34 = geneCounts(A34["Unnamed: 0"])
    if(isinstance(B3, pd.DataFrame)):
        totalCountB3, CPatientsB3, NCPatientsB3 = geneCounts(B3["Unnamed: 0"])
    if(isinstance(B32, pd.DataFrame)):
        totalCountB32, CPatientsB32, NCPatientsB32 = geneCounts(B32["Unnamed: 0"])
    if(isinstance(B33, pd.DataFrame)):
        totalCountB33, CPatientsB33, NCPatientsB33 = geneCounts(B33["Unnamed: 0"])
    if(isinstance(B34, pd.DataFrame)):
        totalCountB34, CPatientsB34, NCPatientsB34 = geneCounts(B34["Unnamed: 0"])
        
    # Classifies patients based on whether they have genes at each layer
    if (((dfTestPatients["Unnamed: 0"].eq(patient) == True) & (dfTestPatients[topFOrig] == True)).any()):
        if (isinstance(A1, pd.DataFrame)):
            if(((dfTestPatients["Unnamed: 0"].eq(patient) == True) & (dfTestPatients[topFAA] == True)).any()):
                if(topFAAA != ""):
                    if (isinstance(A3, pd.DataFrame)):
                        if(((dfTestPatients["Unnamed: 0"].eq(patient) == True) & (dfTestPatients[topFAAA] == True)).any()):
                            if(CPatientsA3 >= NCPatientsA3):
                                return 1
                        if(CPatientsA32 >= NCPatientsA32):
                            return 1
                if(CPatientsA1 >= NCPatientsA1):
                    return 1
                else:
                    return -1
        
        if(topFAA2 != ""):
            if (isinstance(A33, pd.DataFrame)):
                if(((dfTestPatients["Unnamed: 0"].eq(patient) == True) & (dfTestPatients[topFAA2] == True)).any()):
                    if(CPatientsA33 >= NCPatientsA33):
                        return 1
                if(CPatientsA34 >= NCPatientsA34):
                    return 1
        if(CPatientsA2 >= NCPatientsA2):
            return 1
        else:
            return -1
    elif (((dfTestPatients["Unnamed: 0"].eq(patient) == True) & (dfTestPatients[topFOrig] == False)).any()):
        if (isinstance(B1, pd.DataFrame)):
            if(((dfTestPatients["Unnamed: 0"].eq(patient) == True) & (dfTestPatients[topFBB] == True)).any()):
                if(topFBBB != ""):
                    if (isinstance(B3, pd.DataFrame)):
                        if(((dfTestPatients["Unnamed: 0"].eq(patient) == True) & (dfTestPatients[topFBBB] == True)).any()):
                            if(CPatientsB3 >= NCPatientsB3):
                                return 1
                        if(CPatientsB32 >= NCPatientsB32):
                            return 1
                if(CPatientsB1 >= NCPatientsB1):
                    return 1
                else:
                    return -1
        
        if(topFBB2 != ""):
            if (isinstance(B33, pd.DataFrame)):
                if(((dfTestPatients["Unnamed: 0"].eq(patient) == True) & (dfTestPatients[topFBB2] == True)).any()):
                    if(CPatientsB33 >= NCPatientsB33):
                        return 1
                    #else:
                        #return -1
                if(CPatientsB34 >= NCPatientsB34):
                    return 1
                #else:
                    #return -1
        if(CPatientsB2 >= NCPatientsB2):
            return 1
        else:
            return -1
    else:
        return 0

def evaluator(TP, TN, FP, FN):
    """Print and return classification metrics derived from a confusion matrix.

    Parameters
    ----------
    TP, TN, FP, FN : int
        True positives, true negatives, false positives, false negatives.

    Returns
    -------
    list of float
        ``[Accuracy, Sensitivity, Specificity, Precision, MissRate,
        FalseDiscoveryRate, FalseOmissionRate]``. A ratio whose denominator
        is zero is reported as 1.0.

    Notes
    -----
    Accuracy is ``(TP + TN) / (TP + TN + FP + FN)``. The denominator is the
    real sample count; it is no longer padded when one of the predicted
    classes is empty, which used to lower the reported accuracy.
    """
    def ratio(numerator, denominator):
        # Undefined ratios fall back to 1.0
        return numerator / denominator if denominator else 1.0

    total = TP + TN + FP + FN

    Accuracy = ratio(TP + TN, total)
    Sensitivity = ratio(TP, TP + FN)
    Specificity = ratio(TN, TN + FP)
    Precision = ratio(TP, TP + FP)
    MissRate = ratio(FN, FN + TP)
    FalseDiscoveryRate = ratio(FP, FP + TP)
    FalseOmissionRate = ratio(FN, FN + TN)

    print("Accuracy: ", Accuracy)
    print("Sensitivity: ", Sensitivity)
    print("Specificity: ", Specificity)
    print("Precision: ", Precision)
    print("Miss Rate: ", MissRate)
    print("False Discovery Rate: ", FalseDiscoveryRate)
    print("False Omission Rate: ", FalseOmissionRate)
    return [Accuracy, Sensitivity, Specificity, Precision, MissRate, FalseDiscoveryRate, FalseOmissionRate]
    
import plotly.express as px
def heatMap(TP, TN, FP, FN, count):
    """Show a confusion-matrix heat map with plotly.

    Rows are the actual condition and columns the predicted condition, each
    in the order Cancer / No Cancer. Every axis label carries the total of
    its row or column; a count that is falsy (for example ``None``) is
    treated as 0 in those totals. ``count`` is shown in the title as the
    total number of patients. The figure is displayed; nothing is returned.
    """
    truePos, trueNeg = TP or 0, TN or 0
    falsePos, falseNeg = FP or 0, FN or 0

    # Column totals (predicted) and row totals (actual)
    predictedLabels = [
        "Cancer - " + str(truePos + falsePos),
        "No Cancer - " + str(trueNeg + falseNeg),
    ]
    actualLabels = [
        "Cancer - " + str(truePos + falseNeg),
        "No Cancer - " + str(falsePos + trueNeg),
    ]

    matrix = [[TP, FN], [FP, TN]]
    axisTitles = {"x": "Predicted Condition", "y": 'Actual Condition'}

    figure = px.imshow(
        matrix,
        labels=axisTitles,
        x=predictedLabels,
        y=actualLabels,
        title="Total Patients: " + str(count),
        text_auto=True,
    )
    # Put the predicted-condition axis above the matrix
    figure.update_layout(xaxis={'side': 'top'})
    figure.show()

ModuleNotFoundError: No module named 'plotly'

In [None]:
def classifyPatients():
    """Generate a forest and classify the out-of-bag patients of its first tree.

    Every label in the first tree's out-of-bag set is passed to ``classify``.
    A patient is reported as having cancer when the summed vote is >= 0; the
    tie counts as cancer on purpose (better safe than sorry). The prediction
    is compared with the first character of the label ('C' = cancer,
    'N' = no cancer) to fill the confusion matrix. Per-patient results and
    the usage count of every split mutation are printed.

    Returns
    -------
    tuple
        ``(TP, FN, FP, TN, outOfBag)`` where ``outOfBag`` is the out-of-bag
        label Series of the first tree.
    """
    # Local import so this cell does not depend on the notebook's import cell
    from collections import Counter

    def printUsage(heading, nodes):
        # Counter keeps first-seen order, so the report is stable between
        # runs; str() guards against entries that are not strings (e.g. None).
        print(heading)
        for ele, uses in Counter(nodes).items():
            print(str(ele) + " used times: " + str(uses))
        print()

    TP = 0
    FN = 0
    FP = 0
    TN = 0
    tableVals, outOB = generate()

    rootNodeList = []
    aNodeList = []
    bNodeList = []

    for i in outOB[0].values:
        majorityVote2, cCount, ncCount, rootNodeListTmp, aNodeListTmp, bNodeListTmp = classify(tableVals, i, outOB)

        rootNodeList += rootNodeListTmp
        aNodeList += aNodeListTmp
        bNodeList += bNodeListTmp

        # We classify a patient as having cancer if the vote is >= 0;
        # = 0 because it's better to be safe than sorry.
        thisTime = majorityVote2 >= 0
        if thisTime:
            print(i + " has cancer.")
        else:
            print(i + " does not have cancer.")

        print("Count C: " + str(cCount))
        print("Count NC: " + str(ncCount))

        # The first character of the label is the ground truth
        if i[0] == 'C':
            if thisTime:
                TP += 1
            else:
                FN += 1
        elif i[0] == 'N':
            if thisTime:
                FP += 1
            else:
                TN += 1

    # Print how often each mutation was used at the root and the depth-2 splits
    printUsage("Root Splitting Mutation: ", rootNodeList)
    printUsage("A Splitting Mutation: ", aNodeList)
    printUsage("B Splitting Mutation: ", bNodeList)

    return TP, FN, FP, TN, outOB[0]
        

In [None]:
# Classification: run the out-of-bag evaluation, then report the metrics
# followed by the raw confusion-matrix counts
print("Out of Bag classification: ")
TP, FN, FP, TN, oob = classifyPatients()

evaluator(TP, TN, FP, FN)
for countLabel, countValue in (("TP: ", TP), ("TN: ", TN), ("FP: ", FP), ("FN: ", FN)):
    print(countLabel, countValue)

In [None]:
# Confusion matrix, titled with the number of out-of-bag patients
totalPatients = len(oob)
heatMap(TP, TN, FP, FN, totalPatients)