In [1]:
from google.colab import drive
from fastai.imports import *
drive.mount('/content/gdrive', force_remount=True)
root_dir = "/content/gdrive/My Drive/Colab Notebooks/dev/"
base_dir = root_dir + 'nn-sero-pytorch/randomforest/'
NN_dir = root_dir + 'nn-sero-pytorch'
path = Path(base_dir)


ModuleNotFoundError: No module named 'google'

In [1]:
import os
NN_dir = os.path.dirname(os.path.abspath('.'))
base_dir = os.path.join(NN_dir, 'randomforest/')

In [2]:
import pandas as pd
import numpy as np
import sys
import math
import lime
import lime.lime_tabular
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

def metrics(print_all='no'):
    loci = ['A', 'B', 'C', 'DPB1', 'DQB1', 'DRB1']
    #loci = ['A']

    # function to check if value can be an integer - to eliminate excess characters from serology labels
    def checkInt(x):
        try:
            int(x)
            return True
        except ValueError:
            return False

    concordances = {}

    for loc in loci:
        newDict = {}
        simDict = {}
        diffDict = {}
        oldPredict = {}
        newPredict = {}
        oldPredFile = NN_dir + "/old-predictions/" + loc + ".chile"
        newPreds = pd.read_csv(base_dir + "predictions/" + loc + "_predictions.csv")
        newPreds = newPreds.set_index('allele')
        newPreds = newPreds.to_dict()
        newPredict = newPreds["serology"]
        for nKey in newPredict.keys():
            adjustMe = str(newPredict[nKey])
            adjustMe = adjustMe.replace('[','')
            adjustMe = adjustMe.replace(']','')
            adjustMe = adjustMe.replace('a','')
            adjustMe = adjustMe.replace("'",'')
            adjustMe = adjustMe.split(' ')
            newPredict[nKey] = [x.strip('a') for x in adjustMe if checkInt(x)]
        with open(oldPredFile, "r") as handle:
            for line in handle:
                if line.find('%') == -1:
                    next
                else:
                    line = line.split()
                    if line == []:
                        next
                    else:
                        line[:] = [x for x in line if (x != '[100.00%]')]
                        allele = loc + "*" + str(line[0][:-1])
                        oldPredict[allele] = line[1:]


        for each in oldPredict.keys():
            allDict = {}
            allDict["Allele"] = each
            allDict["Old Assignment"] = oldPredict[each]
            if each not in newPredict.keys():
                next
            else:
                allDict["New Assignment"] = newPredict[each]
                if set(newPredict[each]) != set(oldPredict[each]):
                    diffDict[each] = allDict
                elif set(newPredict[each]) == set(oldPredict[each]):
                    simDict[each] = allDict
        diffFrame = pd.DataFrame.from_dict(diffDict)
        diffFrame = diffFrame.transpose()
        diffFrame.to_csv(base_dir + "comparison/" + loc + "_compfile.csv", index=False)
        simFrame = pd.DataFrame.from_dict(simDict)
        simFrame = simFrame.transpose()
        simFrame.to_csv(base_dir + "comparison/" + loc + "_similar.csv", index=False)
        

        for allele in newPredict.keys():
            allDict = {}
            allDict["Allele"] = allele
            allDict["Serologic Assignment"] = newPredict[allele]
            if allele not in oldPredict.keys():
                newDict[allele] = allDict
        newFrame = pd.DataFrame.from_dict(simDict)
        newFrame = newFrame.transpose()
        newFrame.to_csv(base_dir + "comparison/" + loc + "_newsies.csv", index=False)

        simLen = len(simFrame)
        diffLen = len(diffFrame)
        with open(base_dir + "comparison/" + loc + "_concordance.txt", "w+") as fhandle:
            fhandle.write("HLA-" +loc+ " Similar: " + str(simLen))
            fhandle.write("HLA-" +loc+ " Different: " + str(diffLen))
            concordance = (simLen / (simLen + diffLen)) * 100
            concordances[loc] = concordance
            fhandle.write("HLA-" +loc+ " Concordance: " + str(concordance) + "%")
            if print_all == "yes":
                print("HLA-" +loc+ " Similar: " + str(simLen))
                print("HLA-" +loc+ " Different: " + str(diffLen))
                print("HLA-" +loc+ " Concordance: " + str(concordance) + "%")
    return concordances

#main(print_all="yes")

In [3]:
np.set_printoptions(threshold=sys.maxsize)

def one_hot_decode(df):
	df['serology']=''

	for col in df.columns:
		df.loc[df[col]==1,'serology'] = df['serology']+col+';'

	return df

def fix_data(uniques, data, loc, iset, ident):
  sero = {}
  for row in data.itertuples(name='Pandas'):
    sero[row.allele] = str(row.serology)
    #sero[row[1]] = str(row[-1])
	
  data = data.drop('serology', axis=1)

  for key in sero.keys():
    '''
  	# not applicable for old_sets train/test
    if (sero[key].find(';') != -1):
      sero[key] = sero[key].replace('a','')
      sero[key] = sero[key].split(';')
    else:
      sero[key] = sero[key].replace('a','')
      sero[key] = [sero[key]]
    '''

    #for old_sets train/test
    sero[key] = sero[key].split(' ')
    
    for x in sero[key]:
      if (x not in uniques):
        uniques.append(x)
      else:
        continue

  uniques = list(map(int, uniques))
  uniques.sort()
  uniques = list(map(str, uniques))
  
  for y in uniques:
    data[y] = 0

  one_sero = {}
  for key in sero.keys():
    one_sero[key] = { some_key : ("1" if (some_key in sero[key]) else "0")
		                  for some_key in uniques }
  one_df = pd.DataFrame.from_dict(one_sero)
  one_df = one_df.transpose()
  one_df.index.name = "allele"
  data = data.set_index('allele')
  data.update(one_df, overwrite=True)
  data.to_csv(base_dir + 'randfor/'+iset+'/'+loc+'_'+ident+'.csv', index=True)
  return data, uniques


In [4]:
RSEED = 0

pre_concord = metrics()

loci = ["A", "B", "C", "DQB1", "DRB1"]
print("\nPredicting...")
for loc in tqdm(loci):
  uniques = []
  print('\n'+loc)
  #features = pd.read_csv(base_dir + "training/" + loc + "_train.csv")
  features = pd.read_csv(base_dir + "park/park0216/training/" + loc + "_train.csv")
  features['serology'] = features['serology'].apply(lambda x: x.replace('a','').replace(';',' '))
  features, sers = fix_data(uniques, features,loc,iset='training',ident='train')
  #vfeatures = pd.read_csv(base_dir + "training/" + loc + "_validation.csv")
  vfeatures = pd.read_csv(base_dir + "park/park0216/training/" + loc + "_validation.csv")
  vfeatures['serology'] = vfeatures['serology'].apply(lambda x: x.replace('a','').replace(';',' '))
  vfeatures, vsers = fix_data(uniques, vfeatures,loc,iset='training',ident='validation')
  #test = pd.read_csv(base_dir + "testing/" + loc + "_test.csv")
  test = pd.read_csv(base_dir + "park/park0216/testing/" + loc + "_test.csv")
  test = test.drop('serology', axis=1)

  #remove uncategorized C*12+ alleles
  if loc == "C":
    Cdrops = ["C*12", "C*14", "C*15", "C*16", "C*17", "C*18"]
    test = test.drop(test[test.allele.str.contains('|'.join(Cdrops))].index)
  test.to_csv(base_dir + 'randfor/testing/'+loc+'_test.csv', index=True)

  features = features.append(vfeatures)
  labels = np.array(features[sers])
  features = features.drop(sers, axis=1)
  features = features.reset_index()
  indices = features["allele"]
  indices = list(indices)
  features = features.drop('allele', axis=1)
  feature_list = list(features.columns)
  n_features = len(feature_list)
  maxfeat = int(math.sqrt(n_features))+3

  features = np.array(features)
  labels[labels!=labels]='0'
  features[features!=features]='0'
  features = features.astype(np.int)
  labels = labels.astype(np.int)

  test_idcs = test['allele']
  test = test.drop('allele', axis=1)
  test_list = list(test.columns)
  test = np.array(test)
  test[test!=test]='0'
  test = test.astype(np.int)

  forest = RandomForestClassifier(n_estimators=550, bootstrap=False, max_features=maxfeat, n_jobs=-1,   random_state=RSEED)
  multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
  multi_target_forest.fit(features,labels)
  predictions = multi_target_forest.predict(test)

  ind_labels = [str(x) for x in sers]
  #explainer = lime.lime_tabular.LimeTabularExplainer(features,feature_names=feature_list,class_names=ind_labels,kernel_width=5)
  #for rowexp in range(0,2):
    #exp = explainer.explain_instance(test[rowexp], multi_target_forest.predict_proba, num_features=maxfeat)
    #exp.show_in_notebook(show_table=True)

  preds_output = pd.DataFrame(predictions, index=test_idcs, columns=ind_labels)
  preds_output = one_hot_decode(preds_output)
  preds_output = preds_output.drop(ind_labels, axis=1)
  preds_output.index.name = 'allele'
  preds_output = preds_output.apply(lambda x: str((' '.join(x['serology'].split(';')))[:-1]), result_type='broadcast', axis=1)
  #print(preds_output)
  preds_output.to_csv(base_dir + 'predictions/'+loc+'_predictions.csv', index=True)

print("Done.")

  0%|          | 0/5 [00:00<?, ?it/s]
Predicting...

A
 20%|██        | 1/5 [00:09<00:38,  9.72s/it]
B
 40%|████      | 2/5 [00:34<00:56, 18.77s/it]
C
 60%|██████    | 3/5 [00:40<00:25, 12.92s/it]
DQB1
 80%|████████  | 4/5 [00:45<00:09,  9.66s/it]
DRB1
100%|██████████| 5/5 [00:54<00:00, 10.95s/it]Done.



In [5]:
post_concord = metrics()

for loc in loci:
	print(loc + " Concordance:\t\t\t\t" + str(post_concord[loc])[:5] + "%")
	change = post_concord[loc] - pre_concord[loc]
	print("% Change:\t\t\t\t" + str(change)[:5] + "%")

A Concordance:				97.74%
% Change:				0.0%
B Concordance:				90.44%
% Change:				0.0%
C Concordance:				98.64%
% Change:				0.0%
DQB1 Concordance:				99.27%
% Change:				0.0%
DRB1 Concordance:				95.11%
% Change:				0.0%
