In [1]:
# Root directory that the data paths in this notebook are built from. Paths are
# formed by plain string concatenation (e.g. base_dir + 'predictions/...'), so
# the value needs to end with a path separator.
# Previous absolute location, kept for reference:
#base_dir = '/home/gbiagini/dev/nn-sero-pytorch/'
base_dir = './'

import pandas as pd
import numpy as np
import torch
import random
from fastai import *
from fastai.basics import *
from fastai.tabular import *
from tqdm import tqdm

In [2]:
def random_seed(seed_value, use_cuda):
    """Seed every random number generator this notebook relies on.

    Seeds NumPy, PyTorch (CPU) and Python's ``random`` module, in that order.
    When ``use_cuda`` is truthy the CUDA generators are seeded as well and
    cuDNN is switched to deterministic mode with benchmarking disabled.

    Args:
        seed_value: Integer seed passed unchanged to every generator.
        use_cuda: Whether to also seed and configure the CUDA/cuDNN backends.

    Returns:
        None.
    """
    # CPU-side generators: NumPy, torch, then the stdlib.
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed_value)

    if not use_cuda:
        return

    # GPU-side generators (current device, then all devices).
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Force deterministic cuDNN kernels and turn off the autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
# Module-level list of loci; later cells iterate over it.
loci = ['A', 'B', 'C', 'DPB1', 'DQB1', 'DRB1']
# Single-locus alternative, kept for quick runs:
#loci = ['A']

# Compare the new predictions with the old ones, locus by locus, and compute the concordance.
def metrics(print_all='no', base_path=None, loci=None):
    """Compare new serology predictions with the old ones, locus by locus.

    For every locus the function reads

    * ``predictions/<locus>_predictions.csv`` (columns ``allele`` and
      ``serology``; ``serology`` holds a stringified list such as
      ``"['1', '10']"``), and
    * ``old-predictions/<locus>.chile`` (only lines containing ``%`` are used;
      the first whitespace-separated token minus its last character is the
      allele name, the remaining tokens minus ``[100.00%]`` are the old
      assignment),

    and writes into ``comparison/``:

    * ``<locus>_compfile.csv``    - alleles whose old and new assignments differ
    * ``<locus>_similar.csv``     - alleles whose old and new assignments agree
    * ``<locus>_newsies.csv``     - alleles predicted now but absent from the old file
    * ``<locus>_concordance.txt`` - the counts and the concordance percentage

    Parameters
    ----------
    print_all : str
        Pass ``"yes"`` to also print the per-locus summary.
    base_path : str, optional
        Directory containing the folders above. Defaults to the module-level
        ``base_dir``.
    loci : iterable of str, optional
        Loci to process. Defaults to A, B, C, DPB1, DQB1 and DRB1.

    Returns
    -------
    dict
        Maps each locus to its concordance percentage (float). The value is
        NaN when the old and new predictions share no allele.
    """
    import os  # local import: the notebook does not import os explicitly

    if base_path is None:
        base_path = base_dir
    if loci is None:
        loci = ['A', 'B', 'C', 'DPB1', 'DQB1', 'DRB1']

    # function to check if value can be an integer - to eliminate excess characters from serology labels
    def checkInt(x):
        try:
            int(x)
            return True
        except ValueError:
            return False

    compareDir = os.path.join(base_path, "comparison")
    os.makedirs(compareDir, exist_ok=True)

    concordances = {}

    for loc in loci:
        # ---- new predictions: allele -> list of integer-like serology labels
        newPreds = pd.read_csv(os.path.join(base_path, "predictions", loc + "_predictions.csv"))
        rawPredict = newPreds.set_index('allele')["serology"].to_dict()
        newPredict = {}
        for nKey, adjustMe in rawPredict.items():
            # Undo the str(list) formatting: drop brackets, spaces and quotes.
            for junk in ('[', ']', ' ', "'"):
                adjustMe = adjustMe.replace(junk, '')
            newPredict[nKey] = [x.strip('a') for x in adjustMe.split(',') if checkInt(x)]

        # ---- old predictions: allele -> list of tokens following the allele name
        oldPredict = {}
        with open(os.path.join(base_path, "old-predictions", loc + ".chile"), "r") as handle:
            for line in handle:
                if '%' not in line:
                    continue
                fields = [x for x in line.split() if x != '[100.00%]']
                if not fields:
                    continue
                # The first token carries one trailing character that is dropped.
                allele = loc + "*" + str(fields[0][:-1])
                oldPredict[allele] = fields[1:]

        # ---- alleles present in both: split into agreeing / disagreeing
        diffRows = []
        simRows = []
        for each, oldAssignment in oldPredict.items():
            if each not in newPredict:
                continue
            row = {
                "Allele": each,
                "Old Assignment": oldAssignment,
                "New Assignment": newPredict[each],
            }
            if set(newPredict[each]) != set(oldAssignment):
                diffRows.append(row)
            else:
                simRows.append(row)

        # Explicit columns keep the CSV header even when there are no rows.
        bothColumns = ["Allele", "Old Assignment", "New Assignment"]
        diffFrame = pd.DataFrame(diffRows, columns=bothColumns)
        diffFrame.to_csv(os.path.join(compareDir, loc + "_compfile.csv"), index=False)
        simFrame = pd.DataFrame(simRows, columns=bothColumns)
        simFrame.to_csv(os.path.join(compareDir, loc + "_similar.csv"), index=False)

        # ---- alleles that only exist in the new predictions
        newRows = [
            {"Allele": allele, "Serologic Assignment": assignment}
            for allele, assignment in newPredict.items()
            if allele not in oldPredict
        ]
        # Built from the new-only rows (the original mistakenly reused simDict here).
        newFrame = pd.DataFrame(newRows, columns=["Allele", "Serologic Assignment"])
        newFrame.to_csv(os.path.join(compareDir, loc + "_newsies.csv"), index=False)

        # ---- concordance summary
        simLen = len(simRows)
        diffLen = len(diffRows)
        total = simLen + diffLen
        # Undefined (NaN) rather than a ZeroDivisionError when nothing overlaps.
        concordance = (simLen / total) * 100 if total else float("nan")
        concordances[loc] = concordance

        summary = [
            "HLA-" + loc + " Similar: " + str(simLen),
            "HLA-" + loc + " Different: " + str(diffLen),
            "HLA-" + loc + " Concordance: " + str(concordance) + "%",
        ]
        with open(os.path.join(compareDir, loc + "_concordance.txt"), "w") as fhandle:
            # One statistic per line.
            fhandle.write("\n".join(summary) + "\n")
        if print_all == "yes":
            for text in summary:
                print(text)
    return concordances


In [6]:
# Concordance of the prediction files currently on disk, measured before retraining.
pre_concord = metrics()

loci = ['A', 'B', 'C', 'DPB1', 'DQB1', 'DRB1']
# Per-locus training settings: number of epochs and hidden-layer sizes.
epoch = { "A":120, "B":100, "C":50, "DPB1":100, "DQB1":100, "DRB1":100 }
layer = { "A":[2000, 1500], "B":[1000], "C":[150], "DPB1":[1000], "DQB1":[1000], "DRB1":[1000] }


# Train one tabular model per locus and write its test-set predictions to
# predictions/<locus>_predictions.csv.
for locus in loci:

  # Re-seed before every locus so each model starts from the same RNG state.
  random_seed(50,use_cuda=False)

  tng_df = pd.read_csv(base_dir + 'randomforest/training/' + locus + '_train.csv')
  tst_df = pd.read_csv(base_dir + 'randomforest/testing/' + locus + '_test.csv')
  val_df = pd.read_csv(base_dir + 'randomforest/training/' + locus + '_validation.csv')
  tng_idx = len(tng_df)
  val_len = len(val_df)
  # Training and validation rows are stacked below, so the validation rows
  # occupy positions [tng_idx, val_idx).
  val_idx = tng_idx + val_len
  tst_idx = len(tst_df)

  # Batch sizes: roughly two thirds of the row count, falling back to the
  # full count for very small sets.
  tbatch = int(tng_idx // 1.5)
  if (tbatch <= 1):
    tbatch = tng_idx // 1
  # NOTE(review): this is derived from val_idx (training + validation rows),
  # not val_len - confirm that is intended.
  vbatch = int(val_idx // 1.5)
  if (vbatch <= 1):
    vbatch = val_idx // 1
  if locus == "C":
    # Locus C is trained full-batch.
    tbatch = tng_idx
    vbatch = val_idx  # was `vbatch == val_idx`, a comparison with no effect

  # Stack training and validation rows (training first).
  df = pd.concat([tng_df, val_df])

  # Every column except the identifier and the label is a categorical feature.
  AAs = [each for each in tng_df if each not in ('allele', 'serology')]

  dep_var = 'serology'
  cat_names = ['allele'] + AAs
  procs = [FillMissing, Categorify]

  test = TabularList.from_df(tst_df, path=Path(''), cat_names=cat_names)

  data = (TabularList.from_df(df=df, path=Path(''), procs=procs, cat_names=cat_names)
                              .split_by_idx(list(range(tng_idx,val_idx)))
                              .label_from_df(cols=dep_var, label_delim=' ')
                              .add_test(test)
                              .databunch(bs=tbatch, val_bs=vbatch))

  acc_02 = partial(accuracy_thresh, thresh=0.99)
  f_score = partial(fbeta, thresh=0.55)

  pre_vote = {}
  # run number (as str) -> {allele: stringified prediction list}
  avg_pred = {}

  # Only a single run is performed (range(1, 2)); the mode taken further
  # down would pick the most frequent prediction if more runs were added.
  for n in range(1,2):
    learn = tabular_learner(data, opt_func=optim.SGD, layers=layer[locus], metrics=[acc_02, f_score])
    print(data.classes)

    lr = 0.5
    learn.fit_one_cycle(epoch[locus], max_lr=slice(lr))

    test_id = list(tst_df['allele'])

    classes = data.classes
    predictions = []
    print(classes)

    # Predict the test rows one at a time.
    for i in tqdm(range(0,tst_idx)):
      category = str(learn.predict(tst_df.iloc[i], thresh=0.52)[0])
      # NOTE(review): str.strip removes any of these characters from both
      # ends, not the literal prefix 'MultiCategory '.
      sero = category.strip('MultiCategory ')
      sero = sero.replace(';',' ')
      sero = sero.replace('a','')
      predictions.append(sero.split())

    pre_vote = {test_id[j]: str(predictions[j]) for j in range(len(test_id)) }
    avg_pred[str(n)] = pre_vote


  # Row-wise mode across runs; column 0 holds the first modal value.
  avg_frame = pd.DataFrame.from_dict(avg_pred)
  mode = avg_frame.mode(axis=1)
  rmode = mode[0]
  rmode = pd.DataFrame(rmode)
  rmode = rmode.reset_index()
  rmode.columns=['allele', 'serology']

  rmode.to_csv(base_dir + 'predictions/' + locus + '_predictions.csv', index=False)


['1', '10', '11', '2', '23', '24', '25', '26', '28', '29', '3', '30', '31', '32', '33', '34', '36', '43', '66', '68', '69', '74', '80', '9']


  0%|          | 0/1692 [00:00<?, ?it/s]['1', '10', '11', '2', '23', '24', '25', '26', '28', '29', '3', '30', '31', '32', '33', '34', '36', '43', '66', '68', '69', '74', '80', '9']
  1%|          | 15/1692 [00:07<13:14,  2.11it/s]


KeyboardInterrupt: 

In [None]:
# Concordance after retraining, reported next to the change from the
# pre-training value.
post_concord = metrics()

for loc in loci:
    # Format numerically: slicing str(float) to five characters misreports
    # values that print in scientific notation (1.42e-14 would show as "1.421").
    print(loc + " Concordance:\t\t\t\t" + "{:.2f}".format(post_concord[loc]) + "%")
    change = post_concord[loc] - pre_concord[loc]
    print("% Change:\t\t\t\t" + "{:.2f}".format(change) + "%")

In [None]:
# Re-save the prediction frame left over from the training loop. `rmode` and
# `locus` are whatever the loop assigned last, so only that one file is
# rewritten.
last_predictions_path = base_dir + 'predictions/' + locus + '_predictions.csv'
rmode.to_csv(last_predictions_path, index=False)