In [3]:
import pandas as pd
import numpy as np
import json
from pandas.api.types import CategoricalDtype

In [2]:
with open('//home//kin672//Kira_GenTB//Isolating Relevant Files//relevant_file_names.json') as f:
    relevant_files = json.load(f) # This is a dictionary with format {STRAIN ID: [OUTPUT PATH, LINEAGE PATH]}

    


In [2]:
prefix = '//n//groups//gentb_www//predictData//'

# RF probability thresholds (if probability is below threshold, it is susceptible to the drug)
thresholds = {'rif': 0.002, 'inh': 0.22, 'emb': 0.082,'pza': 0.013,'str':0.047, 'cap':0.25,'amk':0.6,'cip': 0.42,'kan': 0.63,'levo':0.41,'oflx':0.33,'pas': 0.001,'eth': 0.32}
relevant_files = {'TEST0248': {'Folder': 'tbdata_00001288',
  'Resistance': 'TEST0248.matrix.json',
  'Lineage': 'TEST0248_cut_lineage.txt'},
 'TEST0246': {'Folder': 'tbdata_00001288',
  'Resistance': 'TEST0246.matrix.json',
  'Lineage': 'TEST0246_cut_lineage.txt'},
 'TEST0236': {'Folder': 'tbdata_00001288',
  'Resistance': 'TEST0236.matrix.json',
  'Lineage': 'TEST0236_cut_lineage.txt'}}

In [7]:
catch_problems2 = []
catch_problems = []
catch_problems_3 = []
master_df = pd.DataFrame({'ID':[], 'Drug':[], 'Resistant':[], 'Lineage':[]})

for strainID in relevant_files:
    value = relevant_files.get(strainID)
    json_path = prefix + value['Folder'] + '//' + value['Resistance']
    lineage_path = prefix + value['Folder'] + '//' + value['Lineage']
        
    # Parse lineage
    lineage = pd.read_csv(lineage_path, sep = '\t')
    if any(['freschi' in string for string in lineage.columns]):
        freschi_lineage = lineage.loc[0, list(lineage.columns)[list(np.where(['freschi' in string for string in lineage.columns])[0])[0]]].replace('(1/1)','') 
    else:
        catch_problems_3.append(lineage_path)
        freschi_lineage = 'unfilled'
    
    # Parse json
    with open(json_path) as f:
        resistance = json.load(f)
        
    # Cut out the extra output at the bottom
    resistance = resistance[0]
    if len(resistance) != 13:
        catch_problems2.append(json_path)
    
    # Pull binary resistance outcome and probability per drug!
    for drug_index in range(len(resistance)):
        profile = resistance[drug_index]
        drug_name = profile[1]
        resistant = 'unfilled'
        
        # If the json file has only three outputs per drug, we need to manually determine the binary output using thresholds.
        if len(profile) == 5: # 3 outputs - probability is at index 2
            if float(profile[2]) < thresholds[drug_name]:
                resistant = '0'
            else:
                resistant = '1'
        elif len(profile) == 6: # 4 outputs - binary output is at index 2
            resistant = profile[2] 
        else:
            catch_problems.append(strainID + '/' + drug_name)
            continue
        
        # Add a line to the dataframe
        master_df = pd.concat([master_df, pd.DataFrame({'ID':[strainID], 'Drug':[drug_name], 'Resistant':[resistant], 'Lineage':[freschi_lineage]})], ignore_index = True)
    
        



In [230]:
print('Total number of strains processed: ' + str(len(master_df)) + '\n')
print('StrainIDs that were not processed: ' + ', '.join(catch_problems))
print('Json files without 13 drugs: ' + ', '.join(catch_problems2))
print('Lineage files without freschi2020: ' + ', '.join(catch_problems3))


Total number of strains processed: 39

StrainIDs that were not processed: 


In [232]:
drug_order = CategoricalDtype(thresholds.keys(), ordered = True)

In [234]:
master_df['Drug'] = master_df['Drug'].astype(drug_order)
master_df = master_df.sort_values(by = ['Drug', 'Lineage','Resistant'])

Unnamed: 0,ID,Drug,Resistant,Lineage
1,TEST0248,rif,1,3.1.1
27,TEST0236,rif,1,4.1.i1.1.1.1
14,TEST0246,rif,0,4.2.1.2.2.1.2
0,TEST0248,inh,1,3.1.1
26,TEST0236,inh,1,4.1.i1.1.1.1
13,TEST0246,inh,0,4.2.1.2.2.1.2
3,TEST0248,emb,1,3.1.1
29,TEST0236,emb,0,4.1.i1.1.1.1
16,TEST0246,emb,0,4.2.1.2.2.1.2
2,TEST0248,pza,0,3.1.1


In [242]:
summarized = master_df.groupby(['Drug','Lineage','Resistant']).count()
summarized = summarized.rename(columns={'ID':'NumStrains'})
summarized.reset_index(inplace=True)

In [258]:
reformat_summarized = pd.DataFrame({'Drug':[], 'Lineage':[], 'Percentage of Strains that are Resistant':[], 'Number of Resistant Strains':[], 'Number of Susceptible Strains':[]})
for drug in np.unique(summarized.Drug):
    for lineage in np.unique(summarized.Lineage):
        num_resistant = int(summarized[(summarized.Drug == drug) & (summarized.Lineage == lineage) & (summarized.Resistant == '1')].NumStrains)
        num_susceptible = int(summarized[(summarized.Drug == drug) & (summarized.Lineage == lineage) & (summarized.Resistant == '0')].NumStrains)
        temp = pd.DataFrame({'Drug':[drug], 'Lineage':[lineage], 'Percentage of Strains that are Resistant':[num_resistant * 100 / (num_resistant + num_susceptible)], 'Number of Resistant Strains':[str(num_resistant)], 'Number of Susceptible Strains':[str(num_susceptible)]})
        reformat_summarized = pd.concat([reformat_summarized, temp], ignore_index=True)
reformat_summarized

Unnamed: 0,Drug,Lineage,Percentage of Strains that are Resistant,Number of Resistant Strains,Number of Susceptible Strains
0,amk,3.1.1,100.0,1,0
1,amk,4.1.i1.1.1.1,0.0,0,1
2,amk,4.2.1.2.2.1.2,0.0,0,1
3,cap,3.1.1,100.0,1,0
4,cap,4.1.i1.1.1.1,0.0,0,1
5,cap,4.2.1.2.2.1.2,0.0,0,1
6,cip,3.1.1,100.0,1,0
7,cip,4.1.i1.1.1.1,0.0,0,1
8,cip,4.2.1.2.2.1.2,0.0,0,1
9,emb,3.1.1,100.0,1,0


In [None]:
reformat_summarized.to_csv('//home//kin672//Kira_GenTB//summarized_full.csv')