In [20]:
import glob
import os
import pandas as pd
import numpy as np
import joblib
import sklearn
import json
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

In [25]:
model_dir = '../models/sklearn/'
train_result_dir = '../models/sklearn/train_results/'

model_dic = {}
species = glob.glob(f'{model_dir}/*')
for i in species:
    if not(os.path.isdir(i)):
        continue
    model_dic[os.path.basename(i)] = []
    all_models = glob.glob(f'{model_dir}/{os.path.basename(i)}/*.joblib')
    for x in all_models:
        to = os.path.basename(x).split('_')
        meta_dic = {
            'train_site': [to[2]],
            'ML': [to[7]],
            'species': ['_'.join(to[9:10])],
            'antibiotics': [to[12]],
            'seed': [to[14]],
            'model': joblib.load(x)
        }
        """
        train_result_path = os.path.join(train_result_dir, meta_dic['ML'][0], os.path.basename(x).rstrip('_model.joblib')+'.json')
        if os.path.exists(train_result_path):
            try:
                train_result_path = os.path.join(train_result_dir, meta_dic['ML'][0], os.path.basename(x).rstrip('_model.joblib')+'.json')
                with open(train_result_path, 'r') as openfile:
                    # Reading from json file
                    json_object = json.load(openfile)
                meta_dic['test_prob'] = [np.array(json_object['y_score'])]
                meta_dic['test_true'] = [np.array(json_object['y_test'])]
            except:
                pass
        """
        model_dic[os.path.basename(i)].append(meta_dic)

#model_dic

In [53]:
def ML_model_run(model_dic: dict, bin_filepath: str, species: str) -> pd.DataFrame:
    vec = pd.read_csv(bin_filepath, sep=' ', index_col=False, header=None).to_numpy()
    vec = vec.T

    result = pd.DataFrame()
    list_of_models = model_dic[species]
    for model_item in list_of_models:
        model = model_item['model']
        meta_data = model_item.copy()
        del meta_data['model']
        result_row = pd.DataFrame.from_dict(meta_data)
        pred = model.predict_proba(vec)[0]
        result_row['S'] = pred[0]
        result_row['R'] = pred[1]
        result = pd.concat([result, result_row], axis = 0)
        
    
        #dist_x = pd.DataFrame(meta_data['test_prob'][0])
        #dist_x_true = meta_data['test_true'][0]
        #dist_x = dist_x[dist_x_true==1]
        
        #dist_x['label'] = dist_x_true
        #sns.kdeplot(data=dist_x[dist_x_true==0].iloc[:,0], color="blue")
        #sns.kdeplot(data=dist_x[dist_x_true==1].iloc[:,1], color="orange")

        #sns.ecdfplot(data=dist_x.iloc[:,0], color='black', ylim=(0.5,1))
        #plt.axvline(x=pred[0], color='blue')
        #print(stats.ttest_1samp(dist_x[dist_x_true==0].iloc[:,0], pred[0]))
        #print(stats.ttest_1samp(dist_x.iloc[:,0], pred[0]))
        
        #sns.ecdfplot(data=dist_x.iloc[:,1], color='red')
        #plt.axvline(x=pred[1], color='orange')
        #print(stats.ttest_1samp(dist_x[dist_x_true==1].iloc[:,1], pred[1]))
        #print(stats.ttest_1samp(dist_x.iloc[:,1], pred[1]))

    return result


def folder_scan(raw_dir: str) -> dict:
    file_exist_dic = {}
    raw_file_path = os.path.join(raw_dir, '*', '*')
    raw_file_list = glob.glob(raw_file_path)
    for filepath in raw_file_list:
        species_name, sample_number = filepath.split(os.sep)[-2:]
        if species_name not in file_exist_dic.keys():
            file_exist_dic[species_name] = set()
        file_exist_dic[species_name].add(sample_number)
    print(f'File scan done.')
    
    return file_exist_dic


def preprocessing(input_dir: str, output_dir: str, file_exist_dic: dict) -> None:
    for species in file_exist_dic.keys():
        raw_path_species = os.path.join(input_dir, species)
        preprocessed_path = os.path.join(output_dir, species)
        os.makedirs(preprocessed_path, exist_ok=True)
        for sample_number in file_exist_dic[species]:
            raw_path = os.path.join(raw_path_species, sample_number)
            preprocessed_filepath = os.path.join(preprocessed_path, sample_number)

            if os.path.exists(preprocessed_filepath):
                #print(f'Preprocessing {preprocessed_filepath} already exist.')
                continue
            
            print(f'New bin file: {raw_path} found.')
            pred_res = ML_model_run(model_dic, raw_path, species)

            try:
                pred_res = ML_model_run(model_dic, raw_path, species)
                print(f'ML prediction {preprocessed_filepath} done.')
            except:
                print(f'ML prediction of {raw_path} fail.')
    
    pred_res.sort_values('antibiotics', inplace=True)
    summary_res = pred_res.groupby('antibiotics')['S'].apply(list)
    print(summary_res)
    for row in summary_res:
        print(row)
    #print( [( ['S' if float(x) > 0.5 else 'R' for x in summary_res.loc[:,row]]) for row in summary_res.index] )
    return pred_res


bin_size = 6000
binned_dir = os.path.join('..', 'data', f'binned_{str(bin_size)}')
bin_files = folder_scan(binned_dir)

results_dir = os.path.join('..', 'results')


pred_ress = preprocessing(binned_dir, results_dir, bin_files)

File scan done.
New bin file: ../data/binned_6000/Escherichia_coli/example12.txt found.
ML prediction ../results/Escherichia_coli/example12.txt done.
New bin file: ../data/binned_6000/Escherichia_coli/example1.txt found.
ML prediction ../results/Escherichia_coli/example1.txt done.
antibiotics
Ampicillin                            [0.49583347063980465, 0.6701676951241453, 0.6965263613917103, 0.46074609453785864, 0.6264661239664027, 0.6958497588585764, 0.5753427677077021, 0.5994799867055156, 0.5608570955547223, 0.5889096708132662, 0.6452302613492612]
Ampicillin+Sulbactam                       [0.804577760002742, 0.5637992750911969, 0.6185921216000838, 0.6550229164790657, 0.7501347633077858, 0.66129137740947, 0.5477104583999055, 0.5807139766172029, 0.6087176377382251, 0.6127956032185189, 0.7724429966072943]
Cefotaxim                               [0.6952104543512315, 0.6489127026734942, 0.7877745676830163, 0.7955139187582577, 0.7720968525757789, 0.7564575888935389, 0.6162724310452996, 0.6

In [29]:
pred_ress

Unnamed: 0,train_site,ML,species,antibiotics,seed,S,R
0,UMG-0,lr,Escherichia,Ampicillin,409,0.495833,0.5041665
0,UMG-0,lr,Escherichia,Ampicillin,344,0.670168,0.3298323
0,UMG-0,lr,Escherichia,Ampicillin,77,0.696526,0.3034736
0,UMG-0,lr,Escherichia,Ampicillin,188,0.460746,0.5392539
0,UMG-0,lr,Escherichia,Ampicillin,89,0.626466,0.3735339
0,UMG-0,lr,Escherichia,Ampicillin,164,0.69585,0.3041502
0,UMG-0,lr,Escherichia,Ampicillin,172,0.575343,0.4246572
0,UMG-0,lr,Escherichia,Ampicillin,35,0.59948,0.40052
0,UMG-0,lr,Escherichia,Ampicillin,480,0.560857,0.4391429
0,UMG-0,lr,Escherichia,Ampicillin,545,0.58891,0.4110903
