In [1]:
import glob
import os
import pandas as pd
import numpy as np
import joblib
import sklearn
import json
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

In [10]:
import joblib

model_dir = os.path.join('..','models','sklearn')

all_models = glob.glob(f'{model_dir}/Escherichia_coli/*.joblib')
print(all_models)
model_dic = []
for x in all_models:
    to = os.path.basename(x).split('_')
    meta_dic = {
        'train_site': [to[2]],
        'ML': [to[7]],
        'species': ['_'.join(to[9:10])],
        'antibiotics': [to[12]],
        'seed': [to[14]],
        'model': joblib.load(x)
    }

    model_dic.append(meta_dic)
len(model_dic)

['../models/sklearn/Escherichia_coli/Train_site_UMG-0_Test_site_UMG-0_Model_lr_Species_Escherichia_coli_Antibiotic_Ampicillin_Seed_409_model.joblib', '../models/sklearn/Escherichia_coli/Train_site_UMG-0_Test_site_UMG-0_Model_lr_Species_Escherichia_coli_Antibiotic_Ciprofloxacin_Seed_545_model.joblib', '../models/sklearn/Escherichia_coli/Train_site_UMG-0_Test_site_UMG-0_Model_lr_Species_Escherichia_coli_Antibiotic_Ciprofloxacin_Seed_164_model.joblib', '../models/sklearn/Escherichia_coli/Train_site_UMG-0_Test_site_UMG-0_Model_lr_Species_Escherichia_coli_Antibiotic_Cefotaxim_Seed_545_model.joblib', '../models/sklearn/Escherichia_coli/Train_site_UMG-0_Test_site_UMG-0_Model_lr_Species_Escherichia_coli_Antibiotic_Ciprofloxacin_Seed_480_model.joblib', '../models/sklearn/Escherichia_coli/Train_site_UMG-0_Test_site_UMG-0_Model_lr_Species_Escherichia_coli_Antibiotic_Ceftriaxone_Seed_545_model.joblib', '../models/sklearn/Escherichia_coli/Train_site_UMG-0_Test_site_UMG-0_Model_lr_Species_Escherichi

88

In [16]:
def ML_model_run(model_dic: dict, bin_filepath: str, species: str) -> pd.DataFrame:
    vec = pd.read_csv(bin_filepath, sep=' ', index_col=False, header=None).to_numpy()
    vec = vec.T

    result = pd.DataFrame()
    list_of_models = model_dic
    for model_item in list_of_models:
        model = model_item['model']
        meta_data = model_item.copy()
        del meta_data['model']
        result_row = pd.DataFrame.from_dict(meta_data)
        pred = model.predict_proba(vec)[0]
        result_row['S'] = pred[0]
        result_row['R'] = pred[1]
        result = pd.concat([result, result_row], axis = 0)

    return result


def folder_scan(raw_dir: str) -> dict:
    file_exist_dic = {}
    raw_file_path = os.path.join(raw_dir,'*','*')
    raw_file_list = glob.glob(raw_file_path)
    for filepath in raw_file_list:
        species_name, sample_number = filepath.split(os.sep)[-2:]
        if species_name not in file_exist_dic.keys():
            file_exist_dic[species_name] = set()
        file_exist_dic[species_name].add(sample_number)
    print(f'File scan done.')

    return file_exist_dic


def preprocessing(input_dir: str, output_dir: str, file_exist_dic: dict) -> None:
    pred_res = pd.DataFrame()
    for species in file_exist_dic.keys():
        raw_path_species = os.path.join(input_dir, species)
        preprocessed_path = os.path.join(output_dir, species)
        os.makedirs(preprocessed_path, exist_ok=True)
        for sample_number in file_exist_dic[species]:
            raw_path = os.path.join(raw_path_species, sample_number)
            preprocessed_filepath = os.path.join(preprocessed_path, sample_number)
            output_filename = preprocessed_filepath.rstrip('.txt.')
            output_raw_filename = f'{output_filename}_raw.csv'
            output_summary_filename = f'{output_filename}_summary.csv'

            if os.path.exists(output_raw_filename):
                print(f'Preprocessing {output_raw_filename} already exist.')
                continue

            print(f'New bin file: {raw_path} found.')
            pred_res = ML_model_run(model_dic, raw_path, species)

            try:
                pred_res = ML_model_run(model_dic, raw_path, species)
                print(f'ML prediction {preprocessed_filepath} done.')
            except:
                print(f'ML prediction of {raw_path} fail.')

            if len(pred_res) > 0:
                pred_res.sort_values('antibiotics', inplace=True)
                pred_res.to_csv(output_raw_filename)

                out_dic = []
                summary_res = pred_res.groupby('antibiotics')['S'].apply(list)
                for i_ in range(len(summary_res)):
                    row = summary_res.values[i_]
                    amname = summary_res.index[i_]
                    row = ['S' if x > 0.5 else 'R' for x in row]
                    out_dic.append({
                        'Antibiotics': amname,
                        'Resistant' : row.count('R'),
                        'Susceptible': row.count('S')
                    })
                    #print(f"{name} S: {row.count('S')}, R: {row.count('R')}")
            
                result_df = pd.DataFrame.from_dict(out_dic)
                result_df.set_index(result_df['Antibiotics'], inplace=True)
                del(result_df['Antibiotics'])
                
                result_df.to_csv(output_summary_filename)

    return


binned_dir = os.path.join('..', 'data', f'binned_{str(6000)}')
print(binned_dir)
bin_files = folder_scan(binned_dir)
print(bin_files)
results_dir = os.path.join('..', 'results')


final_result = preprocessing(binned_dir, results_dir, bin_files)

../data/binned_6000
File scan done.
{'Escherichia_coli': {'example12.txt', 'example1.txt'}}
Preprocessing ../results/Escherichia_coli/example12_raw.csv already exist.
Preprocessing ../results/Escherichia_coli/example1_raw.csv already exist.


In [29]:
pred_ress

Unnamed: 0,train_site,ML,species,antibiotics,seed,S,R
0,UMG-0,lr,Escherichia,Ampicillin,409,0.495833,0.5041665
0,UMG-0,lr,Escherichia,Ampicillin,344,0.670168,0.3298323
0,UMG-0,lr,Escherichia,Ampicillin,77,0.696526,0.3034736
0,UMG-0,lr,Escherichia,Ampicillin,188,0.460746,0.5392539
0,UMG-0,lr,Escherichia,Ampicillin,89,0.626466,0.3735339
0,UMG-0,lr,Escherichia,Ampicillin,164,0.69585,0.3041502
0,UMG-0,lr,Escherichia,Ampicillin,172,0.575343,0.4246572
0,UMG-0,lr,Escherichia,Ampicillin,35,0.59948,0.40052
0,UMG-0,lr,Escherichia,Ampicillin,480,0.560857,0.4391429
0,UMG-0,lr,Escherichia,Ampicillin,545,0.58891,0.4110903
