In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys, os

# reload() is called at the top of most cells below but was never imported.
# It is a builtin only on Python 2; on Python 3 it lives in importlib, so a
# fresh "Restart & Run All" would otherwise stop with a NameError.
try:
    from importlib import reload  # Python 3.4+
except ImportError:
    pass  # Python 2: importlib has no reload(), the builtin is used instead

# Put ../scripts on the import path so that ml_gwas_dev can be imported
# (presumably that is where the module lives). The membership check keeps
# sys.path from growing each time this cell is re-run.
if '../scripts' not in sys.path:
    sys.path.append('../scripts')
import ml_gwas_dev

In [4]:
''' Build the feature table and AMR labels from the gene, allele, and AMR binary matrices (SA sample data) '''
antibiotic = 'ciprofloxacin'

# The helper prints the shapes of the non-core gene table, the core-gene
# allele table and the combined feature table (see the cell output).
df_features, df_amr = ml_gwas_dev.__prepare_amr_data__(
    gene_path='../sample_data/strain_by_gene_SA.csv.gz',
    allele_table_path='../sample_data/strain_by_allele_binary_SA.csv.gz',
    amr_path='../sample_data/strain_by_amr_SA.csv.gz',
    antibiotic=antibiotic,
    core_cutoff=10)

Non-core genes: (288, 2968)
Core-gene alleles: (288, 20390)
Feature table: (288, 23358)


In [29]:
%%time
''' Baseline without parallel processing: 500-model RSE on the S. aureus feature table '''
# Re-import the module so the timing reflects ml_gwas_dev as currently saved on disk.
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=500)
ensemble.fit(
    df_features.values,
    df_amr.values)

Iteration 50
Iteration 100
Iteration 150
Iteration 200
Iteration 250
Iteration 300
Iteration 350
Iteration 400
Iteration 450
Iteration 500
CPU times: user 50.1 s, sys: 116 ms, total: 50.3 s
Wall time: 50.2 s


In [9]:
''' joblib with prefer=processes: 500 models, S. aureus features, cores=8 '''
# The backend preference is not set in this cell; it comes from whatever
# ml_gwas_dev contains at reload time (the output reports LokyBackend).
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=500)
ensemble.fit_parallel(
    df_features.values,
    df_amr.values,
    cores=8)

Sampling instances and features...
Training modesl...


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    5.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   19.9s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   37.9s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   41.5s finished


In [12]:
''' joblib with prefer=threads: 500 models, S. aureus features, cores=8 '''
# Same call as the prefer=processes run; only the reloaded module differs
# (the output reports ThreadingBackend).
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=500)
ensemble.fit_parallel(
    df_features.values,
    df_amr.values,
    cores=8)

Sampling instances and features...
Training modesl...


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   10.4s
[Parallel(n_jobs=8)]: Done 349 tasks      | elapsed:   22.9s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   31.0s finished


In [28]:
''' joblib with prefer=threads: 500 models, S. aureus features, cores=16 '''
# Doubles the worker count relative to the 8-core threaded run.
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=500)
ensemble.fit_parallel(
    df_features.values,
    df_amr.values,
    cores=16)

Sampling instances and features...
Training models...


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    2.9s
[Parallel(n_jobs=16)]: Done 130 tasks      | elapsed:   14.0s
[Parallel(n_jobs=16)]: Done 333 tasks      | elapsed:   28.5s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:   36.8s finished


In [30]:
''' joblib with prefer=threads: 500 models, S. aureus features, cores=1 '''
# Single-worker run through the parallel code path (the output reports
# SequentialBackend), useful as a reference against the plain fit().
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=500)
ensemble.fit_parallel(
    df_features.values,
    df_amr.values,
    cores=1)

Sampling instances and features...
Training models...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   45.7s finished


In [11]:
# Tabulate the ensemble's coefficients with one row per feature name and one
# column per model (Model_1 ... Model_500 in the output below).
# NOTE(review): `ensemble` is whichever fit ran last in the kernel; this cell
# was executed as In [11], i.e. between the benchmark cells - confirm which
# fit it is meant to describe. The meaning of `reduced` and `order` is defined
# in ml_gwas_dev and is not visible here.
df_coefs = ensemble.get_coefficient_matrix(
    feature_names=df_features.columns,
    order=-1,
    reduced=True)
df_coefs.head(10)

Unnamed: 0,Model_1,Model_2,Model_3,Model_4,Model_5,Model_6,Model_7,Model_8,Model_9,Model_10,...,Model_491,Model_492,Model_493,Model_494,Model_495,Model_496,Model_497,Model_498,Model_499,Model_500
Cluster_126_Allele_1,,-0.616476,-1.602933,-0.636803,,-0.712542,,-0.467172,,-0.073412,...,,,,,,,,,,
Cluster_279_Allele_0,0.0,,0.0,-0.831678,,0.0,0.0,-0.962353,,,...,,-0.83887,-0.213808,,,,,-1.598456,-0.698527,
Cluster_2243_Allele_0,,0.0,0.0,,,-0.225536,,0.0,-0.720072,-1.072115,...,,,-0.478861,-1.233946,0.0,,,0.0,,-0.875676
Cluster_1696_Allele_0,,-0.189193,0.0,,-0.448464,,-0.069117,,0.0,,...,,,,,,,,,0.0,
Cluster_2617_Allele_0,-0.487544,,,,-0.480344,-0.264983,,,,,...,,,-0.371539,0.0,,-0.561587,0.0,,0.0,
Cluster_1028_Allele_0,,,,0.0,,0.0,,0.0,,,...,-0.443479,-0.284452,-0.228301,,,0.0,-0.001574,0.0,,
Cluster_1780_Allele_0,0.0,0.0,0.0,,,,,,,,...,,,,0.0,-1.098092,,-0.518926,-0.000774,-0.051232,
Cluster_4362_Allele_0,,-0.196654,0.0,0.0,,0.0,,0.0,0.0,,...,0.0,0.0,,0.0,,-0.529948,,,,0.0
Cluster_168_Allele_1,,-0.153867,0.0,0.0,-0.06289,,-0.017804,0.0,-0.389885,,...,0.0,,0.0,-0.117227,0.0,,,0.0,,
Cluster_92_Allele_22,,,,0.0,,,,0.0,0.0,0.0,...,,0.0,,,0.0,,,,0.0,


### Try larger P. aeruginosa dataset

In [33]:
''' Build the feature table and AMR labels from the gene, allele, and AMR binary matrices (PA data, levofloxacin) '''
# Same loader as for the sample data, pointed at the data_PA directory.
df_features_PA, df_amr_PA = ml_gwas_dev.__prepare_amr_data__(
    gene_path='../../../data_PA/strain_by_gene_PA.csv',
    allele_table_path='../../../data_PA/strain_by_allele_binary_PA.csv',
    amr_path='../../../data_PA/strain_by_amr_PA.csv',
    antibiotic='levofloxacin',
    core_cutoff=10)

Non-core genes: (456, 23155)
Core-gene alleles: (456, 155350)
Feature table: (456, 178505)


In [37]:
%%timeit
# Serial reference on the P. aeruginosa table: 16 models, timed by %%timeit.
# NOTE(review): the labels are passed as df_amr_PA itself (no .values), unlike
# the S. aureus cells - confirm fit() treats both forms the same way.
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=16)
ensemble.fit(
    df_features_PA.values,
    df_amr_PA)

1 loop, best of 3: 26.9 s per loop


In [38]:
''' joblib with prefer=threads: 16 models, P. aeruginosa features, cores=16 '''
# One model per worker; the output reports ThreadingBackend.
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=16)
ensemble.fit_parallel(
    df_features_PA.values,
    df_amr_PA,
    cores=16)

Sampling instances and features...
Training models...


[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   3 out of  16 | elapsed:   22.2s remaining:  1.6min
[Parallel(n_jobs=16)]: Done  12 out of  16 | elapsed:   22.2s remaining:    7.4s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:   23.0s finished


In [40]:
''' joblib with prefer=processes: 16 models, P. aeruginosa features, cores=16 '''
# Same call as the threaded run; the reloaded module now selects processes
# (the output reports LokyBackend).
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=16)
ensemble.fit_parallel(
    df_features_PA.values,
    df_amr_PA,
    cores=16)

Sampling instances and features...
Training models...


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   3 out of  16 | elapsed:   20.9s remaining:  1.5min
[Parallel(n_jobs=16)]: Done  12 out of  16 | elapsed:   25.4s remaining:    8.5s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:   27.0s finished


In [42]:
''' joblib with CPU affinity reset: 16 models, P. aeruginosa features, cores=16 '''
# NOTE(review): the affinity reset itself is not in this cell - presumably it
# was added to ml_gwas_dev before this reload; confirm against the module.
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=16)
ensemble.fit_parallel(
    df_features_PA.values,
    df_amr_PA,
    cores=16)

Sampling instances and features...
Training models...


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   3 out of  16 | elapsed:   28.2s remaining:  2.0min
[Parallel(n_jobs=16)]: Done  12 out of  16 | elapsed:   32.4s remaining:   10.8s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:   34.3s finished


In [43]:
''' joblib: 16 models, P. aeruginosa features, cores=16 '''
# Re-run after another reload of ml_gwas_dev (the output reports LokyBackend).
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=16)
ensemble.fit_parallel(
    df_features_PA.values,
    df_amr_PA,
    cores=16)

Sampling instances and features...
Training models...


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   3 out of  16 | elapsed:   24.2s remaining:  1.7min
[Parallel(n_jobs=16)]: Done  12 out of  16 | elapsed:   28.4s remaining:    9.5s
[Parallel(n_jobs=16)]: Done  16 out of  16 | elapsed:   30.3s finished


In [44]:
# Scale-up check: 32 models (two per worker) on the P. aeruginosa table.
# No reload here, so the module state from the previous cell is reused.
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=32)
ensemble.fit_parallel(
    df_features_PA.values,
    df_amr_PA,
    cores=16)

Sampling instances and features...
Training models...


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 out of  32 | elapsed:   47.0s remaining:   36.5s
[Parallel(n_jobs=16)]: Done  32 out of  32 | elapsed:   49.3s finished


In [45]:
# Scale-up check: 64 models (four per worker) on the P. aeruginosa table.
# No reload here, so the module state from the previous cells is reused.
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=64)
ensemble.fit_parallel(
    df_features_PA.values,
    df_amr_PA,
    cores=16)

Sampling instances and features...
Training models...


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:   55.9s
[Parallel(n_jobs=16)]: Done  64 out of  64 | elapsed:  1.8min finished


In [49]:
''' joblib + CSR sparsify: 64 models, P. aeruginosa features, cores=16 '''
# The call is unchanged from the previous 64-model run; the sparsification is
# not visible in this cell, so it presumably happens inside the reloaded
# ml_gwas_dev - confirm against the module.
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=64)
ensemble.fit_parallel(
    df_features_PA.values,
    df_amr_PA,
    cores=16)

Sampling instances and features...
Training models...


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    7.2s
[Parallel(n_jobs=16)]: Done  64 out of  64 | elapsed:   13.7s finished


## Revisiting with sparse matrices

In [50]:
''' S. aureus dataset: 500 models, cores=8 '''
# Repeats the original 500-model, 8-core configuration so the elapsed time is
# directly comparable with the earlier joblib runs on the same data.
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=500)
ensemble.fit_parallel(
    df_features.values,
    df_amr.values,
    cores=8)

Sampling instances and features...
Training models...


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done 108 tasks      | elapsed:    2.9s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   11.1s finished


In [51]:
''' P. aeruginosa dataset: full 500 models, cores=16 '''
# First full-size (500-model) run on the P. aeruginosa table in this notebook.
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=500)
ensemble.fit_parallel(
    df_features_PA.values,
    df_amr_PA,
    cores=16)

Sampling instances and features...
Training models...


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    4.8s
[Parallel(n_jobs=16)]: Done 130 tasks      | elapsed:   21.5s
[Parallel(n_jobs=16)]: Done 333 tasks      | elapsed:   43.9s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:   59.5s finished


In [53]:
''' E. coli dataset: build the feature table and AMR labels (ciprofloxacin) '''
# Same loader as for the other organisms; note the gene table used here is the
# "filtered" file in data_EC.
df_features_EC, df_amr_EC = ml_gwas_dev.__prepare_amr_data__(
    gene_path='../../../data_EC/strain_by_gene_filtered_EC.csv',
    allele_table_path='../../../data_EC/strain_by_allele_binary_EC.csv',
    amr_path='../../../data_EC/strain_by_amr_EC.csv',
    antibiotic='ciprofloxacin',
    core_cutoff=10)

Non-core genes: (1588, 37162)
Core-gene alleles: (1588, 152264)
Feature table: (1588, 189426)


In [54]:
''' E. coli dataset: trial run with 64 models, cores=16 '''
# Small ensemble first, to gauge the cost before the full 500-model fit.
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=64)
ensemble.fit_parallel(
    df_features_EC.values,
    df_amr_EC,
    cores=16)

Sampling instances and features...
Training models...


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:   13.5s
[Parallel(n_jobs=16)]: Done  64 out of  64 | elapsed:   26.0s finished


In [56]:
''' E. coli dataset: original ensemble size of 500 models, cores=16 '''
# Full-size run; `ensemble` from this cell is what the shape check below inspects.
reload(ml_gwas_dev)
ensemble = ml_gwas_dev.RSE(
    bootstrap_features=0.5,
    bootstrap_instances=0.8,
    num_models=500)
ensemble.fit_parallel(
    df_features_EC.values,
    df_amr_EC,
    cores=16)

Sampling instances and features...
Training models...


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:   13.2s
[Parallel(n_jobs=16)]: Done 130 tasks      | elapsed:   57.5s
[Parallel(n_jobs=16)]: Done 333 tasks      | elapsed:  2.2min
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:  3.1min finished


In [60]:
# Inspect the shape of the fitted ensemble's selected_features attribute.
# The recorded output (189426, 500) matches the E. coli feature count and the
# 500 models of the preceding fit.
ensemble.selected_features.shape

(189426, 500)