In [1]:
import pandas as pd
import re
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor

In [47]:
# Machine selector: decides which home directory the project paths hang off.
whorunsit = 'LiezelMac' # 'LiezelMac' | 'LiezelCluster'

if whorunsit == 'LiezelMac':
    home_dir = '/Users/ltamon'
elif whorunsit == 'LiezelCluster':
    home_dir = '/project/sahakyanlab/ltamon'
else:
    # Fail here rather than with a NameError on `home_dir` further down:
    # merely printing would leave `home_dir` undefined (or stale from an
    # earlier run of this cell in the same kernel).
    raise ValueError("The supplied <whorunsit> option is not created in the script.")

# ---- Project directories (all derived from the machine-specific home_dir) ----
wk_dir = f'{home_dir}/SahakyanLab/GenomicContactDynamics/26_PredictingCp'
csv_dir = f'{wk_dir}/z_ignore_git/out_makeTable'
out_dir = f'{wk_dir}/out_autogluon_run'

# ---- Which tables to load ----
gcb = 'min2Mb'
chrs = ['chr21']

# ---- Target column and quick-run subsample ----
label_col = 'LABEL'
subsample_size = 50  # number of rows drawn from the data for a quick run

# ---- Feature selection and model identity ----
# Columns whose names match this regex are kept as features.
feat_regex = 'grp.compl.|grp.kmer'
# The model id is the regex with every non-letter removed, followed by the
# subsample size, so that it is safe to use inside a directory name.
model_id = re.sub('[^a-zA-Z]', '', feat_regex) + '_' + str(subsample_size)

model_path = f'{out_dir}/agModel_{model_id}'
metric = 'accuracy'

# ---- fit() options ----
time_limit = 2 * 60  # seconds (2 min)
#holdout_frac

# Bagging / stack ensembling - if enabled, do not provide tuning_data.
#num_bag_folds=5 # number of folds used for k-fold bagging
#num_bag_sets=1, # how many times the k-fold bagging process is repeated to further reduce variance (increasing this may further boost accuracy but will substantially increase training times, inference latency, and memory/disk usage)
#num_stack_levels=1
#auto_stack=True # AutoGluon will select the bagging/stacking numbers itself
# Specifying presets='best_quality' in fit() simply sets auto_stack=True.
presets = 'good_quality'
problem_type = 'multiclass'  # options: 'binary', 'multiclass', 'regression', 'quantile'

# ---- Hyperparameter optimisation (disabled) ----
#num_trials = 5  # try at most 5 different hyperparameter configurations for each type of model
#search_strategy = 'auto'  # to tune hyperparameters using random search routine with a local scheduler
#hyperparameter_tune_kwargs = {  # HPO is not performed unless hyperparameter_tune_kwargs is specified
#    'num_trials': num_trials,
#    'scheduler' : 'local',
#    'searcher': search_strategy,
#}

In [21]:
def concat_chrcsv(csv_dir, gcb, chrs):
    """Read the per-chromosome tables and stack them row-wise.

    One CSV is read per chromosome from
    ``<csv_dir>/<gcb>_<chromosome>_MLtbl.csv``.

    Parameters
    ----------
    csv_dir : str
        Directory holding the CSV files.
    gcb : str
        File-name prefix (first component of each CSV name).
    chrs : str or iterable of str
        Chromosome name(s), e.g. ``['chr21']``. A single string is treated
        as a one-element list.

    Returns
    -------
    pandas.DataFrame
        The tables concatenated in the order given by ``chrs``. Each table
        keeps its own row index, so index labels repeat across chromosomes.

    Raises
    ------
    ValueError
        If ``chrs`` is empty.
    """
    # A bare string would otherwise be iterated character by character and
    # produce nonsensical file names.
    if isinstance(chrs, str):
        chrs = [chrs]

    tables = [
        pd.read_csv(csv_dir + '/' + gcb + '_' + chrom + '_MLtbl.csv')
        for chrom in chrs
    ]
    if not tables:
        raise ValueError("No chromosomes supplied in <chrs>; nothing to read.")

    return pd.concat(tables)

def choose_features(df, feat_regex, label_col):
    """Return a copy of ``df`` restricted to the chosen features and the label.

    A column is kept when its name matches ``feat_regex`` (``re.search``
    semantics, i.e. a match anywhere in the name) or when it is exactly
    ``label_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the features and the label column.
    feat_regex : str
        Regular expression selecting feature columns by name.
    label_col : str
        Name of the label column; compared literally, not as a regex.

    Returns
    -------
    pandas.DataFrame
        New frame with the kept columns in their original order. The input
        frame is not modified.
    """
    feat_pattern = re.compile(feat_regex)

    # The label is matched by equality so that regex metacharacters in its
    # name are harmless and look-alike columns (e.g. 'LABEL_raw') are not
    # silently kept as features.
    keep = [
        col == label_col or bool(feat_pattern.search(str(col)))
        for col in df.columns
    ]

    # .copy() gives the caller an independent frame instead of mutating the
    # input, so re-running the calling cell stays safe.
    return df.loc[:, keep].copy()
    
def _swap_ij_name(name):
    """Return ``name`` with its '.i_' marker turned into '.j_' or vice versa."""
    if not isinstance(name, str):
        return name
    if '.i_' in name:
        return name.replace('.i_', '.j_')
    if '.j_' in name:
        return name.replace('.j_', '.i_')
    return name


def switch_contact(df):
    """Append a copy of ``df`` in which the '.i_' and '.j_' columns trade places.

    Every column whose name contains '.i_' exchanges its values with the
    column of the same name carrying '.j_' instead; all other columns are
    repeated unchanged. The mirrored rows are appended below the original
    ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Table in which each '.i_' column has a '.j_' partner.

    Returns
    -------
    pandas.DataFrame
        Frame with twice as many rows as ``df`` and the same column order.
        Row index labels of ``df`` appear twice because the index is not
        reset.

    Raises
    ------
    ValueError
        If an '.i_' or '.j_' column has no partner column.
    """
    features = list(df.columns)
    mirrored = [_swap_ij_name(feat) for feat in features]

    # Pair columns by name rather than by position so a different ordering
    # of the i and j columns cannot silently mislabel the mirrored rows.
    unpaired = sorted(set(mirrored) - set(features))
    if unpaired:
        raise ValueError(
            "Columns without an i/j partner; missing: " + ", ".join(map(str, unpaired))
        )

    # rename() returns a new frame; set_axis(..., inplace=False) is avoided
    # because the `inplace` argument was removed in pandas 2.0.
    df_switch = df.rename(columns=_swap_ij_name)
    # Restore the original column order so both halves line up in concat.
    df_switch = df_switch[features]

    return pd.concat([df, df_switch])

In [32]:
# Load the table of every chromosome in `chrs` from `csv_dir` into one frame.
train_data = concat_chrcsv(csv_dir, gcb, chrs)

In [33]:
# Preview the first rows of the table as loaded, before feature selection.
train_data.head()

Unnamed: 0,LABEL,grp.compl.ij_anvdist,grp.compl.ij_CIIkmer,grp.compl.ij_CIIG,grp.compl.ij_CIIalign,grp.anv.i_nA,grp.anv.i_nC,grp.anv.i_nG,grp.anv.i_nT,grp.anv.i_zetaA,...,grp.kmer3.j_GTA,grp.kmer3.j_TAA,grp.kmer3.j_TCA,grp.kmer1.i_A,grp.kmer1.i_C,grp.kmer1.j_A,grp.kmer1.j_C,grp.GC.i_GC,grp.GC.j_GC,grp.GC.ij_meanGC
0,6,2234.49531,-1.2627,-2e-05,-0.5139,13553,6564,7287,12596,20127.964583,...,865,1546,1679,26149,13851,24640,15360,0.346275,0.384,0.365138
1,8,2577.97879,-1.2975,-2e-05,-0.5202,13593,7501,6985,11921,20725.681748,...,865,1546,1679,25514,14486,24640,15360,0.36215,0.384,0.373075
2,9,1756.299497,-1.4243,-1.9e-05,-0.527,12295,7877,8041,11787,19151.150224,...,865,1546,1679,24082,15918,24640,15360,0.39795,0.384,0.390975
3,4,3603.515907,-1.5271,-2e-05,-0.52535,9841,9649,8996,11514,19382.269282,...,865,1546,1679,21355,18645,24640,15360,0.466125,0.384,0.425063
4,8,4365.757094,-1.4998,-2e-05,-0.527725,10786,9359,9363,10492,18359.698869,...,865,1546,1679,21278,18722,24640,15360,0.46805,0.384,0.426025


In [34]:
# Keep only the columns selected by `feat_regex`, plus the label column.
train_data = choose_features(train_data, feat_regex, label_col)

In [35]:
# Preview the table again to check which columns survived the selection.
train_data.head()

Unnamed: 0,LABEL,grp.compl.ij_anvdist,grp.compl.ij_CIIkmer,grp.compl.ij_CIIG,grp.compl.ij_CIIalign,grp.kmer3.i_AAA,grp.kmer3.i_AAC,grp.kmer3.i_AAG,grp.kmer3.i_AAT,grp.kmer3.i_ACA,...,grp.kmer3.j_GCA,grp.kmer3.j_GCC,grp.kmer3.j_GGA,grp.kmer3.j_GTA,grp.kmer3.j_TAA,grp.kmer3.j_TCA,grp.kmer1.i_A,grp.kmer1.i_C,grp.kmer1.j_A,grp.kmer1.j_C
0,6,2234.49531,-1.2627,-2e-05,-0.5139,3953,1154,1644,2737,1727,...,1175,792,1092,865,1546,1679,26149,13851,24640,15360
1,8,2577.97879,-1.2975,-2e-05,-0.5202,3534,1281,1636,2479,1752,...,1175,792,1092,865,1546,1679,25514,14486,24640,15360
2,9,1756.299497,-1.4243,-1.9e-05,-0.527,3217,1088,1416,2268,1503,...,1175,792,1092,865,1546,1679,24082,15918,24640,15360
3,4,3603.515907,-1.5271,-2e-05,-0.52535,2254,1174,1380,1450,1676,...,1175,792,1092,865,1546,1679,21355,18645,24640,15360
4,8,4365.757094,-1.4998,-2e-05,-0.527725,2223,1040,1374,1431,1663,...,1175,792,1092,865,1546,1679,21278,18722,24640,15360


In [36]:
# Append the i/j-swapped copy of every row. This doubles the row count, and
# because `train_data` is reassigned, re-running this cell doubles it again.
train_data = switch_contact(train_data)

In [37]:
# Row count after appending the i/j-swapped rows.
len(train_data)

623942

In [38]:
# Wrap the frame in AutoGluon's TabularDataset before sampling and fitting.
train_data = TabularDataset(train_data)

In [39]:
# Draw `subsample_size` rows with a fixed seed so the quick-run subset is
# reproducible. Re-running this cell samples from the already reduced data.
train_data = train_data.sample(n=subsample_size, random_state=0)

In [40]:
# Preview the subsample; the index labels show which source rows were drawn.
train_data.head()

Unnamed: 0,LABEL,grp.compl.ij_anvdist,grp.compl.ij_CIIkmer,grp.compl.ij_CIIG,grp.compl.ij_CIIalign,grp.kmer3.i_AAA,grp.kmer3.i_AAC,grp.kmer3.i_AAG,grp.kmer3.i_AAT,grp.kmer3.i_ACA,...,grp.kmer3.j_GCA,grp.kmer3.j_GCC,grp.kmer3.j_GGA,grp.kmer3.j_GTA,grp.kmer3.j_TAA,grp.kmer3.j_TCA,grp.kmer1.i_A,grp.kmer1.i_C,grp.kmer1.j_A,grp.kmer1.j_C
280644,2,7816.168865,-1.7715,-1.8e-05,-0.5458,1923,910,1339,1134,1548,...,1007,530,976,1031,2229,1623,19286,20714,26438,13562
309448,6,2622.599962,-0.953,-2.4e-05,-0.51745,3261,1126,1550,1744,1599,...,1228,1065,1293,815,1527,1507,22920,17080,22839,17161
70260,2,4613.782877,-1.0278,-2.2e-05,-0.521375,3003,1161,1499,2096,1591,...,966,688,989,1036,2147,1616,23630,16370,25433,14567
24188,12,1539.529464,-0.9476,-2.3e-05,-0.514925,3717,1300,1587,2530,1632,...,1000,697,1134,999,1989,1599,25603,14397,25134,14866
296962,3,3606.961193,-1.061,-2.3e-05,-0.525775,3093,1208,1651,2191,1620,...,1302,1031,1256,816,1407,1500,24386,15614,22642,17358


In [None]:
# Build the predictor from the configuration values, then train it on the
# subsample within `time_limit` seconds; models are written to `model_path`.
predictor = TabularPredictor(
    label=label_col,
    eval_metric=metric,
    path=model_path,
    problem_type=problem_type,
).fit(
    train_data,
    presets=presets,
    time_limit=time_limit,
)

Presets specified: ['good_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=5, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 120s
AutoGluon will save models to "/Users/ltamon/SahakyanLab/GenomicContactDynamics/26_PredictingCp/out_autogluon_run/agModel_grpcomplgrpkmer_50/"
AutoGluon Version:  0.5.2
Python Version:     3.9.12
Operating System:   Darwin
Train Data Rows:    50
Train Data Columns: 72
Label Column: LABEL
Preprocessing data ...
Train Data Class Count: 16
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    7360.29 MB
	Train Data (Original)  Memory Usage: 0.03 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Genera

Fitting model: LightGBMXT_BAG_L1 ... Training model for up to 115.89s of the 115.89s of remaining time.
	Fitting 5 child models (S1F1 - S1F5) | Fitting with SequentialLocalFoldFittingStrategy
