In [78]:
import pandas as pd
import re
import numpy as np
import math
from autogluon.tabular import TabularDataset, TabularPredictor

In [79]:
# Selects which machine the notebook runs on; this decides the home directory
# that every other path below is built from.
whorunsit = 'LiezelMac'  # 'LiezelMac' | 'LiezelCluster'

if whorunsit == 'LiezelMac':
    home_dir = '/Users/ltamon'
elif whorunsit == 'LiezelCluster':
    home_dir = '/project/sahakyanlab/ltamon'
else:
    # Fail fast: merely printing here would leave home_dir undefined (or stale
    # from a previous kernel run) and the path construction below would fail
    # with an unrelated NameError or silently point at the wrong machine.
    raise ValueError("The supplied <whorunsit> option is not created in the script.")

# Project working directory, input directory of per-chromosome CSV tables,
# and output directory for trained models.
wk_dir = home_dir + '/SahakyanLab/GenomicContactDynamics/26_PredictingCp'
csv_dir = wk_dir + '/z_ignore_git/out_makeTable'
#csv_dir = wk_dir + '/out_makeTable'
out_dir = wk_dir + '/out_autogluon_run'

In [80]:
# Stage switches: each one gates an `if` cell further down the notebook.
generate_train_data = True  # build the training table from the per-chromosome CSVs
fit_model = False  # construct and fit the AutoGluon TabularPredictor

In [81]:
# Tag that prefixes the input CSV file names and the generated ids.
gcb = 'min2Mb'

# Chromosomes whose '<gcb>_<chr>_MLtbl.csv' tables are loaded.
chrs = ['chr22']

# Column-name patterns to keep, combined into one regex alternation.
# The '.' characters are regex wildcards here (they are not escaped).
feat_regex = '|'.join(['grp.compl.', 'grp.kmer3.', 'grp.kmer1.', 'grp.GC.'])

In [82]:
# Whether to subsample rows per label group before building the training table.
do_subsample = True

# percs[i] is the sampling percentage applied to groups[i].
# sampleGroupInSeries() divides this value by 100, so 0.001 means 0.001 % of
# the group's rows (rounded up to at least one row for a non-empty group).
percs = np.repeat([0.001,0.001], [18,3])
groups = list(range(1,22))

# Free-text tag embedded in the output CSV file name.
# NOTE(review): the tag reads "5perc..." but every entry of `percs` is 0.001 -
# confirm which of the two is intended before trusting the file name.
sampling_id = '5percExceptgrthn18'
seed_val = 234

# The sampler indexes `percs` by group position; catch a mismatch here rather
# than as an IndexError (or silently ignored trailing percentages) later on.
if len(percs) != len(groups):
    raise ValueError(
        "percs and groups must have the same length, got "
        + str(len(percs)) + " and " + str(len(groups)) + "."
    )

In [83]:
# Column holding the target, the metric to optimise and the task type; all
# three are handed to TabularPredictor in the fit cell.
label_col = 'LABEL'
metric = 'accuracy'
problem_type = 'multiclass' # (options: 'binary', 'multiclass', 'regression', 'quantile')

presets = 'good_quality'
time_limit = 60 # seconds
label_count_threshold = 2 # For multi-class classification problems, this is the minimum number of times a label must appear in dataset in order to be considered an output class.

# The fit cell passes ag_args_fit={'num_gpus': num_gpus}, but no cell defined
# this name, so a fresh-kernel run with fit_model=True died with a NameError.
# 0 requests CPU-only training.
# NOTE(review): raise this on a GPU machine if GPU training is wanted.
num_gpus = 0

#holdout_frac
# Bagging/stack ensembling - if enabled, don't provide tuning_data
#num_bag_folds=5 # how many times the k-fold bagging process is repeated to further reduce variance (increasing this may further boost accuracy but will substantially increase training times, inference latency, and memory/disk usage)
#num_bag_sets=1, 
#num_stack_levels=1 
#auto_stack=True # Autogluon will select bagging/stacking numbers
# specifying presets='best_quality' in fit() simply sets auto_stack=True

#num_trials = 5  # try at most 5 different hyperparameter configurations for each type of model
#search_strategy = 'auto'  # to tune hyperparameters using random search routine with a local scheduler
#hyperparameter_tune_kwargs = {  # HPO is not performed unless hyperparameter_tune_kwargs is specified
#    'num_trials': num_trials,
#    'scheduler' : 'local',
#    'searcher': search_strategy,
#}

In [84]:
# Identifier made from the chromosome names exactly as configured, i.e. taken
# before the chr23 -> chrX renaming below.
chrs_id = ''.join(chrs)

# Rename the first 'chr23' entry of the list (in place) to 'chrX'.
if 'chr23' in chrs:
    chrs[chrs.index('chr23')] = 'chrX'
else:
    print("No chr23 -> chrX.")

# Feature regex reduced to its alphanumeric characters so it can be embedded
# in file names.
feat_regex_id = re.sub('[^a-zA-Z0-9]', '', feat_regex)
#[chrs_id, feat_regex_id]

No chr23 -> chrX.


In [85]:
def concat_chrcsv(csv_dir, gcb, chrs):
    """Load the per-chromosome tables and stack them into one DataFrame.

    Reads '<csv_dir>/<gcb>_<chr>_MLtbl.csv' for every entry of `chrs`, in the
    order given, and concatenates the rows.

    Parameters
    ----------
    csv_dir : str
        Directory containing the CSV files.
    gcb : str
        File-name prefix.
    chrs : iterable of str
        Chromosome names used in the file names.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, with a fresh 0..n-1 row index.

    Raises
    ------
    ValueError
        From pandas.concat when `chrs` is empty.
    """
    frames = [
        pd.read_csv(csv_dir + '/' + gcb + '_' + chrom + '_MLtbl.csv')
        for chrom in chrs
    ]

    # Each file arrives with its own 0..n-1 index. Keeping those would repeat
    # labels across chromosomes, and the notebook later uses sampled index
    # labels as row positions, which would then only ever hit the first file.
    return pd.concat(frames, ignore_index=True)

def choose_features(df, feat_regex, label_col):
    """Keep only the columns whose name matches `feat_regex` or `label_col`.

    The two patterns are joined with '|' and searched anywhere in each column
    name. Non-matching columns are dropped from `df` in place, and the same
    (mutated) object is returned.
    """
    keep_pattern = re.compile(feat_regex + '|' + label_col)
    unwanted = [name for name in df.columns if keep_pattern.search(name) is None]
    df.drop(columns=unwanted, inplace=True)

    return df
    
def switch_contact(df):
    """Append an i/j-swapped copy of every row to the table.

    Columns whose name contains '.i_' and columns whose name contains '.j_'
    trade names pairwise by position: the k-th '.i_' column swaps with the
    k-th '.j_' column. The renamed copy is put back into the original column
    order and stacked under the original rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Original rows followed by the swapped rows. Row index labels are kept,
        so every label appears twice.

    Raises
    ------
    ValueError
        If the number of '.i_' columns differs from the number of '.j_' columns.
    """
    features = np.array(df.columns)
    is_icol = np.array(['.i_' in feat for feat in features], dtype=bool)
    is_jcol = np.array(['.j_' in feat for feat in features], dtype=bool)

    if is_icol.sum() != is_jcol.sum():
        raise ValueError(
            "Cannot swap contact columns: found " + str(int(is_icol.sum()))
            + " '.i_' columns but " + str(int(is_jcol.sum())) + " '.j_' columns."
        )

    features_switch = features.copy()
    features_switch[is_icol] = features[is_jcol]
    features_switch[is_jcol] = features[is_icol]

    # The `inplace` keyword of set_axis was removed in pandas 2.0; without it
    # the call returns a relabelled frame and leaves `df` untouched.
    df_switch = df.set_axis(features_switch, axis=1)
    df_switch = df_switch[features]

    return pd.concat([df, df_switch])

def sampleGroupInSeries(dfSeries, perc, group, seed_val):
    """Randomly pick a percentage of the entries of `dfSeries` equal to `group`.

    Parameters
    ----------
    dfSeries : pandas.Series
        Series of group labels.
    perc : float
        Percentage of the group's entries to draw. It is divided by 100, so
        5 means 5 %. The resulting count is rounded up.
    group : scalar
        Value to match in `dfSeries`.
    seed_val : int
        Seed for the private random generator used by this call.

    Returns
    -------
    list
        Index *labels* (not positions) of the drawn entries; empty when the
        group is absent or the requested count is zero.
    """
    in_group = np.asarray(dfSeries == group)
    row_ind_group = dfSeries.index[in_group].to_numpy()

    n_group = len(row_ind_group)
    samp_size = math.ceil(n_group * perc / 100)
    if samp_size == 0:
        return []

    # A private generator gives the same stream as np.random.seed(seed_val)
    # would, without clobbering the process-wide NumPy random state.
    rng = np.random.RandomState(seed_val)

    # Draw without replacement so a subsample never holds the same row twice.
    # Replacement is used only when more rows than exist are requested
    # (perc > 100), which keeps that case working as before.
    row_ind_group_sampled = rng.choice(
        a=row_ind_group, size=samp_size, replace=samp_size > n_group
    )

    return list(row_ind_group_sampled)

def sampleManyGroupsInSeries(dfSeries, percs, groups, seed_val):
    """Subsample several groups of `dfSeries`, each with its own percentage.

    Parameters
    ----------
    dfSeries : pandas.Series
        Series of group labels.
    percs : sequence of float
        percs[i] is the percentage passed to sampleGroupInSeries for groups[i].
    groups : sequence
        Group values to sample, processed in order.
    seed_val : int
        Master seed from which one seed per group is derived.

    Returns
    -------
    list
        Concatenation, in group order, of the lists returned by
        sampleGroupInSeries for every group.
    """
    num_groups = len(groups)

    # Derive the per-group seeds from `seed_val`. Previously they came from the
    # unseeded global generator, so `seed_val` had no effect and two runs with
    # the same seed produced different subsamples.
    seed_rng = np.random.RandomState(seed_val)
    seed_generated = seed_rng.randint(low=0, high=1000, size=num_groups, dtype=int)

    row_ind_manygroups_sampled = []

    for i, group in enumerate(groups):
        row_ind_manygroups_sampled.extend(
            sampleGroupInSeries(dfSeries, percs[i], group, seed_val=seed_generated[i])
        )

    return row_ind_manygroups_sampled

In [86]:
# File stem that encodes every setting that shaped the training table, and
# the full path of the CSV that table is saved to / reloaded from.
csv_id = f"{gcb}_{chrs_id}_MLtbl_{feat_regex_id}_seed{seed_val}_{sampling_id}"
csv_path = f"{csv_dir}/{csv_id}.csv"
#[csv_id, csv_path]

In [87]:
# Build the training table: load the per-chromosome CSVs, optionally subsample
# rows per label group, keep the selected feature columns, then append the
# i/j-swapped copy of every row. The result is written to csv_path only when
# subsampling is on.
if generate_train_data:

    print("Generating training data...")

    train_data = concat_chrcsv(csv_dir, gcb, chrs)

    if do_subsample:
        # The sampler returns index labels while rows are selected by position
        # below; renumbering first makes the two coincide for any input index.
        train_data = train_data.reset_index(drop=True)
        ind = sampleManyGroupsInSeries(train_data[label_col], percs, groups, seed_val=seed_val)
        # Copy so the in-place column drop in choose_features works on an
        # independent frame rather than on a slice of the full table.
        train_data = train_data.iloc[ind, :].copy()
        print(sampling_id  + ": Subsampled training data.")
    else:
        print("No subsampling of training data.")

    train_data = choose_features(train_data, feat_regex, label_col)
    train_data = switch_contact(train_data)

    if do_subsample:
        train_data.to_csv(csv_path)
        print("Training data saved as CSV..")

Generating training data...
5percExceptgrthn18: Subsampled training data.
Training data saved as CSV..


In [88]:
# Prepare the fit inputs: wrap the training table for AutoGluon and derive the
# model identifier / output path from the run settings.
if fit_model:

    if not do_subsample:
        # Use the table still held in memory from the generation cell.
        train_data = TabularDataset(train_data)
    else:
        train_data = TabularDataset(csv_path)
        # Drop the first column of the reloaded file; the generation cell saves
        # the table with its row index, which comes back as that column.
        train_data.drop(columns=train_data.columns[0], axis=1, inplace=True)
        print("Training data loaded from CSV.")

    model_id = '_'.join([
        gcb,
        chrs_id,
        feat_regex_id,
        presets,
        str(time_limit) + 'sec',
        problem_type,
        str(len(train_data)) + 'subsample',
    ])
    model_path = f"{out_dir}/agModel_{model_id}"

In [89]:
# Train the AutoGluon model on the prepared table and keep the fitted predictor.
if fit_model:
    predictor = TabularPredictor(
        label=label_col,
        eval_metric=metric,
        path=model_path,
        problem_type=problem_type,
        learner_kwargs={'label_count_threshold': label_count_threshold},
    )
    # fit() hands back the predictor, so rebinding keeps the same final value
    # as the original chained call.
    predictor = predictor.fit(
        train_data,
        ag_args_fit={'num_gpus': num_gpus},
        presets=presets,
        time_limit=time_limit,
    )