In [11]:
import ipynb
from ipynb.fs.full.A_libs_26sep_e30 import *
from ipynb.fs.full.B_funs_26sep_e30 import *

# Let pandas show up to 1000 rows / 1000 columns when a frame is displayed.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

### Configuration

In [12]:
# Directory that receives every artefact written by this notebook. The returned
# object is used path-style further down (`output_dir / '<file name>'`).
# `create_output_dir` comes from one of the star-imported helper notebooks.
output_dir = create_output_dir()
# Project helper, presumably seeds the random number generators so that runs
# are repeatable -- TODO confirm which libraries it covers.
seed_everything(seed=42)

Directory  ../results/_29_Create_KRIDGE_feats_16oct_e30  already exists


In [13]:
#my_submission = pd.read_csv('../input/firstpison26sep/submission0.014740.csv') #LB=0.01875 PCA + removing low std
#my_submission = pd.read_csv('../input/pison26sepv2/submission0.014731.csv') #LB=0.01874 PCA (variance=False)

class CFG:
    """Experiment settings, read as plain class attributes (e.g. ``CFG.num_folds``)."""
    # Number of cross-validation folds; used for the stratified split and for
    # every out-of-fold loop in this notebook.
    num_folds = 10
    # PCA settings. Not referenced by the cells visible in this notebook;
    # presumably consumed by other notebooks of the pipeline -- TODO confirm.
    use_pca = True
    pca_comp_genes = 30
    pca_comp_cells = 18
    
    # Variance-based feature filtering. Also not referenced by the visible
    # cells -- TODO confirm where (and whether) these are still used.
    variance_enc = False
    variance_thres = 0.50

In [14]:
# BEST: pca_comp_genes=30 pca_comp_cells=18 Local=0.014694 LB=0.01872

# NO_PCA Local=0.014734
# pca_comp_genes=20 Local=0.014724 
# pca_comp_genes=30 Local=0.014721
# pca_comp_genes=40 Local=0.014732
# pca_comp_genes=45 Local=0.014739 
# pca_comp_genes=50 Local=0.014731 LB=0.01874 !Comprobado
# pca_comp_genes=60 Local=0.014753
# pca_comp_genes=75 Local=0.014750

### DataBases

In [15]:
# Training inputs: one row per `sig_id` (the key used for the merge below).
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
# Loaded for completeness; not used by the cells visible in this notebook.
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')

# Test inputs and the submission template (only its shape is inspected below).
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [16]:
# Attach the scored targets to the feature rows, matching on `sig_id`.
train_orig = train_features.merge(train_targets_scored, on='sig_id')

# Separate control-vehicle rows from treated rows; `cp_type` is constant within
# each part afterwards, so it is dropped and the index is rebuilt from 0.
train_is_ctl = train_orig['cp_type'] == 'ctl_vehicle'
train_noctl = train_orig.loc[~train_is_ctl].drop(columns='cp_type').reset_index(drop=True)
train_ctl = train_orig.loc[train_is_ctl].drop(columns='cp_type').reset_index(drop=True)

test_is_ctl = test_features['cp_type'] == 'ctl_vehicle'
test_noctl = test_features.loc[~test_is_ctl].drop(columns='cp_type').reset_index(drop=True)
test_ctl = test_features.loc[test_is_ctl].drop(columns='cp_type').reset_index(drop=True)

# Target table (including `sig_id`) restricted to the treated training rows.
target = train_noctl[train_targets_scored.columns]

# Column groups, identified by their name prefixes.
GENES = [name for name in train_features.columns if name.startswith('g-')]
CELLS = [name for name in train_features.columns if name.startswith('c-')]
target_cols = [name for name in target.columns if name != 'sig_id']
numeric_cols = GENES + CELLS

In [17]:
# Multilabel-stratified split of the treated training rows into CFG.num_folds
# folds; each row's validation fold id is stored in the integer `kfold` column.
folds = train_noctl.copy()
mskf = MultilabelStratifiedKFold(n_splits=CFG.num_folds, shuffle=True, random_state=43)

fold_of_row = np.full(len(folds), -1, dtype=int)
for fold_id, (train_rows, valid_rows) in enumerate(mskf.split(X=train_noctl, y=target)):
    # `valid_rows` are positions, which equal the labels of the rebuilt index.
    fold_of_row[valid_rows] = fold_id

folds['kfold'] = fold_of_row

In [18]:
# Sanity check: print (rows, columns) of each working table, one per line.
for frame in (train_orig, folds, test_noctl, target, sample_submission):
    print(frame.shape)

(23814, 1082)
(21948, 1082)
(3624, 875)
(21948, 207)
(3982, 207)


### Search individual features

In [19]:
def trimm_correlated(df_in, threshold):
    """Return the column labels of ``df_in`` that survive a correlation filter.

    Columns are scanned left to right. A column is discarded when its absolute
    Pearson correlation with ANY column to its left exceeds ``threshold``
    (strictly greater than); the first column is therefore always retained.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Numeric feature table, one feature per column.
    threshold : float
        Absolute-correlation cut-off.

    Returns
    -------
    pandas.Index
        Labels of the retained columns, in their original order.
    """
    corr_abs = df_in.corr(method='pearson', min_periods=1).abs()
    n_feats = len(corr_abs)
    # Keep only the strictly-upper triangle: entry (i, j) with i < j relates
    # column j to an earlier column i. Everything else becomes NaN, and
    # NaN > threshold evaluates to False.
    earlier_pairs = np.triu(np.ones((n_feats, n_feats), dtype=bool), k=1)
    too_similar = corr_abs.where(earlier_pairs) > threshold
    keep_flags = ~too_similar.any()
    return keep_flags[keep_flags].index

In [20]:
from scipy.stats import norm
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.kernel_ridge import KernelRidge
from scipy.stats import ks_2samp
import pickle

# Build one `kridge_<target>` feature per scored target:
#   1. rank gene/cell features with a two-sample Kolmogorov-Smirnov test
#      (positives vs. negatives of the target),
#   2. drop features that are highly correlated with an earlier one,
#   3. produce out-of-fold KernelRidge predictions for the training rows,
#   4. refit on all training rows and predict the test rows,
#   5. checkpoint the model, the feature tables and the running metrics.
folds_feat = folds.copy()
test_feat = test_noctl.copy()

# Encode the dose as 0/1 (1.0 == 'D2') and divide the treatment time by 72.
for feat_frame in (folds_feat, test_feat):
    feat_frame['cp_dose'] = (feat_frame['cp_dose'] == 'D2').astype(float)
    feat_frame['cp_time'] = feat_frame['cp_time'] / 72.0

thr = 0.05    # KS-test p-value threshold: a feature is kept when p < thr
thr2 = 0.93   # absolute Pearson correlation above which a feature is dropped
alpha = 100   # KernelRidge regularisation strength

res = []
res_best = []
features_models = []
list_target_cols = []
list_new_cols = []
list_AUC = []
list_loss = []

feat_names = GENES + CELLS
all_feats = np.array(feat_names)
fold_ids = folds_feat['kfold'].values

for ncol, col_target in enumerate(tqdm(target_cols)):
    list_target_cols.append(col_target)
    name_feat = f'kridge_{col_target}'
    list_new_cols.append(name_feat)

    # Default to 0.0 in BOTH tables so train and test always expose the same
    # columns, even when no feature passes the filters for this target.
    folds_feat[name_feat] = 0.0
    test_feat[name_feat] = 0.0

    # Compare the distribution of positives and negatives with a
    # Kolmogorov-Smirnov test, ignoring feature values <= -4.
    # NOTE: local names deliberately avoid `target`, which is the notebook-level
    # target DataFrame used by the fold-assignment cell.
    labels_all = train_noctl[col_target].values
    pvalues = []
    for col_feat in feat_names:
        values_all = train_noctl[col_feat].values
        in_range = values_all > -4
        values = values_all[in_range]
        labels = labels_all[in_range]
        positive = values[labels == 1]
        negative = values[labels == 0]
        if len(positive) > 0 and len(negative) > 0:
            pvalues.append(ks_2samp(positive, negative)[1])
        else:
            # The test is undefined with an empty sample: treat as uninformative.
            pvalues.append(1.0)
    pvalues = np.array(pvalues)

    feats_trn = all_feats[pvalues < thr]
    if len(feats_trn) > 0:
        feats_trn_uncorr = trimm_correlated(folds_feat[feats_trn], thr2).tolist()
    else:
        feats_trn_uncorr = []
    if len(feats_trn_uncorr) == 0:
        # Nothing to train on: the zero-filled columns above are the feature.
        continue

    # The scaler is fitted once on all training rows; the same scaled matrix is
    # used for the out-of-fold models and for the final model, and the same
    # scaler transforms the test rows.
    scaler = StandardScaler()
    X_trn = scaler.fit_transform(folds_feat[feats_trn_uncorr].astype(float).values)
    X_test = scaler.transform(test_feat[feats_trn_uncorr].astype(float).values)
    y = folds_feat[col_target].astype(float).values

    # Out-of-fold predictions for the training rows.
    oof = np.zeros(len(folds_feat))
    for fold in range(CFG.num_folds):
        is_val = fold_ids == fold
        y_trn_fold = y[~is_val]
        # With 10 or fewer positives in the training part no model is fitted and
        # the validation rows keep their 0.0 default.
        if np.sum(y_trn_fold) > 10:
            model = KernelRidge(alpha=alpha, kernel='rbf')
            model.fit(X_trn[~is_val], y_trn_fold)
            oof[is_val] = model.predict(X_trn[is_val])
    folds_feat[name_feat] = oof

    auc = roc_auc_score(folds_feat[col_target].values, oof)
    list_AUC.append(auc)
    loss = log_loss(folds_feat[col_target].values, oof)
    list_loss.append(loss)
    lenfeats = len(feats_trn)
    lenfeats_uncorr = len(feats_trn_uncorr)
    res.append({'ncol': ncol, 'col': col_target, 'thr': thr, 'thr2': thr2,
                'lenfeats': lenfeats, 'lenfeats_uncorr': lenfeats_uncorr,
                'sumones': folds_feat[col_target].sum(), 'AUC': auc, 'Mean_AUC': np.mean(list_AUC),
                'loss': loss, 'Mean_loss': np.mean(list_loss)})

    # Provisional mlogloss over the targets processed so far. Rows of
    # train_targets_scored that are absent from folds_feat (the control rows)
    # have no prediction after the left merge and are scored as 0.
    valid_results_score = train_targets_scored[['sig_id']].merge(
        folds_feat[['sig_id'] + list_new_cols], on='sig_id', how='left').fillna(0)
    y_true = train_targets_scored[list_target_cols].values
    valid_results_score.columns = ['sig_id'] + list_target_cols
    y_pred = valid_results_score[list_target_cols].values
    score = 0
    for i in range(len(list_target_cols)):
        score += log_loss(y_true[:, i], y_pred[:, i])
    score /= len(list_target_cols)
    print(f'BEST {ncol+1}/{len(target_cols)}: col={col_target} thr={thr} thr2={thr2} lenfeats={lenfeats} lenfeats_uncorr={lenfeats_uncorr} sum_ones={folds_feat[col_target].sum()} loss={loss} AUC={auc} Mean_AUC={np.mean(list_AUC)} mlogloss={score}')

    # Final model on all training rows -> test feature.
    model = KernelRidge(alpha=alpha, kernel='rbf')
    model.fit(X_trn, y)
    test_feat[name_feat] = model.predict(X_test)

    # Save all. The pickle holds three consecutive objects: the model, the list
    # of input features and (appended last, so existing two-load readers keep
    # working) the fitted scaler required to reuse the model.
    with open(output_dir / f'model__{name_feat}__{col_target}.pkl', 'wb') as file:
        pickle.dump(model, file)
        pickle.dump(feats_trn_uncorr, file)
        pickle.dump(scaler, file)

    # Checkpoint after every target so an interrupted run keeps its progress.
    folds_feat.to_csv(output_dir / 'folds.csv', index=False)
    test_feat.to_csv(output_dir / 'test.csv', index=False)

    res_best.append({'ncol': ncol, 'col': col_target, 'thr': thr, 'thr2': thr2,
                     'lenfeats': lenfeats, 'lenfeats_uncorr': lenfeats_uncorr,
                     'sumones': folds_feat[col_target].sum(), 'AUC': auc, 'loss': loss, 'mlogloss': score})
    res_df = pd.DataFrame(res_best)
    res_df.to_csv(output_dir / 'res.csv', index=False)

HBox(children=(FloatProgress(value=0.0, max=206.0), HTML(value='')))

BEST 1/206: col=5-alpha_reductase_inhibitor thr=0.05 thr2=0.93 lenfeats=215 lenfeats_uncorr=215 sum_ones=17 loss=0.004952995346922938 AUC=0.971592722630067 Mean_AUC=0.971592722630067 mlogloss=0.004564892158993303
BEST 2/206: col=11-beta-hsd1_inhibitor thr=0.05 thr2=0.93 lenfeats=108 lenfeats_uncorr=108 sum_ones=18 loss=0.005689574020408316 AUC=0.9207681005218625 Mean_AUC=0.9461804115759648 mlogloss=0.004904323349168348
BEST 3/206: col=acat_inhibitor thr=0.05 thr2=0.93 lenfeats=29 lenfeats_uncorr=29 sum_ones=24 loss=0.010190746828133646 AUC=0.7969538709481239 Mean_AUC=0.8964382313666844 mlogloss=0.006400291479214876
BEST 4/206: col=acetylcholine_receptor_agonist thr=0.05 thr2=0.93 lenfeats=284 lenfeats_uncorr=284 sum_ones=190 loss=0.04698000419371844 AUC=0.714768675526485 Mean_AUC=0.8510208424066346 mlogloss=0.015624913453239728
BEST 5/206: col=acetylcholine_receptor_antagonist thr=0.05 thr2=0.93 lenfeats=476 lenfeats_uncorr=476 sum_ones=301 loss=0.06951532761097365 AUC=0.67527038726334

BEST 41/206: col=bacterial_30s_ribosomal_subunit_inhibitor thr=0.05 thr2=0.93 lenfeats=102 lenfeats_uncorr=102 sum_ones=60 loss=0.01846382794701785 AUC=0.7930662158869395 Mean_AUC=0.7713121930798014 mlogloss=0.016892146408032747
BEST 42/206: col=bacterial_50s_ribosomal_subunit_inhibitor thr=0.05 thr2=0.93 lenfeats=140 lenfeats_uncorr=140 sum_ones=80 loss=0.023073621559943084 AUC=0.8091657901957198 Mean_AUC=0.7722134692016089 mlogloss=0.016996277102898853
BEST 43/206: col=bacterial_antifolate thr=0.05 thr2=0.93 lenfeats=151 lenfeats_uncorr=151 sum_ones=36 loss=0.010732654778522683 AUC=0.8781045900774817 Mean_AUC=0.7746760534080247 mlogloss=0.016831053757779005
BEST 44/206: col=bacterial_cell_wall_synthesis_inhibitor thr=0.05 thr2=0.93 lenfeats=340 lenfeats_uncorr=340 sum_ones=192 loss=0.04758585301832906 AUC=0.7107184512318441 Mean_AUC=0.7732224715403841 mlogloss=0.017445283343754538
BEST 45/206: col=bacterial_dna_gyrase_inhibitor thr=0.05 thr2=0.93 lenfeats=205 lenfeats_uncorr=205 sum_

BEST 80/206: col=dopamine_receptor_antagonist thr=0.05 thr2=0.93 lenfeats=539 lenfeats_uncorr=539 sum_ones=424 loss=0.09033546149601805 AUC=0.7025312682990117 Mean_AUC=0.7758814918004513 mlogloss=0.01930683312606741
BEST 81/206: col=egfr_inhibitor thr=0.05 thr2=0.93 lenfeats=658 lenfeats_uncorr=658 sum_ones=336 loss=0.05288308972250019 AUC=0.9510703654495299 Mean_AUC=0.7780443174010572 mlogloss=0.01967019710283658
BEST 82/206: col=elastase_inhibitor thr=0.05 thr2=0.93 lenfeats=91 lenfeats_uncorr=91 sum_ones=6 loss=0.009441983705553398 AUC=0.5 Mean_AUC=0.7746535330425077 mlogloss=0.019536440255850968
BEST 83/206: col=erbb2_inhibitor thr=0.05 thr2=0.93 lenfeats=11 lenfeats_uncorr=11 sum_ones=1 loss=0.0015736639509263994 AUC=0.5 Mean_AUC=0.7713444543311521 mlogloss=0.019318535625559637
BEST 84/206: col=estrogen_receptor_agonist thr=0.05 thr2=0.93 lenfeats=263 lenfeats_uncorr=263 sum_ones=158 loss=0.04014115911724882 AUC=0.7308151457235639 Mean_AUC=0.7708619625620142 mlogloss=0.01952897933

BEST 120/206: col=kit_inhibitor thr=0.05 thr2=0.93 lenfeats=601 lenfeats_uncorr=601 sum_ones=273 loss=0.05977826780778955 AUC=0.8783369033888067 Mean_AUC=0.8001806315503036 mlogloss=0.019650743760094562
BEST 121/206: col=laxative thr=0.05 thr2=0.93 lenfeats=53 lenfeats_uncorr=53 sum_ones=6 loss=0.009441983705553398 AUC=0.5 Mean_AUC=0.7976997998845987 mlogloss=0.01956025939554846
BEST 122/206: col=leukotriene_inhibitor thr=0.05 thr2=0.93 lenfeats=42 lenfeats_uncorr=42 sum_ones=6 loss=0.009441983705553398 AUC=0.5 Mean_AUC=0.7952596375904626 mlogloss=0.019471258381240815
BEST 123/206: col=leukotriene_receptor_antagonist thr=0.05 thr2=0.93 lenfeats=116 lenfeats_uncorr=116 sum_ones=62 loss=0.017689129769314214 AUC=0.829824928588905 Mean_AUC=0.7955406562164662 mlogloss=0.019445500653527882
BEST 124/206: col=lipase_inhibitor thr=0.05 thr2=0.93 lenfeats=55 lenfeats_uncorr=55 sum_ones=12 loss=0.00919321399887328 AUC=0.7135758570386579 Mean_AUC=0.794879649771484 mlogloss=0.019357011606790722
BES

BEST 160/206: col=progesterone_receptor_agonist thr=0.05 thr2=0.93 lenfeats=312 lenfeats_uncorr=312 sum_ones=119 loss=0.03140189449153449 AUC=0.850297441804153 Mean_AUC=0.797844496830211 mlogloss=0.01933234437028541
BEST 161/206: col=progesterone_receptor_antagonist thr=0.05 thr2=0.93 lenfeats=102 lenfeats_uncorr=102 sum_ones=18 loss=0.005116259105079315 AUC=0.9719410244718042 Mean_AUC=0.7989258417223948 mlogloss=0.019241555666215077
BEST 162/206: col=prostaglandin_inhibitor thr=0.05 thr2=0.93 lenfeats=148 lenfeats_uncorr=148 sum_ones=36 loss=0.011012424020165229 AUC=0.8385942760942761 Mean_AUC=0.7991707086012335 mlogloss=0.01918543199266257
BEST 163/206: col=prostanoid_receptor_antagonist thr=0.05 thr2=0.93 lenfeats=163 lenfeats_uncorr=163 sum_ones=84 loss=0.02371876345528669 AUC=0.7378747190423919 Mean_AUC=0.7987946595855352 mlogloss=0.019201841771424297
BEST 164/206: col=proteasome_inhibitor thr=0.05 thr2=0.93 lenfeats=860 lenfeats_uncorr=860 sum_ones=726 loss=0.08892780413557706 AU

BEST 199/206: col=trpv_antagonist thr=0.05 thr2=0.93 lenfeats=65 lenfeats_uncorr=65 sum_ones=48 loss=0.015298282316752947 AUC=0.8220852359208524 Mean_AUC=0.7973078855110013 mlogloss=0.019097312874582772
BEST 200/206: col=tubulin_inhibitor thr=0.05 thr2=0.93 lenfeats=802 lenfeats_uncorr=802 sum_ones=316 loss=0.0459148300238122 AUC=0.9558626811662049 Mean_AUC=0.7981006594892774 mlogloss=0.01921341165693083
BEST 201/206: col=tyrosine_kinase_inhibitor thr=0.05 thr2=0.93 lenfeats=130 lenfeats_uncorr=130 sum_ones=73 loss=0.020373994015926277 AUC=0.7956828180039139 Mean_AUC=0.7980886304271612 mlogloss=0.019211243158585942
BEST 202/206: col=ubiquitin_specific_protease_inhibitor thr=0.05 thr2=0.93 lenfeats=49 lenfeats_uncorr=49 sum_ones=6 loss=0.0094419837055534 AUC=0.5 Mean_AUC=0.7966129441379178 mlogloss=0.019159217873890053
BEST 203/206: col=vegfr_inhibitor thr=0.05 thr2=0.93 lenfeats=288 lenfeats_uncorr=288 sum_ones=170 loss=0.040326076779139707 AUC=0.885657949468703 Mean_AUC=0.797051589484

In [21]:
# STOP!!!!
# Deliberate guard: `sys.exit(0)` raises SystemExit, which ends a top-to-bottom
# run here. The cells below this point are not meant to run automatically;
# execute them by hand if needed.
sys.exit(0)
# STOP!!!!


SystemExit: 0

In [None]:
# Provisional mlogloss: mean per-target log loss of the generated feature
# columns against the scored targets. Rows of train_targets_scored without a
# match in folds_feat get no prediction from the left merge and are scored as 0.
valid_results_score = (
    train_targets_scored[['sig_id']]
    .merge(folds_feat[['sig_id'] + list_new_cols], on='sig_id', how='left')
    .fillna(0)
)
valid_results_score.columns = ['sig_id'] + list_target_cols
y_true = train_targets_scored[list_target_cols].values
y_pred = valid_results_score[list_target_cols].values
score = sum(
    log_loss(y_true[:, idx], y_pred[:, idx]) for idx in range(len(list_target_cols))
) / len(list_target_cols)
score

In [None]:
from scipy.stats import norm
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import ks_2samp
import pickle

# Build one `bayes_<target>` feature per scored target with ComplementNB,
# searching over the KS p-value threshold (thr) and the correlation threshold
# (thr2) and keeping the pair with the best out-of-fold AUC.
folds_feat = folds.copy()
test_feat = test_noctl.copy()

# Encode the dose as 0/1 (1.0 == 'D2') and divide the treatment time by 72.
for feat_frame in (folds_feat, test_feat):
    feat_frame['cp_dose'] = (feat_frame['cp_dose'] == 'D2').astype(float)
    feat_frame['cp_time'] = feat_frame['cp_time'] / 72.0

res = []
res_best = []
features_models = []
list_target_cols = []
list_new_cols = []

feat_names = GENES + CELLS
all_feats = np.array(feat_names)
fold_ids = folds_feat['kfold'].values

for ncol, col_target in enumerate(target_cols):
    # Only targets with position > 100 are processed by this cell.
    if ncol <= 100:
        continue

    list_target_cols.append(col_target)
    name_feat = f'bayes_{col_target}'
    list_new_cols.append(name_feat)

    # Default to 0.0 in BOTH tables so train and test always expose the same
    # columns, even when the search below finds nothing usable.
    folds_feat[name_feat] = 0.0
    test_feat[name_feat] = 0.0

    # Compare the distribution of positives and negatives with a
    # Kolmogorov-Smirnov test, ignoring feature values <= -4.
    # NOTE: local names deliberately avoid `target`, which is the notebook-level
    # target DataFrame used by the fold-assignment cell.
    labels_all = train_noctl[col_target].values
    pvalues = []
    for col_feat in feat_names:
        values_all = train_noctl[col_feat].values
        in_range = values_all > -4
        values = values_all[in_range]
        labels = labels_all[in_range]
        positive = values[labels == 1]
        negative = values[labels == 0]
        if len(positive) > 0 and len(negative) > 0:
            pvalues.append(ks_2samp(positive, negative)[1])
        else:
            # The test is undefined with an empty sample: treat as uninformative.
            pvalues.append(1.0)
    pvalues = np.array(pvalues)

    # Search best model with different thresholds
    y_all = folds_feat[col_target].astype(float).values
    best_auc = 0.0
    best_thr = 0.0   # thr = p-value threshold
    best_thr2 = 0.0  # thr2 = correlation threshold
    best_lenfeats = 0
    best_lenfeats_uncorr = 0
    best_list_feats = None
    best_oof = None

    for thr in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4]:
        feats_trn = all_feats[pvalues < thr]
        if len(feats_trn) == 0:
            continue
        for thr2 in [0.50, 0.60, 0.75, 0.80, 0.85, 0.90, 0.95]:
            feat_list = trimm_correlated(folds_feat[feats_trn], thr2).tolist()
            if len(feat_list) == 0:
                continue

            X_all = folds_feat[feat_list].astype(float).values
            # A fresh array per combination: the best one can be kept by
            # reference without being overwritten by later combinations.
            oof = np.zeros(len(folds_feat))
            for fold in range(CFG.num_folds):
                is_val = fold_ids == fold
                y = y_all[~is_val]
                # predict_proba has no positive-class column when the training
                # part holds a single class; leave those rows at 0.0.
                if np.unique(y).size < 2:
                    continue

                # The scaler is fitted on the training part of the fold only.
                scaler = MinMaxScaler()
                X_trn = scaler.fit_transform(X_all[~is_val])
                X_fold = scaler.transform(X_all[is_val])

                model = ComplementNB()
                model.fit(X_trn, y)
                oof[is_val] = model.predict_proba(X_fold)[:, 1]

            auc = roc_auc_score(folds_feat[col_target].values, oof)
            lenfeats = len(feats_trn)
            lenfeats_uncorr = len(feat_list)
            res.append({'ncol': ncol, 'col': col_target, 'thr': thr, 'thr2': thr2,
                        'lenfeats': lenfeats, 'lenfeats_uncorr': lenfeats_uncorr,
                        'sumones': folds_feat[col_target].sum(), 'AUC': auc})
            print(f'{ncol+1}/{len(target_cols)}: col={col_target} thr={thr} thr2={thr2} lenfeats={lenfeats} lenfeats_uncorr={lenfeats_uncorr} sum_ones={folds_feat[col_target].sum()} AUC={auc}')
            if auc > best_auc:
                best_auc = auc
                best_thr = thr
                best_thr2 = thr2
                best_lenfeats = lenfeats
                best_lenfeats_uncorr = lenfeats_uncorr
                best_oof = oof
                best_list_feats = feat_list
    # End of optimize loop

    if best_list_feats is None:
        # No threshold pair produced a usable model: keep the zero columns and
        # move on instead of reusing values left over from a previous target.
        continue

    # Keep best
    auc = best_auc
    thr = best_thr
    thr2 = best_thr2
    lenfeats = best_lenfeats
    lenfeats_uncorr = best_lenfeats_uncorr
    folds_feat[name_feat] = best_oof
    feats_trn_uncorr = best_list_feats

    # For Test DB: refit on all training rows with the selected features.
    X_trn = folds_feat[feats_trn_uncorr].astype(float).values
    X_test = test_feat[feats_trn_uncorr].astype(float).values
    y = folds_feat[col_target].astype(float).values

    scaler = MinMaxScaler()
    X_trn = scaler.fit_transform(X_trn)
    X_test = scaler.transform(X_test)

    model = ComplementNB()
    model.fit(X_trn, y)
    test_feat[name_feat] = model.predict_proba(X_test)[:, 1]

    # Provisional mlogloss over the targets processed so far. Rows of
    # train_targets_scored that are absent from folds_feat (the control rows)
    # have no prediction after the left merge and are scored as 0.
    valid_results = train_targets_scored[['sig_id']].merge(
        folds_feat[['sig_id'] + list_new_cols], on='sig_id', how='left').fillna(0)
    y_true = train_targets_scored[list_target_cols].values
    valid_results.columns = ['sig_id'] + list_target_cols
    y_pred = valid_results[list_target_cols].values
    score = 0
    for i in range(len(list_target_cols)):
        score += log_loss(y_true[:, i], y_pred[:, i])
    score /= len(list_target_cols)

    # Save models and results (checkpointed after every target).
    folds_feat.to_csv('../input/bayes_folds_feats30sep_e31.csv', index=False)
    test_feat.to_csv('../input/bayes_test_feats30sep_e31.csv', index=False)

    # The pickle holds the model followed by the fitted scaler needed to reuse it
    # (appended last, so a reader doing a single load still gets the model).
    with open(output_dir / f'bayes_model__{name_feat}.pkl', 'wb') as file:
        pickle.dump(model, file)
        pickle.dump(scaler, file)

    # Each feature file holds the cumulative list of selected feature sets.
    features_models.append(feats_trn_uncorr)
    with open(output_dir / f'bayes_feats__{name_feat}.pkl', 'wb') as file:
        pickle.dump(features_models, file)

    res_best.append({'ncol': ncol, 'col': col_target, 'thr': thr, 'thr2': thr2,
                     'lenfeats': lenfeats, 'lenfeats_uncorr': lenfeats_uncorr,
                     'sumones': folds_feat[col_target].sum(), 'AUC': auc, 'mlogloss': score})
    res_df = pd.DataFrame(res_best)
    res_df.to_csv(output_dir / 'bayes_res_30sep_e31.csv', index=False)
    resall_df = pd.DataFrame(res)
    resall_df.to_csv(output_dir / 'bayes_resall_30sep_e31.csv', index=False)

    print(f'BEST {ncol+1}/{len(target_cols)}: col={col_target} thr={thr} thr2={thr2} lenfeats={lenfeats} lenfeats_uncorr={lenfeats_uncorr} sum_ones={folds_feat[col_target].sum()} AUC={auc} mlogloss={score}')
        

In [None]:
# from scipy.stats import norm
# from sklearn.naive_bayes import ComplementNB
# from sklearn.metrics import roc_auc_score
# from sklearn.preprocessing import MinMaxScaler
# from scipy.stats import ks_2samp
# import pickle

# folds_feat = folds.copy()
# test_feat = test_noctl.copy()

# folds_feat['cp_dose'] = (folds_feat['cp_dose']=='D2').astype(float)
# folds_feat['cp_time'] /= 72.0

# test_feat['cp_dose'] = (test_feat['cp_dose']=='D2').astype(float)
# test_feat['cp_time'] /= 72.0

# res = []
# res_best = []
# features_models = []
# for ncol, col_target in enumerate(target_cols):
#     pvalues = []
#     for col_feat in GENES+CELLS:
#         values_orig = train_noctl[col_feat].values
#         target_orig = train_noctl[col_target].values
#         target = target_orig[values_orig> -4]
#         values = values_orig[values_orig> -4]
#         positive = values[target==1]
#         negative = values[target==0]
#         pvalues.append(ks_2samp(positive, negative)[1])
#     pvalues = np.array(pvalues)
#     best_auc = 0.0
#     best_thr = 0.0
#     best_thr2 = 0.0
#     best_lenfeats = 0
#     best_lenfeats_uncorr = 0
#     best_list_feats = []
    
#     for thr in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4]:
#         for thr2 in [0.60, 0.75, 0.80, 0.85, 0.90, 0.95]:
#             name_feat = f'bayes{col_target}'
#             folds_feat[name_feat] = 0.0
#             feats_trn = np.array(GENES+CELLS)[pvalues<thr]
#             feats_trn_uncorr = trimm_correlated(folds_feat[feats_trn], thr2)
# #             print(len(feats_trn), len(feats_trn_uncorr))

#             if len(feats_trn)>0:
#                 for fold in range(CFG.num_folds):
#                     X_trn = folds_feat.loc[folds_feat['kfold']!=fold, ['cp_time','cp_dose']+feats_trn_uncorr.tolist()].astype(float).values
#                     X_fold = folds_feat.loc[folds_feat['kfold']==fold,['cp_time','cp_dose']+feats_trn_uncorr.tolist()].astype(float).values

#                     scaler = MinMaxScaler()
#                     X_trn = scaler.fit_transform(X_trn)
#                     X_fold = scaler.transform(X_fold)

#                     y = folds_feat.loc[folds_feat['kfold']!=fold, col_target].astype(float).values
#                     model = ComplementNB()
#                     model.fit(X_trn, y)
#                     folds_feat.loc[folds_feat['kfold']==fold, name_feat] = model.predict(X_fold)

#                 auc = roc_auc_score(folds_feat[col_target].values, folds_feat[name_feat].values)
#                 lenfeats = len(feats_trn)
#                 lenfeats_uncorr = len(feats_trn_uncorr)
#                 res.append(dict({'ncol':ncol, 'col':col_target, 'thr':thr, 'thr2':thr2, \
#                                  'lenfeats':lenfeats, 'lenfeats_uncorr':lenfeats_uncorr, \
#                                  'sumones':folds_feat[col_target].sum(), 'AUC':auc}))
# #                 print(f'{ncol+1}/{len(target_cols)}: col={col_target} thr={thr} thr2={thr2} lenfeats={lenfeats} lenfeats_uncorr={lenfeats_uncorr} sum_ones={folds_feat[col_target].sum()} AUC={auc}')
#     #             print(f'{ncol+1}/{len(target_cols)}: col={col_target} thr={thr} lenfeats={lenfeats} sum_ones={folds_feat[col_target].sum()} AUC={auc}')
#                 if auc>best_auc:
#                     best_auc = auc
#                     best_thr = thr
#                     best_thr2 = thr2
#                     best_lenfeats = lenfeats
#                     best_lenfeats_uncorr = lenfeats_uncorr
#                     preds = folds_feat[name_feat].values
#                     best_list_feats = feats_trn_uncorr
    
#     # Keep best
#     auc = best_auc
#     thr = best_thr
#     thr2 = best_thr2
#     lenfeats = best_lenfeats
#     lenfeats_uncorr = best_lenfeats_uncorr
#     folds_feat[name_feat] = preds
#     feats_trn_uncorr = best_list_feats
        
#     # With Test DB
#     X_trn = folds_feat[feats_trn_uncorr.tolist()].astype(float).values
#     X_test = test_feat[feats_trn_uncorr.tolist()].astype(float).values
#     y = folds_feat[col_target].astype(float).values
    
#     scaler = MinMaxScaler()
#     X_trn = scaler.fit_transform(X_trn)
#     X_test = scaler.transform(X_test)
    
#     model = ComplementNB()
#     model.fit(X_trn, y)
#     test_feat[name_feat] = model.predict(X_test)
    
#     # Save models and results
#     folds_feat.to_csv('../input/bayes_folds_feats30sep_e30.csv',index=False)
#     test_feat.to_csv('../input/bayes_test_feats30sep_e30.csv',index=False)

#     with open(output_dir / f'bayesmodel__{name_feat}__{auc:.6f}.pkl', 'wb') as file:
#         pickle.dump(model, file)
    
#     features_models.append(feats_trn_uncorr.tolist())
#     with open(output_dir / f'bayesfeats__{name_feat}.pkl', 'wb') as file:
#         pickle.dump(features_models, file)
        
#     res_best.append(dict({'ncol':ncol, 'col':col_target, 'thr':thr, 'thr2':thr2, \
#                          'lenfeats':lenfeats, 'lenfeats_uncorr':lenfeats_uncorr, \
#                          'sumones':folds_feat[col_target].sum(), 'AUC':auc}))
    
#     res_df = pd.DataFrame(res_best)
#     res_df.to_csv('../input/bayesres_30sep.csv',index=False)
#     print(f'BEST {ncol+1}/{len(target_cols)}: col={col_target} thr={thr} thr2={thr2} lenfeats={lenfeats} lenfeats_uncorr={lenfeats_uncorr} sum_ones={folds_feat[col_target].sum()} AUC={auc}')

In [None]:
# Turn the accumulated result records into a table. This rebinds `res`: from
# here on it is a DataFrame, no longer the list filled by the training cell.
res = pd.DataFrame(res)

In [None]:
# Display `res` (a DataFrame once the cell above has run).
res

In [None]:
        
#         prob_pos = norm.pdf(values_orig, np.mean(positive), np.std(positive))
#         prob_neg = norm.pdf(values_orig, np.mean(negative), np.std(negative))
#         pvalue_probs = ks_2samp(prob_pos, prob_neg)[1]
#         df = pd.DataFrame({'target':target_orig, 'diff':prob_pos-prob_neg})
#         ax = sns.boxplot(x='target', y='diff', data=df)
#         plt.title(f'feat={col_feat} target={col_target}) n1s={np.sum(target_orig==1)} pv_negpos={pvalue_neg_pos:.6f}')
#         plt.show()
        
#         q25_pos = np.quantile(prob_pos, q=0.25)
#         q75_pos = np.quantile(prob_pos, q=0.75)
#         q25_neg = np.quantile(prob_neg, q=0.25)
#         q75_neg = np.quantile(prob_neg, q=0.75)
    
    
#     name_feat = f'bayes{col_target}'
#     folds_feat[name_feat] = 0.0
#     for fold in range(CFG.num_folds):
#         X_trn = folds_feat.loc[folds_feat['kfold']!=fold, GENES+CELLS].astype(float).values
#         X_fold = folds_feat.loc[folds_feat['kfold']==fold,GENES+CELLS].astype(float).values
#         scaler = MinMaxScaler()
        
#         X_trn = scaler.fit_transform(X_trn)
#         X_fold = scaler.transform(X_fold)
        
#         y = folds_feat.loc[folds_feat['kfold']!=fold, col_target].astype(float).values
#         model = ComplementNB()
#         model.fit(X_trn, y)
#         folds_feat.loc[folds_feat['kfold']==fold, name_feat] = model.predict(X_fold)
    
#     auc = roc_auc_score(folds_feat[col_target].values, folds_feat[name_feat].values)
#     res.append(dict({'ncol':ncol, 'col':col_target, 'sumones':folds_feat[col_target].sum(), 'AUC':auc}))
#     print(f'{ncol+1}/{len(target_cols)}: col={col_target} sum_ones={folds_feat[col_target].sum()} AUC={auc}')
    
# res = pd.DataFrame(res)
#             if auc>best_auc:
#                 best_auc = auc
#                 best_alpha = alpha
#                 preds = folds_feat.loc[f'{name_feat}_{col}']

In [None]:
from scipy.stats import norm
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import ks_2samp
# Per-target ComplementNB meta-feature.
# For every target: (1) rank GENES+CELLS by the two-sample KS p-value between
# positive and negative samples, (2) for several p-value thresholds train an
# out-of-fold ComplementNB on the selected features, (3) keep the out-of-fold
# predictions of the threshold with the best AUC in column `bayes<target>`.
# NOTE(review): the KS ranking is computed on all of `train_noctl`, i.e. it
# also sees the rows of the fold being predicted — confirm this is acceptable.
folds_feat = folds.copy()
test_noctl_feat = test_noctl.copy()
res = []
for ncol, col_target in enumerate(target_cols):
    name_feat = f'bayes{col_target}'
    target_orig = train_noctl[col_target].values  # same for every feature of this target

    pvalues = []
    for col_feat in GENES+CELLS:
        values_orig = train_noctl[col_feat].values
        # Values <= -4 are excluded from the KS comparison.
        target = target_orig[values_orig> -4]
        values = values_orig[values_orig> -4]
        positive = values[target==1]
        negative = values[target==0]
        pvalues.append(ks_2samp(positive, negative)[1])
    pvalues = np.array(pvalues)

    best_auc = 0.0
    best_thr = 0.0
    best_lenfeats = 0
    # Reset for every target. Without this, a target for which no threshold
    # selects any feature (or no AUC beats 0) would silently inherit the
    # predictions of the previous target (or raise NameError on the first one).
    preds = np.zeros(len(folds_feat))
    for thr in [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3]:
        folds_feat[name_feat] = 0.0
        feats_trn = np.array(GENES+CELLS)[pvalues<thr]
        if len(feats_trn)>0:
            for fold in range(CFG.num_folds):
                X_trn = folds_feat.loc[folds_feat['kfold']!=fold, feats_trn].astype(float).values
                X_fold = folds_feat.loc[folds_feat['kfold']==fold,feats_trn].astype(float).values

                # ComplementNB requires non-negative inputs, hence min-max scaling
                # fitted on the training part of the split only.
                scaler = MinMaxScaler()
                X_trn = scaler.fit_transform(X_trn)
                X_fold = scaler.transform(X_fold)

                y = folds_feat.loc[folds_feat['kfold']!=fold, col_target].astype(float).values
                model = ComplementNB()
                model.fit(X_trn, y)
                folds_feat.loc[folds_feat['kfold']==fold, name_feat] = model.predict(X_fold)

            auc = roc_auc_score(folds_feat[col_target].values, folds_feat[name_feat].values)
            lenfeats = len(feats_trn)
            res.append({'ncol':ncol, 'col':col_target, 'sumones':folds_feat[col_target].sum(), 'AUC':auc})
            if auc>best_auc:
                best_auc = auc
                best_thr = thr
                best_lenfeats = lenfeats
                preds = folds_feat[name_feat].values
    # Keep best
    auc = best_auc
    thr = best_thr
    lenfeats = best_lenfeats
    folds_feat[name_feat] = preds
    print(f'BEST {ncol+1}/{len(target_cols)}: col={col_target} thr={thr} lenfeats={lenfeats} sum_ones={folds_feat[col_target].sum()} AUC={auc}')
        
#         prob_pos = norm.pdf(values_orig, np.mean(positive), np.std(positive))
#         prob_neg = norm.pdf(values_orig, np.mean(negative), np.std(negative))
#         pvalue_probs = ks_2samp(prob_pos, prob_neg)[1]
#         df = pd.DataFrame({'target':target_orig, 'diff':prob_pos-prob_neg})
#         ax = sns.boxplot(x='target', y='diff', data=df)
#         plt.title(f'feat={col_feat} target={col_target}) n1s={np.sum(target_orig==1)} pv_negpos={pvalue_neg_pos:.6f}')
#         plt.show()
        
#         q25_pos = np.quantile(prob_pos, q=0.25)
#         q75_pos = np.quantile(prob_pos, q=0.75)
#         q25_neg = np.quantile(prob_neg, q=0.25)
#         q75_neg = np.quantile(prob_neg, q=0.75)
    
    
#     name_feat = f'bayes{col_target}'
#     folds_feat[name_feat] = 0.0
#     for fold in range(CFG.num_folds):
#         X_trn = folds_feat.loc[folds_feat['kfold']!=fold, GENES+CELLS].astype(float).values
#         X_fold = folds_feat.loc[folds_feat['kfold']==fold,GENES+CELLS].astype(float).values
#         scaler = MinMaxScaler()
        
#         X_trn = scaler.fit_transform(X_trn)
#         X_fold = scaler.transform(X_fold)
        
#         y = folds_feat.loc[folds_feat['kfold']!=fold, col_target].astype(float).values
#         model = ComplementNB()
#         model.fit(X_trn, y)
#         folds_feat.loc[folds_feat['kfold']==fold, name_feat] = model.predict(X_fold)
    
#     auc = roc_auc_score(folds_feat[col_target].values, folds_feat[name_feat].values)
#     res.append(dict({'ncol':ncol, 'col':col_target, 'sumones':folds_feat[col_target].sum(), 'AUC':auc}))
#     print(f'{ncol+1}/{len(target_cols)}: col={col_target} sum_ones={folds_feat[col_target].sum()} AUC={auc}')
    
# res = pd.DataFrame(res)
#             if auc>best_auc:
#                 best_auc = auc
#                 best_alpha = alpha
#                 preds = folds_feat.loc[f'{name_feat}_{col}']



In [None]:
from scipy.stats import norm
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
# Out-of-fold GaussianNB meta-feature: one model per (target, fold) trained on
# all GENES+CELLS columns; predictions are stored in column `bayes<target>` and
# the per-target AUC is collected in `res`.
folds_feat = folds.copy()
test_noctl_feat = test_noctl.copy()
res = []
for ncol, col_target in enumerate(target_cols):
    name_feat = f'bayes{col_target}'
    folds_feat[name_feat] = 0.0
    for fold in range(CFG.num_folds):
        X_trn = folds_feat.loc[folds_feat['kfold']!=fold, GENES+CELLS].astype(float).values
        X_fold = folds_feat.loc[folds_feat['kfold']==fold,GENES+CELLS].astype(float).values
        scaler = StandardScaler()

        X_trn = scaler.fit_transform(X_trn)
        # Fix: the held-out fold must go through the scaler fitted on the
        # training part. The original transformed an undefined `X_test` and
        # predicted on the unscaled `X_fold`.
        X_fold = scaler.transform(X_fold)

        y = folds_feat.loc[folds_feat['kfold']!=fold, col_target].astype(float).values
        model = GaussianNB()
        model.fit(X_trn, y)
        folds_feat.loc[folds_feat['kfold']==fold, name_feat] = model.predict(X_fold)

    auc = roc_auc_score(folds_feat[col_target].values, folds_feat[name_feat].values)
    res.append({'ncol':ncol, 'col':col_target, 'sumones':folds_feat[col_target].sum(), 'AUC':auc})
    print(f'{ncol+1}/{len(target_cols)}: col={col_target} sum_ones={folds_feat[col_target].sum()} AUC={auc}')

res = pd.DataFrame(res)
#             if auc>best_auc:
#                 best_auc = auc
#                 best_alpha = alpha
#                 preds = folds_feat.loc[f'{name_feat}_{col}']


        
        
        
        

In [None]:
from scipy.stats import norm
# "diffprob" features: for every (feature, target) pair fit one Gaussian to the
# feature values of positive samples and one to negative samples (values <= -4
# are excluded from the fit), then store pdf_pos(x) - pdf_neg(x).
# Train rows are filled out-of-fold; test rows use Gaussians fitted on all folds.
folds_feat = folds.copy()
test_noctl_feat = test_noctl.copy()
for col1 in tqdm(CELLS+GENES):
    # The original computed the Gaussians once here using `col2` before the
    # inner loop defined it (NameError on a fresh kernel, stale value otherwise);
    # the results were overwritten unused, so that block is removed.
    for col2 in target_cols:
        name_feat = f'diffprob_{col1}_{col2}'
        folds_feat[name_feat] = 0.0
        for fold in range(CFG.num_folds):
            X_trn = folds_feat.loc[folds_feat['kfold']!=fold, [col1, col2]]
            X_fold = folds_feat.loc[folds_feat['kfold']==fold, [col1, col2]]
            values_orig = X_trn[col1].values
            target_orig = X_trn[col2].values

            target = target_orig[values_orig> -4]
            values = values_orig[values_orig> -4]
            positive = values[target==1]
            negative = values[target==0]

            values_fold = X_fold[col1].values
            prob_pos = norm.pdf(values_fold, np.mean(positive), np.std(positive))
            prob_neg = norm.pdf(values_fold, np.mean(negative), np.std(negative))
            folds_feat.loc[folds_feat['kfold']==fold, name_feat] = prob_pos-prob_neg

        # For test
        X_trn = folds_feat[[col1, col2]]
        X_test = test_noctl_feat[[col1]]
        values_orig = X_trn[col1].values
        target_orig = X_trn[col2].values
        target = target_orig[values_orig> -4]
        values = values_orig[values_orig> -4]
        positive = values[target==1]
        negative = values[target==0]

        values_test = X_test[col1].values
        prob_pos = norm.pdf(values_test, np.mean(positive), np.std(positive))
        prob_neg = norm.pdf(values_test, np.mean(negative), np.std(negative))
        test_noctl_feat[name_feat] = prob_pos-prob_neg

In [None]:
def prob_reordered(train, validation, by = ['product_id'], weight=10, target_col='deal_probability'):
    """Add a smoothed per-group target mean to `validation`.

    For every group defined by the `by` columns the encoded value is
    ``(sum + global_mean * weight) / (count + weight)``, computed on `train`.

    Parameters
    ----------
    train : DataFrame containing the `by` columns and `target_col`.
    validation : DataFrame containing the `by` columns; it is not modified.
    by : list of column names to group on (never mutated here).
    weight : number of pseudo-observations of the global mean added to each group.
    target_col : name of the target column in `train`.

    Returns
    -------
    A new DataFrame: `validation` left-merged with a column named
    ``'prob_' + '_'.join(by)``. Groups absent from `train` get the global mean.
    """
    target_mean = train[target_col].mean()
    stats = train.groupby(by)[target_col].agg(['sum', 'count'])
    prob_col = 'prob_' + '_'.join(by)
    prob = ((stats['sum'] + target_mean*weight) / (stats['count'] + weight)).rename(prob_col).reset_index()
    validation = pd.merge(validation, prob, on=by, how='left').reset_index(drop=True)
    # Only the encoded column is back-filled; the original filled NaNs in every
    # column of `validation` with the target mean.
    validation[prob_col] = validation[prob_col].fillna(target_mean)
    return validation


# Scratch inputs for the smoothed-probability experiment: one CELLS feature and
# one target, split into fold 0 (validation) and the remaining folds (training).
col1 = CELLS[2]
col2 = target_cols[2]
df_train = folds.loc[folds['kfold']!=0,[col1,col2]].reset_index(drop=True)
df_val = folds.loc[folds['kfold']==0,[col1,col2]].reset_index(drop=True)
# Prior mean of the selected target on the training split. The original read
# `train['deal_probability']`, a frame/column that does not belong to this dataset.
# NOTE(review): confirm this is the intended replacement.
target_mean = df_train[col2].mean()

df_train.head()

In [None]:
# Number of positive labels per target, rarest targets first.
train_noctl.loc[:, target_cols].sum(axis=0).sort_values(ascending=True)

In [None]:

# Density of N(mean=-10, sd=5) evaluated on a 100-point grid over [-4, 4].
x = np.linspace(start=-4, stop=4, num=100)
norm.pdf(x, loc=-10, scale=5)

In [None]:
# Plots the values of `x` against their position; `x` must already exist in the
# kernel (it is assigned in the preceding cell).
# NOTE(review): this draws the grid itself, not a density — confirm that is intended.
plt.plot(x)

In [None]:
# Count of zero labels in `target`; relies on `target` left in the kernel by an
# earlier cell.
(target == 0).sum()

In [None]:
from scipy.stats import norm
def mean_sd_with_weigths(values, by, weigth=5):
    """Return the mean and standard deviation of ``values[by]``.

    Parameters
    ----------
    values : array-like supporting boolean/index selection and ``np.mean``/``np.std``.
    by : mask or index selecting the subset of `values` to summarise.
    weigth : kept for interface compatibility; currently unused. The original
        body also computed a prior-weighted (shrunk) mean/std with this value
        but discarded it and returned the plain statistics, so that dead
        computation was removed.

    Returns
    -------
    (mean, std) of the selected subset.
    """
    selected = values[by]
    return np.mean(selected), np.std(selected)


def prob_norm(val, mean, std):
    """Density of a normal distribution N(mean, std**2) evaluated at `val`."""
    z = (val-mean)/std
    normaliser = std*np.sqrt(2*np.pi)
    return np.exp(-0.5*z**2)/normaliser

# Visual check for the first target: for each of the first 100 CELLS features,
# fit one Gaussian to positives and one to negatives (values <= -4 are excluded
# from the fit) and box-plot pdf_pos - pdf_neg grouped by the true label.
# Earlier KDE / fitted-pdf overlay experiments that were commented out in this
# loop are omitted here.
for col2 in target_cols[:1]:
    for col1 in CELLS[:100]:
        values_orig = train_noctl[col1].values
        target_orig = train_noctl[col2].values
        in_range = values_orig > -4
        target, values = target_orig[in_range], values_orig[in_range]
        positive = values[target==1]
        negative = values[target==0]

        prob_pos = norm.pdf(values_orig, loc=np.mean(positive), scale=np.std(positive))
        prob_neg = norm.pdf(values_orig, loc=np.mean(negative), scale=np.std(negative))

        df = pd.DataFrame({'target':target_orig, 'diff':prob_pos-prob_neg})
        ax = sns.boxplot(x='target', y='diff', data=df)
        plt.title(f'{col1} {col2} num_1s={np.sum(target==1)}')
        plt.show()
        


In [None]:
# For the first CELLS feature, compare the distribution of the feature among
# positive and negative samples of each of the first 100 targets.
# Values <= -4 are dropped before estimating the densities; the means shown in
# the title are computed on the unfiltered values.
for col1 in CELLS[:1]:
    for col2 in target_cols[:100]:
        positive = train_noctl.loc[train_noctl[col2]>0.50,col1].values
        negative = train_noctl.loc[train_noctl[col2]<0.50,col1].values
        positive = positive[positive> -4]
        negative = negative[negative> -4]

        sns.kdeplot(positive, label=f'{col2}_1s',cumulative=False)
        sns.kdeplot(negative, label=f'{col2}_0s',cumulative=False)

        # Fix: the "0s" count used `> 0.50` and therefore repeated the number
        # of positives; it now counts the negatives.
        plt.title(f'{col1} 1s({(train_noctl[col2]>0.50).sum()}) \
        {(train_noctl.loc[train_noctl[col2]>0.50,col1]).mean():.3f} \
        0s({(train_noctl[col2]<0.50).sum()})\
        {(train_noctl.loc[train_noctl[col2]<0.50,col1]).mean():.3f}')
        plt.show()
        
        
#         x = sns.kdeplot(train_ctl[col], label='trn_ctl',cumulative=False)
#         sns.kdeplot(train_noctl[col], label='trn_noctl',cumulative=False)
#         sns.kdeplot(test_ctl[col], label='tst_ctl',cumulative=False)
#         sns.kdeplot(test_noctl[col], label='tst_noctl',cumulative=False)
#     plt.show()

### Features Basic Statistics

In [None]:
# Row-wise summary statistics per feature group, added to both train (`folds`)
# and test (`test_noctl`) as columns named '<group>_<stat>'.
# Fix: the group labels were swapped ('G' was paired with CELLS and 'C' with
# GENES); they now follow the 'G' = GENES / 'C' = CELLS convention used by the
# PCA feature names.
for (cetiq, cols) in [['G',GENES],['C', CELLS]]:
    for (fetiq, func) in [['mean', np.mean]]: #,['min', np.min], ['max', np.max]]:
        name = f'{cetiq}_{fetiq}'
        print(name)
        folds[name] = folds[cols].apply(lambda row: func(row), axis=1).values
        test_noctl[name] = test_noctl[cols].apply(lambda row: func(row), axis=1).values
print(folds.shape, test_noctl.shape)

# NO stats BEST: pca_comp_genes=30 pca_comp_cells=18 Local=0.014694 LB=0.01872
# Mean Local=0.014702



### PCA Features

In [None]:
# Adds PCA components of the GENES and CELLS groups as new columns
# ('pca_G-<i>' / 'pca_C-<i>') to `folds` and `test_noctl`.
# Train rows are transformed out-of-fold: for each fold a PCA is fitted on the
# other folds and applied to the held-out rows. Test rows use a PCA fitted on
# all of `folds`.
# NOTE(review): this cell is not idempotent — it extends `numeric_cols` in place
# and merges new columns into `folds`, so re-running it duplicates both.
# NOTE(review): each fold gets an independently fitted PCA; confirm that the
# components are comparable across folds and with the test-time PCA.
if CFG.use_pca:
    etiq = ['G','C']
    num_pca = [CFG.pca_comp_genes, CFG.pca_comp_cells]
    for niter, cols in enumerate([GENES, CELLS]):
        # PCA for train with folds
        train_pca = []
        train_pca_sig_id = []
        num_comp = num_pca[niter]
        columns_pca = [f'pca_{etiq[niter]}-{i}' for i in range(num_comp)]
        numeric_cols += columns_pca
        for fold in tqdm(range(CFG.num_folds)):
            pca = PCA(n_components=num_comp, random_state=42).fit(folds.loc[folds['kfold']!=fold, cols])
            train_pca.append(pca.transform(folds.loc[folds['kfold']==fold, cols]))
            # sig_ids are collected in the same fold order so they line up with
            # the concatenated components below.
            train_pca_sig_id.append(folds.loc[folds['kfold']==fold, 'sig_id'])
        train_pca = np.concatenate(train_pca)
        train_pca = pd.DataFrame(train_pca, columns=columns_pca)
        train_pca['sig_id'] = np.concatenate(train_pca_sig_id)
        # Join back on sig_id because the concatenated rows are ordered by fold.
        folds = pd.merge(folds, train_pca, on='sig_id')

        # PCA for test
        pca = PCA(n_components=num_comp, random_state=42).fit(folds[cols])
        test_pca = pca.transform(test_noctl[cols])
        test_pca = pd.DataFrame(test_pca, columns=columns_pca)
        # Column-wise concat aligns on the index: assumes `test_noctl` has a
        # default 0..n-1 index — TODO confirm.
        test_noctl = pd.concat((test_noctl, test_pca), axis=1)
        print(folds.shape, test_noctl.shape)
        
#     data2 = (PCA(n_components=CFG.pca_comp_genes, random_state=42).fit_transform(data[GENES]))
#     train2 = data2[:train_features.shape[0]]; test2 = data2[-test_features.shape[0]:]

#     train2 = pd.DataFrame(train2, columns=[f'pca_G-{i}' for i in range(CFG.pca_comp_genes)])
#     test2 = pd.DataFrame(test2, columns=[f'pca_G-{i}' for i in range(CFG.pca_comp_genes)])

#     train_features = pd.concat((train_features, train2), axis=1)
#     test_features = pd.concat((test_features, test2), axis=1)
#     print(train_features.shape, test_features.shape)
    
#     #CELLS
#     data = pd.concat([pd.DataFrame(train_features[CELLS]), pd.DataFrame(test_features[CELLS])])
#     data2 = (PCA(n_components=CFG.pca_comp_cells, random_state=42).fit_transform(data[CELLS]))
#     train2 = data2[:train_features.shape[0]]; test2 = data2[-test_features.shape[0]:]

#     train2 = pd.DataFrame(train2, columns=[f'pca_C-{i}' for i in range(CFG.pca_comp_cells)])
#     test2 = pd.DataFrame(test2, columns=[f'pca_C-{i}' for i in range(CFG.pca_comp_cells)])
    
#     train_features = pd.concat((train_features, train2), axis=1)
#     test_features = pd.concat((test_features, test2), axis=1)
#     print(train_features.shape, test_features.shape)
    

### Feature Selection using Variance Threshold


In [None]:
# Optional feature selection: keep only the numeric columns accepted by a
# VarianceThreshold (threshold CFG.variance_thres) fitted on the training frame,
# and restrict both frames to id/meta columns + selected features (+ targets
# for train).
if CFG.variance_enc:
    var_thresh = VarianceThreshold(threshold=CFG.variance_thres)
    var_thresh.fit(folds[numeric_cols])
    selected_cols = [numeric_cols[pos] for pos in var_thresh.get_support(indices=True)]

    train_keep = ['sig_id','cp_time','cp_dose','kfold'] + selected_cols + target_cols
    test_keep = ['sig_id','cp_time','cp_dose'] + selected_cols
    folds = folds[train_keep]
    test_noctl = test_noctl[test_keep]
    print(folds.shape, test_noctl.shape)

### Training Functions

In [None]:
class MoADataset:
    """Map-style dataset pairing a 2-D feature array with a 2-D target array.

    Each item is a dict with float tensors ``'x'`` (features of row `idx`) and
    ``'y'`` (targets of row `idx`).
    """

    def __init__(self, features, targets):
        self.features, self.targets = features, targets

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        row_x = self.features[idx, :]
        row_y = self.targets[idx, :]
        return {
            'x': torch.tensor(row_x, dtype=torch.float),
            'y': torch.tensor(row_y, dtype=torch.float),
        }
    
class TestDataset:
    """Map-style dataset over a 2-D feature array (no targets).

    Each item is a dict with a single float tensor ``'x'`` holding row `idx`.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    def __getitem__(self, idx):
        row_x = self.features[idx, :]
        return {'x': torch.tensor(row_x, dtype=torch.float)}
    

In [None]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Run one training epoch and return the mean batch loss.

    For every batch (a dict with tensors ``'x'`` and ``'y'``) the gradients are
    reset, the loss is back-propagated, and both the optimizer and the
    scheduler are stepped (the scheduler advances once per batch).
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        optimizer.zero_grad()
        features = batch['x'].to(device)
        labels = batch['y'].to(device)
        loss = loss_fn(model(features), labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)


def valid_fn(model, loss_fn, dataloader, device):
    """Evaluate the model on a labelled dataloader.

    Parameters
    ----------
    model : torch module producing logits.
    loss_fn : callable ``(outputs, targets) -> scalar tensor``.
    dataloader : iterable of dicts with tensors ``'x'`` and ``'y'``.
    device : device the batches are moved to.

    Returns
    -------
    (mean batch loss, numpy array of sigmoid probabilities for all rows,
    concatenated in dataloader order).
    """
    model.eval()
    final_loss = 0
    valid_preds = []

    # No gradients are needed for validation; the original ran the forward
    # pass with autograd enabled, building a graph for every batch.
    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['x'].to(device), data['y'].to(device)
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)

            final_loss += loss.item()
            valid_preds.append(outputs.sigmoid().detach().cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)

    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    """Return sigmoid probabilities for every row served by `dataloader`.

    Batches are dicts with a tensor ``'x'``; the forward pass runs in eval mode
    without gradient tracking and the per-batch results are concatenated in
    dataloader order into one numpy array.
    """
    model.eval()
    with torch.no_grad():
        batch_probs = [
            model(batch['x'].to(device)).sigmoid().cpu().numpy()
            for batch in dataloader
        ]
    return np.concatenate(batch_probs)

### Model

In [None]:
class Model(nn.Module):
    """Three-layer fully connected network that outputs one logit per target.

    Every layer is preceded by batch normalisation and dropout (0.2 before the
    first layer, 0.5 before the others); the linear layers use weight
    normalisation and the two hidden layers use ReLU.
    """

    def __init__(self, num_features, num_targets, hidden_size):
        super().__init__()
        # Attribute names and creation order are kept as-is: they define the
        # state_dict keys of saved checkpoints and the order in which the
        # random initialisation is drawn.
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(p=0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))

        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dropout2 = nn.Dropout(p=0.5)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size))

        self.batch_norm3 = nn.BatchNorm1d(hidden_size)
        self.dropout3 = nn.Dropout(p=0.5)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size, num_targets))

    def forward(self, x):
        hidden = F.relu(self.dense1(self.dropout1(self.batch_norm1(x))))
        hidden = F.relu(self.dense2(self.dropout2(self.batch_norm2(hidden))))
        # Raw logits: the sigmoid is applied by the loss / the prediction helpers.
        return self.dense3(self.dropout3(self.batch_norm3(hidden)))

### Preprocessing Steps

In [None]:
def process_data(data):
    """Return a copy of `data` with 'cp_time' and 'cp_dose' one-hot encoded.

    The input frame is left untouched. Earlier variants of this step (ordinal
    mapping of the two columns, z-score normalisation of GENES/CELLS and
    square-root skewness removal) were disabled and are not applied.
    """
    categorical = ['cp_time','cp_dose']
    encoded = pd.get_dummies(data, columns=categorical)
    return encoded

In [None]:
# Model inputs: every column of the processed training frame that is neither a
# target nor the fold / id bookkeeping columns.
feature_cols = [
    c for c in process_data(folds).columns
    if c not in target_cols and c not in ['kfold','sig_id']
]
len(feature_cols)

### Train

In [None]:
# HyperParameters

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 25
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
# Must match the number of folds in the `kfold` column. The fold-wise feature
# cells iterate over CFG.num_folds, so a separate hard-coded value (previously
# 5 while CFG.num_folds is 10) would leave the remaining folds without
# out-of-fold predictions.
NFOLDS = CFG.num_folds
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False

num_features=len(feature_cols)
num_targets=len(target_cols)
hidden_size=1024

In [None]:
def run_training(fold, seed, display=2):
    """Train one model with `fold` held out and predict the test set.

    Uses the notebook globals `folds`, `test_noctl`, `feature_cols`,
    `target_cols`, `output_dir` and the hyper-parameter constants.

    Parameters
    ----------
    fold : value of the 'kfold' column used as validation split.
    seed : random seed passed to `seed_everything`; also part of output file names.
    display : 2 prints one line per epoch, 1 prints only the last epoch.

    Returns
    -------
    oof : array (len(folds), len(target_cols)), zero everywhere except the rows
        of the validation fold, which hold the predictions of the best epoch.
    predictions : test-set probabilities from the best checkpoint.

    Side effects: writes the best checkpoint, a per-epoch results CSV and the
    loss / learning-rate plots into `output_dir`.
    """
    seed_everything(seed)

    train = process_data(folds)
    test_ = process_data(test_noctl)

    # Row positions of the validation fold. Positions (not index labels) are
    # used so that writing into `oof` is correct for any index on `folds`.
    is_valid = (train['kfold'] == fold).values
    val_idx = np.flatnonzero(is_valid)

    train_df = train[~is_valid].reset_index(drop=True)
    valid_df = train[is_valid].reset_index(drop=True)

    x_train, y_train  = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid =  valid_df[feature_cols].values, valid_df[target_cols].values

    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )

    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                              max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))

    loss_fn = nn.BCEWithLogitsLoss()

    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0

    model_path = output_dir / f'model_seed{seed}_fold{fold}_.pth'

    # Sized from `target_cols` (as in run_k_fold) instead of an unrelated
    # `target` variable leaked from earlier cells.
    oof = np.zeros((len(train), len(target_cols)))
    best_loss = np.inf
    best_epoch = 0
    res = []
    for epoch in range(EPOCHS):

        train_loss = train_fn(model, optimizer,scheduler, loss_fn, trainloader, DEVICE)

        valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)

        if valid_loss < best_loss:
            best_loss = valid_loss
            best_epoch = epoch
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(), model_path)
            # Patience counts consecutive epochs without improvement.
            early_step = 0
        elif(EARLY_STOP == True):
            early_step += 1
            if (early_step >= early_stopping_steps):
                break
        lr = scheduler.get_last_lr()

        # Save results
        # ------------
        res.append({'epoch':epoch, 'lr':lr[0], 'trn_loss':train_loss, 'val_loss':valid_loss, 'best_epoch':best_epoch, 'best_loss':best_loss})

        res_df = pd.DataFrame(res)
        res_df.to_csv(output_dir / f'res_seed{seed}_fold{fold}_.csv')

        # Plots are rewritten every epoch so progress can be inspected on disk
        # while training is still running.
        fig, ax = plt.subplots(nrows=1, ncols=1 )
        min_val = res_df[['trn_loss','val_loss']].min().min()
        ax.plot(res_df['trn_loss'])
        ax.plot(res_df['val_loss'])
        plt.ylim((min_val,0.020))
        plt.title(f"logloss in fold={fold} min={res_df['val_loss'].min()}")
        fig.savefig(output_dir / f'loss_seed{seed}_fold{fold}_.png')
        plt.close(fig)

        fig, ax = plt.subplots(nrows=1, ncols=1 )
        ax.plot(res_df['lr'])
        plt.title(f"lr in fold={fold}")
        fig.savefig(output_dir / f'lr_seed{seed}_fold{fold}_.png')
        plt.close(fig)

        if display==2:
            print(f"SEED:{seed} FOLD:{fold}, EPOCH:{epoch:2d}, lr:{lr[0]:.9f} train_loss:{train_loss:.6f}, valid_loss:{valid_loss:.6f}, best_epoch:{best_epoch}, best_loss:{best_loss:.6f}")

    if display==1:
        print(f"SEED:{seed} FOLD:{fold}, EPOCH:{epoch:2d}, lr:{lr[0]:.9f} train_loss:{train_loss:.6f}, valid_loss:{valid_loss:.6f}, best_epoch:{best_epoch}, best_loss:{best_loss:.6f}")


    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )

    # map_location lets a checkpoint saved on GPU be reloaded on a CPU-only kernel.
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    predictions = inference_fn(model, testloader, DEVICE)

    return oof, predictions


In [None]:
def run_k_fold(NFOLDS, seed, display=2):
    """Train one model per fold for a single seed.

    Returns the summed out-of-fold predictions (each call to `run_training`
    fills only its own validation rows) and the test predictions averaged over
    the `NFOLDS` models.
    """
    n_targets = len(target_cols)
    oof = np.zeros((len(folds), n_targets))
    predictions = np.zeros((len(test_noctl), n_targets))

    for fold_id in range(NFOLDS):
        fold_oof, fold_pred = run_training(fold_id, seed, display)
        oof += fold_oof
        predictions += fold_pred / NFOLDS

    return oof, predictions

In [None]:
# Averaging on multiple SEEDS
SEED = [0, 1, 2, 3 ,4, 5]
oof = np.zeros((len(folds), len(target_cols)))
predictions = np.zeros((len(test_noctl), len(target_cols)))

for seed in SEED:
    oof_, predictions_ = run_k_fold(NFOLDS, seed, display=1)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)

# FINAL CV LOGLOSS
# NOTE(review): this overwrites the true labels stored in `folds` with the
# out-of-fold predictions, so the training cells must not be re-run afterwards
# without rebuilding `folds`.
folds[target_cols] = oof
for col in target_cols:
    test_noctl[col] = 0.0
test_noctl[target_cols] = predictions

# Rows of train_targets_scored without an out-of-fold prediction are scored
# with a prediction of 0.
valid_results = train_targets_scored.drop(columns=target_cols).merge(folds[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
y_true = train_targets_scored[target_cols].values
y_pred = valid_results[target_cols].values

# Mean column-wise log loss. The average is taken over the scored targets;
# the original divided by `target.shape[1]`, an unrelated variable leaked from
# earlier cells.
score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / len(target_cols)
print(f"FINAL CV logloss: {score:.6f}")

In [None]:
# my_submission = pd.read_csv('../input/firstpison26sep/submission0.014740.csv') #LB=0.01875 PCA + removing low std
#my_submission = pd.read_csv('../input/pison26sepv2/submission0.014731.csv') #LB=0.01874 PCA
# Repeat the cross-validation score next to the reference submissions above.
cv_message = f"FINAL CV logloss: {score:.6f}"
print(cv_message)

### Create Submission

In [None]:
# Build the submission in sample_submission row order: predictions are joined
# on sig_id, and ids without a prediction in test_noctl are filled with 0.0.
# The file name embeds the CV score.
submission = (
    sample_submission
    .drop(columns=target_cols)
    .merge(test_noctl[['sig_id']+target_cols], on='sig_id', how='left')
    .fillna(0.0)
    .reset_index(drop=True)
)
name_sub = output_dir / f'submission{score:.6f}.csv'
submission.to_csv(name_sub, index=False)
print(name_sub)

In [None]:
# Preview the first rows of the submission file.
submission.head(5)