In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
import pickle

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from IPython.display import clear_output

In [5]:
# Phenotype and genotype tables, read from files in the parent directory.
pheno = pd.read_csv("../feno.txt")
geno = pd.read_csv("../geno.txt")

# The twelve phenotype columns analysed in this notebook (order matters:
# later cells walk these columns positionally).
pheno_names = [
    "Cadmium_Chloride",
    "Congo_red",
    "Cycloheximide",
    "Diamide",
    "Ethanol",
    "Hydroquinone",
    "Lithium_Chloride",
    "Maltose",
    "Neomycin",
    "Tunicamycin",
    "Galactose",
    "YNB:ph3",
]

# Restrict the phenotype table to the selected columns.
pheno_12 = pheno[pheno_names]

In [6]:
# Load every pickled result stored under ../params/gbm into `results`.
# Later cells read `results[i][1]` as the hyper-parameter dict for the i-th
# phenotype, so the order of this list matters.
#
# NOTE(security): pickle.load can execute arbitrary code; only point this at
# files produced by a trusted source.
params_dir = '../params/gbm'
results = []
# os.listdir returns names in arbitrary, filesystem-dependent order; sorting
# makes the order of `results` deterministic across machines and runs.
# NOTE(review): confirm that the sorted file order matches the column order of
# the phenotype table that indexes into `results`.
for filename in sorted(os.listdir(params_dir)):
    if filename.endswith('pickle'):
        with open(os.path.join(params_dir, filename), 'rb') as f:
            results.append(pickle.load(f))

In [13]:
def add_noise(feno, ratio):
    """Return a noisy copy of a phenotype vector as a NumPy array.

    ``int(ratio * n)`` randomly chosen entries (``n`` being the length along
    the first axis) are shifted by two standard deviations of the whole
    input, each one up or down with equal probability.

    Parameters
    ----------
    feno : pandas.Series or array-like
        Values to perturb. The object passed in is never modified.
    ratio : float
        Fraction of entries to perturb.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``feno`` containing the perturbed
        values.
    """
    # Work on an explicit float copy: ``Series.to_numpy()`` may hand back a
    # view of the Series' own buffer, so writing into it would silently
    # corrupt the caller's data (or fail on a read-only array), and an
    # integer buffer would truncate the added shift.
    feno_n = np.array(feno, dtype=float)
    n_total = feno_n.shape[0]
    shift = 2 * np.std(feno_n)

    # Random subset of positions that receive noise.
    noisy_idx = np.random.permutation(n_total)[:int(ratio * n_total)]
    for n in noisy_idx:
        # Fair coin flip decides the direction of the shift.
        if np.random.choice([0, 1]):
            feno_n[n] = feno_n[n] + shift
        else:
            feno_n[n] = feno_n[n] - shift

    return feno_n

In [80]:
def delete_markers(geno, ratio):
    """Randomly remove a fraction of the marker columns from a genotype table.

    The ``"Unnamed: 0"`` column is dropped first; of the remaining columns,
    ``int(n_columns * ratio)`` are picked at random and removed. A short
    summary of how many columns were removed is printed.

    Parameters
    ----------
    geno : pandas.DataFrame
        Genotype table containing an ``"Unnamed: 0"`` column. Not modified.
    ratio : float
        Fraction of marker columns to remove.

    Returns
    -------
    numpy.ndarray
        2-D array holding the surviving columns in their original order.
    """
    markers = geno.drop(columns = ["Unnamed: 0"]).values
    n_markers = markers.shape[1]

    # Column positions to discard, drawn without replacement.
    dropped = np.random.permutation(n_markers)[:int(n_markers*ratio)]
    print('{} of {} markers deleted.'.format(dropped.shape[0], n_markers))

    # np.delete builds a new array, so `markers` itself is left untouched.
    return np.delete(markers, dropped, axis = 1)

In [86]:
# Marker-deletion experiment.
# For every phenotype column, and for every deletion ratio, a gradient
# boosting regressor is trained M times on a random subset of the markers
# (with a fresh random 70/30 train/test split each time). The mean test R^2
# over the M repetitions is stored per ratio and the resulting list is pickled
# to 'r2_gbm_del_<phenotype>.pickle'.
#
# NOTE: no random seed is set in this cell, so marker subsets, splits and the
# fitted models differ between runs.
M = 4  # repetitions averaged for each deletion ratio
noise_ratios = np.array([5, 10, 20, 30, 40, 50, 75, 90])*0.01  # not used in this cell
# Alternative, coarser grid: np.array([10, 25, 50, 60, 70, 80, 90, 95, 99])*0.01
del_ratios = np.arange(5, 99, step = 3)*0.01

# Hyper-parameters copied from the stored result into the regressor.
gbm_param_keys = ('n_estimators', 'min_samples_split', 'min_samples_leaf',
                  'max_features', 'max_depth', 'loss', 'learning_rate')

# DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent.
for i, (name, y) in enumerate(pheno_12.items()):

    print('Analyzing fenotype: ' + name + '.')
    # The i-th stored result is used for the i-th phenotype column.
    # NOTE(review): this relies on `results` being ordered like pheno_12's
    # columns — confirm against the cell that builds `results`.
    params = results[i][1]
    gbm_kwargs = {key: params[key] for key in gbm_param_keys}

    # Drop samples whose phenotype is missing from both target and genotypes.
    # Assumes pheno and geno share the same row index — TODO confirm.
    missing_phenos = y[ y.isnull() ].index.values
    y_c = y.drop(missing_phenos, axis = 0)
    geno_c = geno.drop(missing_phenos, axis = 0)  # loop-invariant, done once

    r2s = []
    for (j, del_ratio) in enumerate(del_ratios):
        r2s_n = []
        for _ in range(M):
            geno_n = delete_markers(geno_c, del_ratio)
            X_train, X_test, y_train, y_test = train_test_split(geno_n, y_c, test_size=0.3)

            # Standardise the target with statistics of the training split
            # only. The previous code standardised the full vector before
            # splitting (flagged by the author as wrong) and repeated it on
            # every iteration.
            y_mean = np.mean(y_train)
            y_std = np.std(y_train)
            y_train_std = (y_train - y_mean) / y_std
            y_test_std = (y_test - y_mean) / y_std

            gbm = GradientBoostingRegressor(subsample = 1.0, **gbm_kwargs)
            gbm.fit(X_train, y_train_std)

            gbm_predictions = gbm.predict(X_test)
            r2 = r2_score(y_test_std, gbm_predictions)

            r2s_n.append(r2)

        # Average R^2 over the M repetitions for this deletion ratio.
        r2s.append(np.mean(np.array(r2s_n)))
        print('Iteration {} of {} complete.'.format(j+1, del_ratios.shape[0]))

    # Wipe the progress output before moving on to the next phenotype.
    clear_output()
    with open('r2_gbm_del_{}.pickle'.format(name), 'wb') as f:
        pickle.dump(r2s, f)

In [75]:
# Small 3x4 integer matrix (values 1..12, row-major) used as a scratch example.
a = np.arange(1, 13).reshape(3, 4)


In [63]:
# Show the current contents of `a`.
a

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

In [76]:
# Remove the columns at positions 2 and 3 and rebind `a` to the result.
# NOTE: not idempotent — `a` is overwritten, so run this once after (re)creating `a`.
a = np.delete(a, [2, 3], axis=1)

In [77]:
# Show `a` after the column removal.
a

array([[ 1,  2],
       [ 5,  6],
       [ 9, 10]])