In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV, LassoCV, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

In [2]:
import os.path

# Absolute path of the current working directory; every data path below is
# anchored here.
my_path = os.path.abspath(os.curdir)

# Expected locations of the phenotype and genotype tables.
# NOTE(review): the loading cell reads 'feno.txt' / 'geno.txt' by bare file
# name rather than through these two paths -- confirm which one is intended.
path_pheno = os.path.join(my_path, "grinberg_et_al_data/yeast/feno.txt")
path_geno = os.path.join(my_path, "grinberg_et_al_data/yeast/geno.txt")

In [4]:
# Phenotype and genotype tables, read by bare file name from the working
# directory.
pheno = pd.read_csv('feno.txt')
geno = pd.read_csv('geno.txt')

# Columns of `pheno` that are modelled, one regression per trait.
traits = [
    "Cadmium_Chloride",
    "Congo_red",
    "Cycloheximide",
    "Diamide",
    "Ethanol",
    "Hydroquinone",
    "Lithium_Chloride",
    "Maltose",
    "Neomycin",
    "Tunicamycin",
    "Galactose",
    "YNB:ph3",
]


In [8]:
def clean_data(X, Y):
    """
    Drop the samples whose phenotype is missing from both tables.

    Parameters
    ----------
    X : pandas.DataFrame
        Genotype table, one row per sample, indexed like ``Y``. An
        ``"Unnamed: 0"`` column, if present, is discarded.
    Y : pandas.Series
        Phenotype values for a single trait; NaN marks a missing measurement.

    Returns
    -------
    X, Y : numpy.ndarray
        Genotype matrix and phenotype vector with the missing-phenotype rows
        removed, still aligned row by row.
    """
    # Index labels of the samples that have no measurement for this trait.
    missing_phenos = Y[Y.isnull()].index.values

    # Work on the frame that was passed in, not on a notebook-level global,
    # so the function gives the right answer for whatever the caller supplies.
    # errors="ignore" lets a frame that has already lost the column through.
    X = X.drop(columns=["Unnamed: 0"], errors="ignore")
    X = X.drop(missing_phenos, axis=0).values
    Y = Y.drop(missing_phenos, axis=0).values
    return X, Y
    
def split_n_scale(X, Y, ts = 0.15, random_state = None):
    """
    Split the data into train and test sets and standardize Y.

    Y is centred and scaled with the mean and standard deviation of the
    training targets only; the same two numbers are then applied to the test
    targets, so no test-set information enters the scaling. X is returned
    unscaled.

    Parameters
    ----------
    X, Y : array-like
        Features and targets with the same number of rows.
    ts : float
        Fraction of the samples held out for testing.
    random_state : int, RandomState instance or None
        Forwarded to ``train_test_split``. Pass an int to make the split
        reproducible; the default ``None`` keeps the original unseeded
        behaviour.

    Returns
    -------
    X_train, X_test, Y_train_std, Y_test_std
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size = ts, random_state = random_state
    )

    # Scaling statistics come from the training targets alone.
    y_mean = np.mean(Y_train)
    y_std = np.std(Y_train)

    Y_train_std = (Y_train - y_mean) / y_std
    Y_test_std = (Y_test - y_mean) / y_std

    return X_train, X_test, Y_train_std, Y_test_std

def train_n_test_lasso(X_train, X_test, Y_train_std, Y_test_std, alpha = 1000):
    """
    Fit a Lasso regression on the training split and score it on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices of the two splits.
    Y_train_std, Y_test_std : array-like
        Standardized targets of the two splits.
    alpha : float
        L1 penalty strength handed to ``Lasso``; defaults to the value that
        used to be hard-coded.
        NOTE(review): the R2 values printed further down the notebook are all
        marginally below zero, which is what a constant predictor would
        score -- this alpha may be shrinking every coefficient to zero; confirm.

    Returns
    -------
    Mse : float
        Mean squared error on the test split.
    R2 : float
        Coefficient of determination on the test split.
    0 : int
        Placeholder in the slot where ``train_n_test_ridge`` returns the alpha
        it selected, so both functions return a 3-tuple.
    """
    # Fit and score on the arguments themselves; the parameter names are
    # capitalised, so lowercase spellings would resolve to unrelated globals.
    model = Lasso(alpha = alpha).fit(X_train, Y_train_std)
    y_pred = model.predict(X_test)
    Mse = mean_squared_error(Y_test_std, y_pred)
    R2 = r2_score(Y_test_std, y_pred)
    return Mse, R2, 0

def train_n_test_ridge(X_train, X_test, Y_train_std, Y_test_std,
                       alphas = (1500, 1650, 1750, 2000, 2250, 2500, 2750, 3000, 3500),
                       cv = 3):
    """
    Fit a cross-validated ridge regression and score it on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices of the two splits.
    Y_train_std, Y_test_std : array-like
        Standardized targets of the two splits.
    alphas : sequence of float
        Candidate penalty strengths tried by ``RidgeCV``; defaults to the grid
        that used to be hard-coded.
    cv : int
        Number of cross-validation folds used to choose among ``alphas``.

    Returns
    -------
    Mse : float
        Mean squared error on the test split.
    R2 : float
        Coefficient of determination on the test split.
    Alpha : float
        The penalty strength selected by cross-validation.
    """
    # Fit and score on the arguments themselves; the parameter names are
    # capitalised, so lowercase spellings would resolve to unrelated globals.
    model = RidgeCV(alphas = alphas, cv = cv).fit(X_train, Y_train_std)
    Alpha = model.alpha_
    y_pred = model.predict(X_test)
    Mse = mean_squared_error(Y_test_std, y_pred)
    R2 = r2_score(Y_test_std, y_pred)
    return Mse, R2, Alpha
    

# def train_n_test_svr(X_train, X_test, Y_train_std, Y_test_std):
#     svr = SVR(kernel = "rbf", gamma = "auto", C = 1).fit(X_train, y_train_std)
#     y_pred = svr.predict(X_test)
#     Mse = mean_squared_error(y_pred, y_test_std)
#     R2 = r2_score( y_test_std, y_pred )
#     return Mse, R2

In [9]:
# Number of random train/test splits averaged for each trait.
n = 1

# One row per trait, in the order of `traits`:
# column 0 = median alpha, column 1 = mean test MSE, column 2 = mean test R2.
model_stats = np.empty((len(traits), 3))
for j, t in enumerate(traits):
    # The cleaned data depend only on the trait, so build them once per trait
    # rather than once per split.
    x, y = clean_data(geno, pheno[t])

    # Running totals over the n splits of this trait.
    mses = 0
    r2s = 0
    alphas = []
    for i in range(n):
        x_train, x_test, y_train_std, y_test_std = split_n_scale(x, y)
        mse, r2, alpha = train_n_test_lasso( x_train, x_test, y_train_std, y_test_std )
        mses += mse
        r2s += r2
        alphas.append(alpha)
    print(f" {t} c'est fini \n ")
    model_stats[j, 0] = np.median(alphas)
    model_stats[j, 1] = mses / n
    model_stats[j, 2] = r2s / n


 Cadmium_Chloride c'est fini 
 
 Congo_red c'est fini 
 
 Cycloheximide c'est fini 
 
 Diamide c'est fini 
 
 Ethanol c'est fini 
 
 Hydroquinone c'est fini 
 
 Lithium_Chloride c'est fini 
 
 Maltose c'est fini 
 
 Neomycin c'est fini 
 
 Tunicamycin c'est fini 
 
 Galactose c'est fini 
 
 YNB:ph3 c'est fini 
 


In [10]:
# Column 2 of model_stats holds the per-trait mean test R2, one entry per
# trait in the order of `traits`.
r2_model = model_stats[:, 2]
print(r2_model)

[-2.21786812e-04 -3.28669842e-03 -1.08974937e-02 -1.51847213e-04
 -5.46712549e-03 -3.60641540e-03 -3.34980669e-03 -8.07711686e-03
 -2.75498509e-02 -5.37524119e-03 -5.71579963e-03 -5.11299202e-06]
