In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV, HuberRegressor
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold


In [2]:
path_archivo = "../input_data/KG.csv"
datos_full = pd.read_csv(path_archivo, delimiter = ",")
target = "K"
reg_defaults = [r"^(w\d+|eig\d+|eig_\d+|\(omega\^2\)_\d+)$", r"^(dx|dy|dz|lx|ly|lz)$", r"^(rho|Rho)$"]

In [43]:
def scale_column(key, dataSet, par = [], mode = "min-max"):
    if mode == "min-max":
        the_min = min(dataSet[key]) if len(par) == 0 else par[key]["min"]
        the_max = max(dataSet[key]) if len(par) == 0 else par[key]["max"]
        dataSet[key] = (dataSet[key] - the_min)/(the_max - the_min)
    #fin if 
#fin funcion

def get_metrics(X, y, model):
    y_gorro = model.predict(X)
    R2 = r2_score(y, y_gorro)
    RMSE = root_mean_squared_error(y, y_gorro)
    MAE = mean_absolute_error(y, y_gorro)
    return {"R2": R2, "RMSE": RMSE, "MAE": MAE}
#fin función

def get_params(X, mode = "min-max"):
    resp = dict()
    if mode == "min-max":
        for key in X.keys():
            resp[key] = {"min": min(X[key]), "max": max(X[key])}
        #fin for 
    #fin if 
    return resp
#fin función 

def determine_columns(datos, reg_expressions = reg_defaults):
    """
    Esta función retorna las columnas de datos que encajan con los regular expressions dados en la lista del segundo argumentos
    @input: datos <pd.DataFrame>: Tabla de datos 
    @input: reg_expressions <iterable>: Lista o iterable que contiene strings donde están las expresiones regulares
    @output: big_list <list>: Lista que contiene los nombres de las columnas que cuadran con las expresiones regulares dadas. 
    """
    big_list = sum(map(lambda y: list(filter(lambda x: re.match(y, x), datos.keys())), reg_expressions), [])
    return big_list 
#fin función

class TransformadorDeDatos(BaseEstimator, TransformerMixin):
    def __init__(self, operations, par = []):
        condicion = callable(operations) or all((callable(operations[key]) for key in operations.keys()))
        if condicion:
            self.operations = operations
        else:
            raise TypeError("Class input must be either a callable of a dict full of callables")
        #fin if 
        self.par = par
    #fin init

    def fit(self, X, y=None):
        return self
    #fin fit

    def transform(self, X, post_scaler = "min-max"):
        datos = X.copy()
        cols_transformar = determine_columns(datos)
        if callable(self.operations):
            for col in cols_transformar:
                if col in datos.keys():
                    datos[col] = self.operations(datos[col])
                    scale_column(col, datos, mode = post_scaler, par = self.par)
                #fin if 
            #fin for
        elif type(self.operations) == dict and all((callable(self.operations[key]) for key in self.operations.keys())): 
            for col in cols_transformar:
                if col in datos.keys() and col in self.operations:
                    datos[col] = self.operations[col](datos[col])
                    scale_column(col, datos, mode = post_scaler, par = self.par)
                #fin if 
            #fin for
        else: 
            raise TypeError("Class input must be either a callable of a dict full of callables")
        #fin if 
        return datos
    #fin transform
#fin class

In [4]:
orden = [("custom_transformer", TransformadorDeDatos(operations=np.log)),
         ("lin-reg", LinearRegression())]
pipeline = Pipeline(orden)

In [5]:
features = determine_columns(datos_full)
X_full = datos_full[features]
y_full = datos_full[target]
pipeline.fit(X_full, y_full)
y_gorro_full = pipeline.predict(X_full)

In [6]:
primer_transformer = pipeline.named_steps["custom_transformer"]
X_tranf = primer_transformer.transform(X_full)
X_tranf.head()

Unnamed: 0,w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,dx,dy,dz,rho
0,0.817614,0.830022,0.827608,0.830971,0.830971,0.820179,0.820039,0.822732,0.823866,0.822408,0.0,0.0,0.0,0.0
1,0.871076,0.866716,0.871217,0.874975,0.881228,0.874864,0.875924,0.870402,0.870599,0.869217,0.0,0.0,0.0,0.0
2,0.878663,0.87456,0.877614,0.886671,0.897695,0.8852,0.893723,0.888088,0.895079,0.893737,0.0,0.0,0.0,0.0
3,0.882043,0.878055,0.878616,0.889105,0.907947,0.899394,0.902559,0.896869,0.904482,0.903155,0.0,0.0,0.0,0.0
4,0.884006,0.880084,0.879213,0.890491,0.913183,0.908723,0.908652,0.906841,0.910947,0.909631,0.0,0.0,0.0,0.0


In [7]:
X_full.head()

Unnamed: 0,w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,dx,dy,dz,rho
0,6.846532,8.279029,8.279029,8.607635,8.607635,9.277155,9.277155,10.01862,10.188246,10.188246,0.1,0.1,0.1,0.2
1,12.049711,12.049711,12.852182,13.398173,14.267764,15.790196,15.968844,15.968844,16.066732,16.066732,0.1,0.1,0.1,0.2
2,13.056281,13.056281,13.708609,15.070374,16.836928,17.460062,18.98413,18.98413,20.396466,20.396466,0.1,0.1,0.1,0.2
3,13.53138,13.53138,13.847906,15.443661,18.665271,20.044593,20.686383,20.686383,22.354199,22.354199,0.1,0.1,0.1,0.2
4,13.815095,13.815095,13.931538,15.660428,19.674186,21.948089,21.948089,22.805466,23.808203,23.808203,0.1,0.1,0.1,0.2


In [8]:
print((np.log(X_full["w0"][0])-min(np.log(X_full["w0"])))/(max(np.log(X_full["w0"])- min(np.log(X_full["w0"])))))

0.8176139416371806


In [9]:
metricas_full = get_metrics(X_full, y_full, pipeline)

In [10]:
print(metricas_full)

{'R2': 0.08550190502541999, 'RMSE': 1.659009541687749, 'MAE': 1.428676898610909}


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size = 0.33)

In [12]:
param_train = get_params(X_train)
orden2 = [("custom_transformer", TransformadorDeDatos(operations=np.log, par=param_train)),
         ("lin-reg", LinearRegression())]
pipeline2 = Pipeline(orden2)
pipeline2.fit(X_train, y_train)
metricas2 = get_metrics(X_test, y_test, pipeline2)

In [13]:
print(metricas2)

{'R2': 0.08521501101553153, 'RMSE': 1.6596046816778283, 'MAE': 1.4298631937745745}


In [44]:
cross_validation = KFold(n_splits = 5, shuffle = True)
reg_cols = r"^(w\d+|eig\d+|eig_\d+|\(omega\^2\)_\d+|dx|dy|dz)$"
cols_log = filter(lambda x: re.match(reg_cols, x), X_train.keys())
dict_operations = dict(map(lambda x: (x, np.log), cols_log))
dict_operations["rho"] = np.sqrt
def probar_modelo(modelo, Xtrain, ytrain, Xtest, ytest, nombre_modelo):
    orden = [("custom_transformer", TransformadorDeDatos(operations=dict_operations, par=param_train)),
         (nombre_modelo, modelo)]
    pipeline_modelo = Pipeline(orden)
    pipeline_modelo.fit(Xtrain, ytrain)
    metricas_modelo = cross_val_score(pipeline_modelo, Xtest, ytest, cv = cross_validation, scoring="r2")
    return metricas_modelo
#fin probar modelo 

modelos = {
    'LinearRegression': LinearRegression(),
    'LassoCV': LassoCV(cv=cross_validation),
    'RidgeCV': RidgeCV(cv=cross_validation),
    'ElasticNetCV': ElasticNetCV(cv=cross_validation),
    'HuberRegressor': HuberRegressor()
}


In [48]:
#resultados_modelos = dict(map(lambda x: (x, probar_modelo(modelos[x], X_train, y_train, X_test, y_test, x)), modelos.keys()))
resultados_modelos = dict()
for key in modelos.keys():
    resultados_modelos[key] = probar_modelo(modelos[key], X_full, y_full, X_full, y_full, key)
#fin for 

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

In [47]:
print(pd.DataFrame(resultados_modelos))

   LinearRegression   LassoCV   RidgeCV  ElasticNetCV  HuberRegressor
0          0.076150  0.074803  0.058108      0.041779        0.044334
1          0.077624  0.081612  0.058336      0.045659        0.059769
2          0.078483  0.077288  0.065260      0.041850        0.053526
3          0.080767  0.077776  0.060049      0.039518        0.067473
4          0.080153  0.074867  0.062148      0.033959        0.064817
