In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV, HuberRegressor
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold

In [2]:
# Location of the synthetic data set used for training.
path_archivo = "../input_data/KG_combin.csv"

# The first seven CSV columns are read with their header row.
# `df` is kept as a second name bound to the very same frame.
datos_no_eig = pd.read_csv(path_archivo, usecols=range(7))
df = datos_no_eig

# The eighth column (position 7) is read on its own, skipping the header line,
# so pandas labels it with the integer 7.
datos_eig = pd.read_csv(path_archivo, usecols=[7], header=None, skiprows=1)

# Each cell of that column is a whitespace-separated list; expand it into one
# column per value. The pieces are still strings after the split.
eigvals_split = datos_eig[7].str.split(expand=True)

# Label the pieces omega2_-6, omega2_-5, ..., so that the seventh value is omega2_0.
# NOTE(review): the first six values are presumably rigid-body modes - confirm.
eigvals_split.columns = ["omega2_" + str(k) for k in range(-6, eigvals_split.shape[1] - 6)]

# Side-by-side join of the scalar columns and the expanded values.
datos_full = pd.concat([datos_no_eig, eigvals_split], axis="columns")

# Default column-name patterns, in order: spectral columns, dimensions, density.
reg_defaults = [
    r"^(w\d+|eig\d+|eig_\d+|\(omega\^2\)_\d+|omega2_\d+)$",
    r"^(dx|dy|dz|lx|ly|lz)$",
    r"^(rho|Rho)$",
]

In [3]:
# Columns that are not used downstream: the six negatively-numbered omega2
# columns, omega2_10 .. omega2_93, and the "shape" column.
cols_descartadas = (
    [f"omega2_-{i}" for i in range(1, 7)]
    + [f"omega2_{i}" for i in range(10, 94)]
    + ["shape"]
)

# `drop(..., errors="ignore")` builds a new frame and tolerates columns that are
# already gone, so this cell can be re-run without raising KeyError (the
# original `del` loop failed on the second execution).
datos_full = datos_full.drop(columns=cols_descartadas, errors="ignore")

# The omega2 columns were produced by a string split when the CSV was loaded,
# so they still hold text. Cast the ones that are kept to float so numeric
# operations (min/max, scaling, arithmetic) behave as expected.
cols_omega = [c for c in datos_full.columns if str(c).startswith("omega2_")]
datos_full = datos_full.astype({c: float for c in cols_omega})

datos_full.head()

Unnamed: 0,K,G,rho,dx,dy,dz,omega2_0,omega2_1,omega2_2,omega2_3,omega2_4,omega2_5,omega2_6,omega2_7,omega2_8,omega2_9
0,0.3,0.3,0.2,0.1,0.1,0.1,72.76250234138867,72.76250234205544,123.37238855362752,160.30609242654566,160.3060924265749,240.54832324247909,240.54832324303428,272.11863169568386,272.11863169598064,278.390889163176
1,0.3,1.057143,0.2,0.1,0.1,0.1,216.07142090925953,247.43310432034008,247.43310432095623,530.3012965186392,530.3012965191873,553.7767022916263,553.7767022919858,643.6023518582099,643.602351858263,667.1205281939433
2,0.3,1.814286,0.2,0.1,0.1,0.1,255.8926618939088,415.00276750463826,415.0027675049765,719.0331932329563,725.7096705759152,725.7096705759459,783.1691861334593,783.1691861334986,936.6833442756296,936.6833442761408
3,0.3,2.571429,0.2,0.1,0.1,0.1,280.42915853211025,577.2220399922286,577.2220399965527,742.7301068077224,832.3290778837878,832.3290778838026,919.5581239644765,919.5581239647311,1307.922315384641,1307.9223153869075
4,0.3,3.328571,0.2,0.1,0.1,0.1,297.6725985645846,733.6997934621878,733.6997934659638,756.5838774387405,872.1368923721309,872.136892372449,1087.258878637283,1087.2588786375625,1613.401037506036,1613.401037514644


In [26]:
# Experimental measurements, stored as a plain comma-separated file that already
# has one column per value.
path_exp = "../input_data/KG_Experimental_Iso.csv"
datos_exp = pd.read_csv(path_exp)

# Preview of the first rows.
datos_exp.head()

Unnamed: 0,K,G,rho,dx,dy,dz,omega2_0,omega2_1,omega2_2,omega2_3,omega2_4,omega2_5,omega2_6,omega2_7,omega2_8,omega2_9
0,1.663,0.7362,8.052,0.7042,0.631,0.58393,1.485197,2.243456,2.722901,2.909486,3.16277,3.44288,3.77183,3.805168,3.946075,4.272144
1,1.315,0.5178,5.403,0.33511,0.26023,0.15492,5.171781,8.546431,13.095081,17.098761,19.098981,19.59366,19.775137,23.387424,26.975627,29.017625
2,1.0,0.6087,9.401,0.2348,0.1914,0.1484,8.507717,14.629007,17.771782,22.233325,25.365264,26.061988,27.416796,28.09385,29.148011,30.945435
3,1.146,0.6151,9.401,0.23668,0.19364,0.14553,8.507717,14.629007,17.771782,22.233325,25.365264,26.061988,27.416796,28.09385,29.148011,30.945435
4,1.317,0.3807,8.405,0.46319,0.2193,0.38864,1.320706,0.0001,3.498974,3.897721,4.139785,4.43806,4.934177,5.329428,5.784489,6.02288


In [9]:
def scale_column(key, dataSet, par = None, mode = "min-max"):
    """
    Rescale one column of ``dataSet`` in place.

    @input: key: Label of the column to rescale.
    @input: dataSet <pd.DataFrame>: Table that owns the column; it is modified in place.
    @input: par <dict | None>: Optional mapping ``{column: {"min": ..., "max": ...}}``
            (the structure returned by ``get_params``). When it is None or empty,
            the minimum and maximum are taken from the column itself.
    @input: mode <str>: Scaling strategy. Only "min-max" is implemented.
    @output: None.
    @raises: ValueError if ``mode`` is not supported.

    With "min-max" the column becomes ``(x - min) / (max - min)``. Values outside
    the reference range (possible when ``par`` comes from another data set) map
    outside [0, 1].
    """
    if mode != "min-max":
        # An unknown mode used to leave the data silently unscaled.
        raise ValueError(f"Unsupported scaling mode: {mode!r}")
    #fin if

    if par is None or len(par) == 0:
        # Series.min()/max() skip NaN; the builtin min()/max() give results that
        # depend on where the NaN sits in the column.
        the_min = dataSet[key].min()
        the_max = dataSet[key].max()
    else:
        the_min = par[key]["min"]
        the_max = par[key]["max"]
    #fin if

    rango = the_max - the_min
    # A constant column has zero range; divide by 1 instead so it maps to 0
    # rather than to NaN/inf.
    dataSet[key] = (dataSet[key] - the_min) / (rango if rango != 0 else 1.0)
#end function

def get_metrics(X, y, model):
    """
    Score a fitted model on the given data.

    @input: X: Feature table passed to ``model.predict``.
    @input: y: True target values.
    @input: model: Any object exposing ``predict``.
    @output: <dict>: Keys "R2", "RMSE" and "MAE" with the corresponding scores.
    """
    predicciones = model.predict(X)
    # Metric name -> scoring function; every function is called as f(y_true, y_pred).
    calculadoras = {
        "R2": r2_score,
        "RMSE": root_mean_squared_error,
        "MAE": mean_absolute_error,
    }
    return {nombre: calcular(y, predicciones) for nombre, calcular in calculadoras.items()}
#end function

def get_params(X, mode = "min-max"):
    """
    Collect, per column, the parameters needed to rescale a table.

    @input: X <pd.DataFrame | mapping>: Table (or mapping of column -> values)
            exposing ``keys()`` and item access.
    @input: mode <str>: Scaling strategy. Only "min-max" is implemented.
    @output: resp <dict>: ``{column: {"min": ..., "max": ...}}``.
    @raises: ValueError if ``mode`` is not supported.
    """
    if mode != "min-max":
        # Returning an empty dict here would make scale_column fall back to each
        # column's own range without any warning.
        raise ValueError(f"Unsupported scaling mode: {mode!r}")
    #fin if

    resp = dict()
    for key in X.keys():
        # Series.min()/max() skip NaN; the builtin min()/max() return values that
        # depend on the position of the NaN.
        columna = pd.Series(X[key])
        resp[key] = {"min": columna.min(), "max": columna.max()}
    #end for
    return resp
#end function

def determine_columns(datos, reg_expressions = None):
    """
    Return the column names of ``datos`` that match any of the given regular expressions.

    @input: datos <pd.DataFrame>: Data table (anything exposing ``keys()`` works).
    @input: reg_expressions <iterable | None>: Strings holding the regular
            expressions. None means "use the module-level ``reg_defaults``",
            looked up at call time.
    @output: big_list <list>: Matching column names, grouped in the order of the
             patterns and, within one pattern, in table order. A column matched
             by several patterns is listed once, at its first position.

    Patterns are applied with ``re.match``, i.e. anchored at the start of the name.
    Non-string column labels are skipped instead of raising TypeError.
    """
    if reg_expressions is None:
        reg_expressions = reg_defaults
    #end if
    patrones = [re.compile(expresion) for expresion in reg_expressions]
    columnas = list(datos.keys())
    encontradas = [
        columna
        for patron in patrones
        for columna in columnas
        if isinstance(columna, str) and patron.match(columna)
    ]
    # dict.fromkeys removes repeats while keeping first-seen order.
    big_list = list(dict.fromkeys(encontradas))
    return big_list
#end function

class TransformadorDeDatos(BaseEstimator, TransformerMixin):
    """
    Select columns by regular expression and expand them into interaction terms.

    The transformer keeps no fitted state: every ``transform`` call selects the
    columns of ``X`` whose names match ``reg_exp`` and builds, with
    ``PolynomialFeatures(interaction_only=True, include_bias=False)``, all
    products of up to ``deg`` distinct selected columns.

    @param deg <int>: Maximum degree of the interaction terms.
    @param reg_exp <iterable>: Regular expressions passed to ``determine_columns``.
    """
    def __init__(self, deg = 3, reg_exp = reg_defaults):
        self.deg = deg
        self.reg_exp = reg_exp
    #end init

    def fit(self, X, y=None):
        """Nothing to learn; present so the class can be used inside a Pipeline."""
        return self
    #end fit

    def transform(self, X):
        """
        @input: X <pd.DataFrame>: Table holding at least the columns to expand.
        @output: <pd.DataFrame>: Expanded features, named by PolynomialFeatures
                 and carrying the same row index as ``X``.
        @raises: ValueError if no column of ``X`` matches ``reg_exp``.
        """
        cols_transformar = determine_columns(X, self.reg_exp)
        if not cols_transformar:
            raise ValueError(
                "No column of X matches the configured regular expressions: "
                f"{self.reg_exp!r}"
            )
        #end if
        poly = PolynomialFeatures(degree=self.deg, include_bias=False, interaction_only=True)
        # Column selection already yields a new frame, so X itself is never modified
        # and no full copy of X is needed.
        poly_data = poly.fit_transform(X[cols_transformar])
        poly_feature_names = poly.get_feature_names_out(input_features=cols_transformar)
        # Keep the caller's index: after a shuffled split the rows of X are not
        # labelled 0..n-1, and a fresh RangeIndex would misalign the result with y.
        datos_finales = pd.DataFrame(poly_data, columns=poly_feature_names, index=X.index)
        return datos_finales
    #end transform
#end class

In [6]:
# Shuffled 5-fold splitter. The seed makes the fold assignment reproducible
# across kernel restarts; without it every run shuffles differently.
cross_validation = KFold(n_splits = 5, shuffle = True, random_state = 42)

# Columns of the synthetic table that match the default patterns.
features = determine_columns(datos_full)
print(features)

['omega2_0', 'omega2_1', 'omega2_2', 'omega2_3', 'omega2_4', 'omega2_5', 'omega2_6', 'omega2_7', 'omega2_8', 'omega2_9', 'dx', 'dy', 'dz', 'rho']


In [23]:
def probar_modelo(datos, modelo, features, target, nombre_modelo, grado = 3,
                  reg_exp = None, test_size = 0.33, random_state = 42):
    """
    Fit ``modelo`` behind the feature pipeline and score it on a held-out split.

    Pipeline steps: TransformadorDeDatos (interaction terms up to ``grado``) ->
    PowerTransformer(box-cox, standardised) -> ``modelo``.

    @input: datos <pd.DataFrame>: Table holding features and target.
    @input: modelo: Unfitted scikit-learn regressor.
    @input: features <list>: Columns of ``datos`` handed to the pipeline.
    @input: target <str>: Name of the target column.
    @input: nombre_modelo <str>: Name given to the final pipeline step.
    @input: grado <int>: Maximum interaction degree.
    @input: reg_exp <iterable | None>: Patterns selecting which of ``features``
            are expanded. None keeps the historical pattern (omega2_*, dx, dy, dz).
    @input: test_size <float>: Fraction of rows held out for testing.
    @input: random_state <int | None>: Seed of the train/test split; pass None
            for an unseeded (non-reproducible) split.
    @output: <dict>: {"metricas": test-set R2/RMSE/MAE, "pipeline": fitted pipeline}.
    """
    if reg_exp is None:
        # NOTE(review): this pattern does not match "rho", so a "rho" entry in
        # `features` never reaches the model. Pass `reg_exp` explicitly to include it.
        reg_exp = [r"^(omega2_\d+|dx|dy|dz)$"]
    #end if
    X = datos[features]
    y = datos[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = test_size, random_state = random_state)
    # Box-Cox is only defined for strictly positive inputs, so every expanded
    # feature must be > 0.
    pipeline = Pipeline([('custom-transformer', TransformadorDeDatos(reg_exp=reg_exp, deg = grado)),
            ('power-transformer', PowerTransformer(method = "box-cox", standardize = True)),
            (nombre_modelo, modelo)])
    primer_transformer = pipeline.named_steps["custom-transformer"]
    # Show the generated feature names. One row is enough for that; expanding the
    # whole table here only cost time and memory.
    X_transf = primer_transformer.transform(X.iloc[:1])
    print(X_transf.keys())
    pipeline.fit(X_train, y_train)
    # NOTE: cross_val_score(pipeline, X_train, y_train, cv = cross_validation, scoring="r2")
    # was tried at this point and abandoned because it never finished.
    metricas_modelo = get_metrics(X_test, y_test, pipeline)
    return {"metricas": metricas_modelo, "pipeline": pipeline}
#end probar_modelo

In [20]:
# Plain linear regression predicting K from the synthetic table, with the
# default interaction degree.
mets = probar_modelo(
    datos=datos_full,
    modelo=LinearRegression(),
    features=features,
    target="K",
    nombre_modelo="Regresion_lineal_K",
)

Index(['omega2_0', 'omega2_1', 'omega2_2', 'omega2_3', 'omega2_4', 'omega2_5',
       'omega2_6', 'omega2_7', 'omega2_8', 'omega2_9',
       ...
       'omega2_8 omega2_9 dx', 'omega2_8 omega2_9 dy', 'omega2_8 omega2_9 dz',
       'omega2_8 dx dy', 'omega2_8 dx dz', 'omega2_8 dy dz', 'omega2_9 dx dy',
       'omega2_9 dx dz', 'omega2_9 dy dz', 'dx dy dz'],
      dtype='object', length=377)


In [21]:
# Metrics of the fitted pipeline on the held-out part of the synthetic data.
resultados_test = mets["metricas"]
print(resultados_test)

{'R2': 0.31851252005029684, 'RMSE': np.float64(1.4295981138240244), 'MAE': np.float64(1.1796850960226397)}


In [27]:
# Re-use the pipeline fitted on the synthetic data and score it, unchanged,
# on the experimental table (same feature columns, same target "K").
el_viejo_pipeline = mets["pipeline"]
X_exp, y_exp = datos_exp[features], datos_exp["K"]
metricas_exp = get_metrics(X=X_exp, y=y_exp, model=el_viejo_pipeline)

In [28]:
# Metrics on the experimental data. An R2 far below zero means the predictions
# are much worse than always predicting the mean of y_exp.
# NOTE(review): the head() previews show experimental omega2_*, dx/dy/dz and rho
# values in ranges quite different from the synthetic rows - possibly a
# units/scale mismatch that forces the model to extrapolate; confirm.
print(metricas_exp)

{'R2': -389997302.29982305, 'RMSE': np.float64(5155.607634778404), 'MAE': np.float64(2091.6013464820094)}
