In [10]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'ggplot' style sheet to the matplotlib figures drawn afterwards.
plt.style.use('ggplot')

In [11]:
def str_to_ndarray(x):
    """Parse a string of space-separated numbers into a 1-D NumPy array.

    Used as a ``pd.read_csv`` converter, so it receives the raw text of one
    CSV cell and returns the numbers it contains.
    """
    return np.fromstring(x, sep=' ')

In [12]:
# Relative location of the dataset: three directory levels up, then data/.
path = os.path.join('..', '..', '..', 'data', 'KG_combin.csv')

# The 'eigvals' cells are run through str_to_ndarray while reading, so each
# one arrives as a NumPy array instead of a raw string.
kg_data = pd.read_csv(
    path,
    converters={'eigvals': str_to_ndarray},
)

In [13]:
# Each row's 'eigvals' array is reduced to N_OMEGA entries, starting at
# position N_SKIPPED_EIGVALS, and every kept entry is divided by the row's rho.
# NOTE(review): presumably the 6 skipped entries are the rigid-body modes of
# the spectrum -- confirm against whatever produced KG_combin.csv.
N_SKIPPED_EIGVALS = 6
N_OMEGA = 14

# Guard on 'eigvals' so that re-running this cell on the already-processed
# frame is a no-op instead of a KeyError (the column is dropped below).
if 'eigvals' in kg_data.columns:
    # Slice every array once and collect the slices into a single
    # (n_rows, N_OMEGA) matrix, rather than one .apply pass per output column.
    # The explicit reshape keeps an empty frame working (shape (0, N_OMEGA)).
    kept_eigvals = np.array(
        [arr[N_SKIPPED_EIGVALS:N_SKIPPED_EIGVALS + N_OMEGA] for arr in kg_data['eigvals']],
        dtype=float,
    ).reshape(len(kg_data), N_OMEGA)

    # Row-wise division by rho; the new frame reuses kg_data's index so the
    # column-wise concat below lines rows up exactly.
    omega2 = pd.DataFrame(
        kept_eigvals / kg_data['rho'].to_numpy()[:, np.newaxis],
        index=kg_data.index,
        columns=[f'omega2_{q}' for q in range(N_OMEGA)],
    )

    # Same resulting layout as before: the original columns without
    # 'eigvals', followed by omega2_0 .. omega2_13.
    kg_data = pd.concat([kg_data.drop(columns=['eigvals']), omega2], axis=1)

In [14]:
# Display the processed frame (omega2_* columns in place of the raw 'eigvals' column).
kg_data

Unnamed: 0,K,G,rho,dx,dy,dz,shape,omega2_0,omega2_1,omega2_2,...,omega2_4,omega2_5,omega2_6,omega2_7,omega2_8,omega2_9,omega2_10,omega2_11,omega2_12,omega2_13
0,0.3,0.300000,0.2,0.1,0.1,0.1,cone,363.812512,363.812512,616.861943,...,801.530462,1202.741616,1202.741616,1360.593158,1360.593158,1391.954446,1391.954446,1598.471054,1598.471054,2043.801060
1,0.3,1.057143,0.2,0.1,0.1,0.1,cone,1080.357105,1237.165522,1237.165522,...,2651.506483,2768.883511,2768.883511,3218.011759,3218.011759,3335.602641,4738.235859,4738.235859,4917.977790,4917.977790
2,0.3,1.814286,0.2,0.1,0.1,0.1,cone,1279.463309,2075.013838,2075.013838,...,3628.548353,3628.548353,3915.845931,3915.845931,4683.416721,4683.416721,6711.904416,6711.904416,8050.533407,8050.533407
3,0.3,2.571429,0.2,0.1,0.1,0.1,cone,1402.145793,2886.110200,2886.110200,...,4161.645389,4161.645389,4597.790620,4597.790620,6539.611577,6539.611577,7540.838290,7540.838290,10185.591908,11261.941102
4,0.3,3.328571,0.2,0.1,0.1,0.1,cone,1488.362993,3668.498967,3668.498967,...,4360.684462,4360.684462,5436.294393,5436.294393,8067.005188,8067.005188,8300.251487,8300.251487,11831.588014,12960.953559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229371,5.6,2.571429,10.0,1.0,1.0,1.0,tetrahedron,1.776489,1.992767,1.992767,...,2.899190,4.501326,4.501326,4.935697,6.125115,7.256192,8.070548,8.070548,9.253246,9.253246
229372,5.6,3.328571,10.0,1.0,1.0,1.0,tetrahedron,2.213418,2.533433,2.533433,...,3.643310,5.720904,5.720904,6.363125,7.760717,9.045892,10.325101,10.325101,11.806433,11.806433
229373,5.6,4.085714,10.0,1.0,1.0,1.0,tetrahedron,2.618126,3.053709,3.053709,...,4.345456,6.901332,6.901332,7.772311,9.335054,10.704454,12.520110,12.520110,14.275421,14.275421
229374,5.6,4.842857,10.0,1.0,1.0,1.0,tetrahedron,2.993823,3.554073,3.554073,...,5.009227,8.046839,8.046839,9.157921,10.858081,12.246230,14.653307,14.653307,16.657100,16.657100


In [15]:
# Summarize the column dtypes and non-null counts of the processed frame.
kg_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229376 entries, 0 to 229375
Data columns (total 21 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   K          229376 non-null  float64
 1   G          229376 non-null  float64
 2   rho        229376 non-null  float64
 3   dx         229376 non-null  float64
 4   dy         229376 non-null  float64
 5   dz         229376 non-null  float64
 6   shape      229376 non-null  object 
 7   omega2_0   229376 non-null  float64
 8   omega2_1   229376 non-null  float64
 9   omega2_2   229376 non-null  float64
 10  omega2_3   229376 non-null  float64
 11  omega2_4   229376 non-null  float64
 12  omega2_5   229376 non-null  float64
 13  omega2_6   229376 non-null  float64
 14  omega2_7   229376 non-null  float64
 15  omega2_8   229376 non-null  float64
 16  omega2_9   229376 non-null  float64
 17  omega2_10  229376 non-null  float64
 18  omega2_11  229376 non-null  float64
 19  omega2_12  229376 non-n

In [107]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.compose import TransformedTargetRegressor

# Split features and target: K is the regression target; G is removed from
# the features together with K.
X = kg_data.drop(columns=['K', 'G'])
y = kg_data['K']

# Fixed seed so the shuffled 80/20 split -- and therefore the metrics computed
# from it -- are the same on every Restart & Run All.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, shuffle=True, random_state=RANDOM_STATE
)

# Column groups, one per feature transformation.
sqrt_columns = ['rho', 'dx', 'dy', 'dz']
omega_columns = [f'omega2_{i}' for i in range(14)]
categorical_columns = ['shape']

In [108]:
# Feature transformations, one per column group:
#   - square root of the sqrt_columns,
#   - log(1 + x) of the omega_columns,
#   - one-hot encoding of the categorical columns, dropping the first level.
column_transforms = [
    ('sqrt', FunctionTransformer(np.sqrt), sqrt_columns),
    ('log', FunctionTransformer(np.log1p), omega_columns),
    ('onehot', OneHotEncoder(drop='first'), categorical_columns),
]
# Columns not listed in any group are discarded.
feature_transformer = ColumnTransformer(transformers=column_transforms, remainder='drop')

# Full pipeline: transform the features, standardize them, then fit an
# ordinary linear regression with an intercept.
pipeline_steps = [
    ('feature_transformation', feature_transformer),
    ('scaling', StandardScaler()),
    ('regression', LinearRegression(fit_intercept=True)),
]
pipeline = Pipeline(steps=pipeline_steps)

In [109]:
# Wrap the pipeline so the target is transformed with np.sqrt before fitting
# (the regression is done on sqrt(K)) and mapped back with np.square.
model = TransformedTargetRegressor(
    regressor=pipeline,
    func=np.sqrt,
    inverse_func=np.square,
).fit(X_train, y_train)

# predict() applies the inverse transformation automatically, so y_pred is
# expressed on the original scale of K.
y_pred = model.predict(X_test)

In [110]:
# Error metrics computed on the original (untransformed) scale of the target.
# NOTE(review): scikit-learn's MAPE is presumably returned as a fraction
# rather than a percentage -- confirm before reading the value as "%".
rmse = root_mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

# One metric per line, three decimals each.
print(
    f'RMSE: {rmse:.3f}',
    f'MAE: {mae:.3f}',
    f'MAPE: {mape:.3f}',
    sep='\n',
)

RMSE: 1.725
MAE: 1.475
MAPE: 1.320
