In [None]:
import os
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
import json

# Definir rutas base y nombres de archivos
BASE_PATH = "../data/processed"
TRAIN_PATHS = [
    "X_train_con_outliers.xlsx",
    "X_train_sin_outliers.xlsx",
    "X_train_con_outliers_norm.xlsx",
    "X_train_sin_outliers_norm.xlsx",
    "X_train_con_outliers_scal.xlsx",
    "X_train_sin_outliers_scal.xlsx"
]
TEST_PATHS = [
    "X_test_con_outliers.xlsx",
    "X_test_sin_outliers.xlsx",
    "X_test_con_outliers_norm.xlsx",
    "X_test_sin_outliers_norm.xlsx",
    "X_test_con_outliers_scal.xlsx",
    "X_test_sin_outliers_scal.xlsx"
]

# Leer los datasets de entrenamiento y prueba
TRAIN_DATASETS = [pd.read_excel(os.path.join(BASE_PATH, path)) for path in TRAIN_PATHS]
TEST_DATASETS = [pd.read_excel(os.path.join(BASE_PATH, path)) for path in TEST_PATHS]

# Leer las etiquetas (target) de entrenamiento y prueba
y_train = pd.read_excel(os.path.join(BASE_PATH, "y_train.xlsx")).values.ravel()
y_test = pd.read_excel(os.path.join(BASE_PATH, "y_test.xlsx")).values.ravel()

# Definir el número de características que deseas seleccionar
k = 6  # Puedes cambiar este valor según el número de características que desees seleccionar

# Crear una lista para almacenar los resultados
results = []

# Crear un diccionario para almacenar todas las características seleccionadas
all_selected_features = {}

for index, (train_path, test_path) in enumerate(zip(TRAIN_PATHS, TEST_PATHS)):
    print(f"Procesando dataset {index + 1}: {train_path}")

    # Selección de características usando SelectKBest con f_regression (para regresión)
    selector = SelectKBest(f_regression, k=k)
    X_train_selected = selector.fit_transform(TRAIN_DATASETS[index], y_train)  # Ajuste y transformación para el conjunto de entrenamiento
    X_test_selected = selector.transform(TEST_DATASETS[index])  # Transformación para el conjunto de prueba

    # Almacenar las características seleccionadas en el diccionario
    selected_features = TRAIN_DATASETS[index].columns[selector.get_support()].tolist()  # Obtener los nombres de las características seleccionadas
    all_selected_features[train_path] = selected_features  # Usar el nombre del archivo como clave

    # Crear y entrenar el modelo de regresión lineal
    model = LinearRegression()
    model.fit(X_train_selected, y_train)

    # Predicciones en los datos de entrenamiento y prueba
    y_train_pred = model.predict(X_train_selected)
    y_test_pred = model.predict(X_test_selected)

    # Calcular el MSE y R² para los datos de entrenamiento
    mse_train = mean_squared_error(y_train, y_train_pred)
    r2_train = r2_score(y_train, y_train_pred)

    # Calcular el MSE y R² para los datos de prueba
    mse_test = mean_squared_error(y_test, y_test_pred)
    r2_test = r2_score(y_test, y_test_pred)

    # Almacenar los resultados en la lista, usando el nombre del archivo como "dataset"
    results.append(
        {
            "dataset_TRAIN": train_path,  # Usar el nombre del archivo del dataset
            "MSE_train": mse_train,  # Guardar mse_train como número (no como string)
            "r2_train": r2_train,  # Guardar r2_train como número (no como string)
            "dataset_TEST": test_path,  # Usar el nombre del archivo del dataset
            "MSE_test": mse_test,  # Guardar mse_test como número (no como string)
            "r2_test": r2_test,  # Guardar r2_test como número (no como string)
            "feat_sel": f"k{k}"  # Indicar el número de características seleccionadas
        }
    )

# Convertir la lista de resultados en un DataFrame
results_df = pd.DataFrame(results)

# Guardar los resultados en un archivo JSON
results_json_path = os.path.join("../data/results", f"results_k_{k}.json")
os.makedirs(os.path.dirname(results_json_path), exist_ok=True)  # Crear directorio si no existe
with open(results_json_path, 'w') as json_file:
    json.dump(results, json_file, indent=4)

# Guardar las características seleccionadas en un solo archivo JSON
selected_features_json_path = os.path.join("../models", f"selected_features_k_{k}.json")
os.makedirs(os.path.dirname(selected_features_json_path), exist_ok=True)  # Crear directorio si no existe
with open(selected_features_json_path, 'w') as json_file:
    json.dump(all_selected_features, json_file, indent=4)

# Imprimir el DataFrame con los resultados
results_df

Procesando dataset 1: X_train_con_outliers.xlsx
Procesando dataset 2: X_train_sin_outliers.xlsx
Procesando dataset 3: X_train_con_outliers_norm.xlsx
Procesando dataset 4: X_train_sin_outliers_norm.xlsx
Procesando dataset 5: X_train_con_outliers_scal.xlsx
Procesando dataset 6: X_train_sin_outliers_scal.xlsx


Unnamed: 0,dataset_TRAIN,MSE_train,r2_train,dataset_TEST,MSE_test,r2_test,feat_sel
0,X_train_con_outliers.xlsx,37280660.0,0.741705,X_test_con_outliers.xlsx,33635210.0,0.783346,k6
1,X_train_sin_outliers.xlsx,37226730.0,0.742079,X_test_sin_outliers.xlsx,33781330.0,0.782405,k6
2,X_train_con_outliers_norm.xlsx,37280660.0,0.741705,X_test_con_outliers_norm.xlsx,33635210.0,0.783346,k6
3,X_train_sin_outliers_norm.xlsx,37226730.0,0.742079,X_test_sin_outliers_norm.xlsx,33781330.0,0.782405,k6
4,X_train_con_outliers_scal.xlsx,37280660.0,0.741705,X_test_con_outliers_scal.xlsx,33635210.0,0.783346,k6
5,X_train_sin_outliers_scal.xlsx,37226730.0,0.742079,X_test_sin_outliers_scal.xlsx,33781330.0,0.782405,k6


In [17]:
df_results = results_df
df_results = pd.DataFrame(results)

df_results["MSE_train"] = np.sqrt(df_results["MSE_train"])
df_results["MSE_test"] = np.sqrt(df_results["MSE_test"])

df_results = df_results[["MSE_train", "MSE_test", "r2_train", "r2_test"]]

df_results

Unnamed: 0,MSE_train,MSE_test,r2_train,r2_test
0,6105.78932,5799.587091,0.741705,0.783346
1,6101.371115,5812.170661,0.742079,0.782405
2,6105.78932,5799.587091,0.741705,0.783346
3,6101.371115,5812.170661,0.742079,0.782405
4,6105.78932,5799.587091,0.741705,0.783346
5,6101.371115,5812.170661,0.742079,0.782405


In [21]:
import pandas as pd
import matplotlib.pyplot as plt 

train_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/linear-regression-project-tutorial/main/medical_insurance_cost.csv")
test_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/linear-regression-project-tutorial/main/medical_insurance_cost.csv")

train_data.head()


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [22]:
X_train = train_data.drop("charges", axis=1)  
y_train = train_data["charges"]  

X_test = test_data.drop("charges", axis=1)  
y_test = test_data["charges"]  

In [25]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Asumiendo que X_train y X_test son DataFrames de pandas
X_train = pd.get_dummies(X_train, drop_first=True)  # drop_first=True evita la multicolinealidad
X_test = pd.get_dummies(X_test, drop_first=True)

# Asegurarse de que las columnas en X_train y X_test coincidan
train_cols = X_train.columns
test_cols = X_test.columns

missing_cols_train = set(test_cols) - set(train_cols)
missing_cols_test = set(train_cols) - set(test_cols)

for c in missing_cols_train:
    X_train[c] = 0

for c in missing_cols_test:
    X_test[c] = 0

# Asegurarse de que el orden de las columnas coincida
X_test = X_test[X_train.columns]

model = LinearRegression()
model.fit(X_train, y_train)

print(f"Intercepto (a): {model.intercept_}")
print(f"Coeficientes (b): {model.coef_}")

Intercepto (a): -11938.538576167146
Coeficientes (b): [  256.85635254   339.19345361   475.50054515  -131.3143594
 23848.53454191  -352.96389942 -1035.02204939  -960.0509913 ]


In [26]:
y_pred = model.predict(X_test)
y_pred

array([25293.7130284 ,  3448.60283431,  6706.9884907 , ...,
        4149.13248568,  1246.58493898, 37085.62326757], shape=(1338,))

In [27]:
from sklearn.metrics import mean_squared_error, r2_score

print(f"Error cuadrático medio: {mean_squared_error(y_test, y_pred)}")
print(f"Coeficiente de determinación: {r2_score(y_test, y_pred)}")

Error cuadrático medio: 36501893.00741544
Coeficiente de determinación: 0.7509130345985207
