# Regresión lineal

In [61]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import json
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [62]:
# Directory that holds every feature matrix and target workbook read below.
BASE_PATH = "../data/processed"

# Training feature matrices, one workbook per preprocessing variant
# ("con"/"sin" outliers = with/without outliers; the "_norm" and "_scal"
# suffixes presumably mean normalised and scaled -- confirm against the
# preprocessing notebook).
TRAIN_PATHS = [
    "X_train_con_outliers.xlsx",
    "X_train_sin_outliers.xlsx",
    "X_train_con_outliers_norm.xlsx",
    "X_train_sin_outliers_norm.xlsx",
    "X_train_con_outliers_scal.xlsx",
    "X_train_sin_outliers_scal.xlsx",
]
TRAIN_DATASETS = [
    pd.read_excel(f"{BASE_PATH}/{file_name}") for file_name in TRAIN_PATHS
]

# Test feature matrices, listed in the same variant order as TRAIN_PATHS so
# that position i in one list pairs with position i in the other.
TEST_PATHS = [
    "X_test_con_outliers.xlsx",
    "X_test_sin_outliers.xlsx",
    "X_test_con_outliers_norm.xlsx",
    "X_test_sin_outliers_norm.xlsx",
    "X_test_con_outliers_scal.xlsx",
    "X_test_sin_outliers_scal.xlsx",
]
TEST_DATASETS = [
    pd.read_excel(f"{BASE_PATH}/{file_name}") for file_name in TEST_PATHS
]

# A single pair of target workbooks is loaded and shared by all variants.
y_train = pd.read_excel(f"{BASE_PATH}/y_train.xlsx")
y_test = pd.read_excel(f"{BASE_PATH}/y_test.xlsx")

In [63]:
# Fit one LinearRegression per training variant and score it on both the
# training data and the test variant at the same list position.
# `models[i]` and `results[i]` both correspond to TRAIN_DATASETS[i].
models = []
results = []

for X_train, X_test in zip(TRAIN_DATASETS, TEST_DATASETS):
    # fit() returns the fitted estimator, so it can be bound directly.
    model = LinearRegression().fit(X_train, y_train)
    models.append(model)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Same four metrics, in the same key order, for every variant.
    metrics = {
        "train_mse": mean_squared_error(y_train, y_pred_train),
        "train_r2": r2_score(y_train, y_pred_train),
        "test_mse": mean_squared_error(y_test, y_pred_test),
        "test_r2": r2_score(y_test, y_pred_test),
    }
    results.append(metrics)


# Bare expression: displays the collected metrics as the notebook cell output.
results

[{'train_mse': 37005531.72811555,
  'train_r2': 0.7297182858804965,
  'test_mse': 35493102.61165053,
  'test_r2': 0.8068466322629111},
 {'train_mse': 36992599.00185515,
  'train_r2': 0.7298127441752071,
  'test_mse': 35467636.63842711,
  'test_r2': 0.8069852180198291},
 {'train_mse': 37005531.72811554,
  'train_r2': 0.7297182858804965,
  'test_mse': 35493102.61165054,
  'test_r2': 0.8068466322629111},
 {'train_mse': 36992599.00185515,
  'train_r2': 0.7298127441752071,
  'test_mse': 35467636.63842714,
  'test_r2': 0.806985218019829},
 {'train_mse': 37005531.72811555,
  'train_r2': 0.7297182858804965,
  'test_mse': 35493102.61165053,
  'test_r2': 0.8068466322629111},
 {'train_mse': 36992599.00185515,
  'train_r2': 0.7298127441752071,
  'test_mse': 35467636.63842708,
  'test_r2': 0.8069852180198294}]

In [64]:
# Persist the selected model together with the metrics of every fitted variant.
#
# `best_model` is the 1-based position of the chosen entry in `models`
# (2 -> models[1], the model fitted on "X_train_sin_outliers.xlsx"). In the
# recorded run the three "sin_outliers" variants are tied for the best test
# scores up to floating-point noise; the one without a "_norm"/"_scal" suffix
# is the one kept. The index is derived from `best_model` instead of being
# hard-coded separately, so the two values can no longer disagree.
best_model = 2
final_model = models[best_model - 1]

# NOTE: pickle files must only ever be loaded from trusted locations.
with open("../models/linear_best_model.pkl", "wb") as f:
    pickle.dump(final_model, f)

# `results` holds the metrics of all variants, not only the selected one.
with open("../models/final_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)