In [104]:
import joblib
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

In [105]:
# Location of the raw dataset; the space before the underscore is part of
# the file name as written here.
CSV_PATH = "Student_performance_data _.csv"
df = pd.read_csv(CSV_PATH)

# Regression target.
y = df['GPA']

# Feature matrix: every column except the identifier, the two demographic
# columns, the grade class and the target itself.
x = df.drop(
    columns=['StudentID', 'Gender', 'Ethnicity', 'GradeClass', 'GPA'],
)

In [106]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity check: report the shapes of both partitions.
print("Train size :", x_train.shape, y_train.shape)
print("Test size  :", x_test.shape, y_test.shape)

Train size : (1913, 10) (1913,)
Test size  : (479, 10) (479,)


In [107]:
def evaluar_modelo(model, x_test, y_test, print_metrics=True):
    """Evaluate a regression model with MSE, R2 and MAE.

    Args:
        model: Object exposing ``predict(x_test)``; it is only used for
            prediction, never fitted here.
        x_test: Inputs passed straight to ``model.predict``.
        y_test: True target values the predictions are compared against.
        print_metrics: When truthy, the three metrics are printed.

    Returns:
        dict: ``{"mse": ..., "r2": ..., "mae": ...}`` as computed by
        ``mean_squared_error``, ``r2_score`` and ``mean_absolute_error``.
    """
    predicciones = model.predict(x_test)

    # Compute each metric once and keep them together; the same dict feeds
    # both the optional report and the return value.
    metricas = {
        "mse": mean_squared_error(y_test, predicciones),
        "r2": r2_score(y_test, predicciones),
        "mae": mean_absolute_error(y_test, predicciones),
    }

    if print_metrics:
        print("Mean Squared Error:", metricas["mse"])
        print("R squared:", metricas["r2"])
        print("Mean absolute Error:", metricas["mae"])

    return metricas

In [108]:
# Baseline: ordinary least squares with no regularization.
# `fit` returns the estimator itself, so construction and fitting can be chained.
model_base = LinearRegression().fit(x_train, y_train)
res1 = evaluar_modelo(model_base, x_test, y_test, print_metrics=True)

Mean Squared Error: 0.038724020841933736
R squared: 0.9531714916247549
Mean absolute Error: 0.1554642891824548


In [109]:
# Lasso (L1) regression; LassoCV chooses the regularization strength by
# cross-validation on the training data, using the library defaults.
model_lasso = LassoCV().fit(x_train, y_train)
res2 = evaluar_modelo(model_lasso, x_test, y_test, print_metrics=True)

Mean Squared Error: 0.039516109438980866
R squared: 0.9522136280895563
Mean absolute Error: 0.15547328941706887


In [110]:
# Ridge (L2) regression; RidgeCV chooses the regularization strength by
# cross-validation on the training data, using the library defaults.
model_ridge = RidgeCV().fit(x_train, y_train)
res3 = evaluar_modelo(model_ridge, x_test, y_test, print_metrics=True)

Mean Squared Error: 0.03871889649315521
R squared: 0.9531776884401804
Mean absolute Error: 0.15544153937918662


In [111]:
# Elastic net (combined L1/L2 penalty); ElasticNetCV chooses the
# regularization strength by cross-validation, using the library defaults.
model_elasticNet = ElasticNetCV().fit(x_train, y_train)
res4 = evaluar_modelo(model_elasticNet, x_test, y_test, print_metrics=True)

Mean Squared Error: 0.03978902478048078
R squared: 0.9518835947387498
Mean absolute Error: 0.1558328317694584


In [112]:
# Metric dicts in evaluation order: LinearRegression, LassoCV, RidgeCV,
# ElasticNetCV.
resultados = [res1, res2, res3, res4]

# Best by R2 (higher is better)
mejor_r2 = max(resultados, key=lambda res: res['r2'])
print("Mejor modelo por R2:", mejor_r2)

# Best by MAE (lower is better)
mejor_mae = min(resultados, key=lambda res: res['mae'])
print("Mejor modelo por MAE:", mejor_mae)

# Best by MSE (lower is better)
mejor_mse = min(resultados, key=lambda res: res['mse'])
print("Mejor modelo por MSE:", mejor_mse)

# In the recorded run, the Ridge model is the best by R2, MAE and MSE.

Mejor modelo por R2: {'mse': 0.03871889649315521, 'r2': 0.9531776884401804, 'mae': 0.15544153937918662}
Mejor modelo por MAE: {'mse': 0.03871889649315521, 'r2': 0.9531776884401804, 'mae': 0.15544153937918662}
Mejor modelo por MSE: {'mse': 0.03871889649315521, 'r2': 0.9531776884401804, 'mae': 0.15544153937918662}


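**El modelo con mejor R² (y también con menor MAE y MSE) es el regularizado con Ridge, aunque su ventaja sobre la regresión lineal sin regularizar es mínima.**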

In [113]:
# Persist the fitted Ridge model to disk with joblib.
# The output path is defined once so the dump call and the confirmation
# message cannot drift apart.
# NOTE(review): the file name says "linear_regression" although the object
# stored is `model_ridge`; the name is kept unchanged in case other code
# loads this file — confirm before renaming.
MODEL_PATH = 'linear_regression_model.pkl'

joblib.dump(model_ridge, MODEL_PATH)
print(f"Model saved as '{MODEL_PATH}'")

Model saved as 'linear_regression_model.pkl'
