In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

In [4]:
df = pd.read_csv("https://raw.githubusercontent.com/camilousa/datasets/refs/heads/master/school_grades_dataset.csv")
df.head()
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      649 non-null    object
 1   sex         649 non-null    object
 2   age         649 non-null    int64 
 3   address     649 non-null    object
 4   famsize     649 non-null    object
 5   Pstatus     649 non-null    object
 6   Medu        649 non-null    int64 
 7   Fedu        649 non-null    int64 
 8   Mjob        649 non-null    object
 9   Fjob        649 non-null    object
 10  reason      649 non-null    object
 11  guardian    649 non-null    object
 12  traveltime  649 non-null    int64 
 13  studytime   649 non-null    int64 
 14  failures    649 non-null    int64 
 15  schoolsup   649 non-null    object
 16  famsup      649 non-null    object
 17  paid        649 non-null    object
 18  activities  649 non-null    object
 19  nursery     649 non-null    object
 20  higher    

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0
mean,16.744222,2.514638,2.306626,1.568567,1.930663,0.22188,3.930663,3.180277,3.1849,1.502311,2.280431,3.53621,3.659476,11.399076,11.570108,11.906009
std,1.218138,1.134552,1.099931,0.74866,0.82951,0.593235,0.955717,1.051093,1.175766,0.924834,1.28438,1.446259,4.640759,2.745265,2.913639,3.230656
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,2.0,0.0,10.0,10.0,10.0
50%,17.0,2.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,2.0,11.0,11.0,12.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,6.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,32.0,19.0,19.0,19.0


In [6]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
x_train = train[["G3"]]
y_train = train[["age"]]


In [12]:
from sklearn.metrics import mean_absolute_error

y_train_pred = model.predict(train[["G3"]])
mae = mean_absolute_error(train["age"], y_train_pred)
print("MAE on training set:", mae)

MAE on training set: 0.9665351041402971


In [13]:
import sklearn.metrics as sk_metrics

mse_value = sk_metrics.mean_squared_error(y_train, y_train_pred)
print("MSE on training set:", mse_value)

MSE on training set: 1.383046162667169


In [14]:
y_test_pred = model.predict(test[["G3"]])
mae_test = mean_absolute_error(test["age"], y_test_pred)
print("MAE on test set:", mae_test)

MAE on test set: 0.8901887586723645


In [15]:
mse_test = sk_metrics.mean_squared_error(test["age"], y_test_pred)
print("MSE on test set:", mse_test)

MSE on test set: 1.2318407455206173


In [16]:
print("Escala de G3:")
print("Valor mínimo:", df["G3"].min())
print("Valor máximo:", df["G3"].max())
print("Rango:", df["G3"].max() - df["G3"].min())
print("\nEstadísticas:")
print(df["G3"].describe())

Escala de G3:
Valor mínimo: 0
Valor máximo: 19
Rango: 19

Estadísticas:
count    649.000000
mean      11.906009
std        3.230656
min        0.000000
25%       10.000000
50%       12.000000
75%       14.000000
max       19.000000
Name: G3, dtype: float64


In [21]:
model.coef_ = df["G3"].std() / df["age"].std()
model.intercept_ = df["age"].mean() - model.coef_ * df["G3"].mean()
print("Coeficiente:", model.coef_)
print("Intercepto:", model.intercept_)


Coeficiente: 2.652127426407854
Intercepto: -14.832031777894436


In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42),
                           param_grid,
                           scoring='neg_mean_squared_error',
                           cv=5)
# Considera incluir variables adicionales si son relevantes
features = ["G3"]  # por ejemplo, podrías añadir ["G1", "G2", "G3"]
grid_search.fit(train[features], train["age"])

print("Mejores parámetros:", grid_search.best_params_)
print("Mejor MSE:", -grid_search.best_score_)

Mejores parámetros: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Mejor MSE: 1.4821146750750107


In [25]:
model.coef_ = df["G3"].std() / df["age"].std()
print(model.coef_)

2.652127426407854


In [27]:
model_predict = df["G3"] * model.coef_ + model.intercept_
print(model_predict)


0      14.341370
1      14.341370
2      16.993497
3      22.297752
4      19.645625
         ...    
644    11.689242
645    27.602007
646     9.037115
647    11.689242
648    14.341370
Name: G3, Length: 649, dtype: float64
