# Modélisation :

In [2]:
import pandas as pd
import os
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from xgboost import XGBRegressor

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [31]:
processed_path = os.path.join("..", "data", "processed/")

In [32]:
X_train = pd.read_csv(processed_path + 'X_train.csv')
X_test = pd.read_csv(processed_path + 'X_test.csv')
y_train = pd.read_csv(processed_path + 'y_train.csv')
y_test = pd.read_csv(processed_path + 'y_test.csv')

In [33]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [34]:
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=500)
ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=1.0)

In [35]:
scoring = {
    "r2": "r2",
    "rmse": make_scorer(mean_squared_error, greater_is_better=False),
    "mae": "neg_mean_absolute_error",
}

In [36]:
def compute_model_score(model, X, y):
    # computing cross val
    cross_validation = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        )
    df = pd.DataFrame({
        "r2":  cross_validation["test_r2"],               
        "rmse": -cross_validation["test_rmse"],     
        "mae":  -cross_validation["test_mae"], 
    })
    print(f"Mean R² : {df.r2.mean()}")
    print(f"Mean rmse : {df.rmse.mean()}")
    print(f"Mean mae : {df.mae.mean()}")
    return df

In [37]:
compute_model_score(lr, X_train, y_train)

Mean R² : 0.9099691087765691
Mean rmse : 0.001763844818500241
Mean mae : 0.021717624745193974


Unnamed: 0,r2,rmse,mae
0,0.916042,0.001338,0.019315
1,0.909283,0.001649,0.020709
2,0.870431,0.002782,0.027837
3,0.92063,0.001581,0.021046
4,0.933459,0.001469,0.019681


In [38]:
compute_model_score(rf, X_train, y_train)

Mean R² : 0.8426348620301912
Mean rmse : 0.8558215791405008
Mean mae : 0.48976432500000017


Unnamed: 0,r2,rmse,mae
0,0.837224,1.391598,0.554916
1,0.878875,0.921791,0.535125
2,0.809348,0.549095,0.393842
3,0.844963,0.720013,0.487746
4,0.842765,0.69661,0.477192


In [39]:
compute_model_score(ridge, X_train, y_train)

Mean R² : 0.9100013045137221
Mean rmse : 0.0017632764103179652
Mean mae : 0.02171420487212599


Unnamed: 0,r2,rmse,mae
0,0.916091,0.001337,0.019332
1,0.909303,0.001649,0.020701
2,0.870365,0.002783,0.02783
3,0.920802,0.001578,0.021019
4,0.933445,0.001469,0.01969


In [40]:
compute_model_score(lasso, X_train, y_train)

Mean R² : 0.49255063267909394
Mean rmse : 0.009937443695340473
Mean mae : 0.05989832891090004


Unnamed: 0,r2,rmse,mae
0,0.499371,0.008005,0.05116
1,0.499992,0.009114,0.058723
2,0.483876,0.011113,0.064603
3,0.480456,0.010377,0.061452
4,0.499058,0.011078,0.063554


In [41]:
params = {
    'objective': 'reg:squarederror',
    'max_depth': 5,
    'learning_rate': 0.2,
    'n_estimators': 300,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}

xgb = XGBRegressor(**params)

In [42]:
compute_model_score(xgb, X_train, y_train)

Mean R² : 0.9176501154899597
Mean rmse : 168.84745178222656
Mean mae : 6.584983062744141


Unnamed: 0,r2,rmse,mae
0,0.912057,159.574326,6.375999
1,0.924542,165.498444,6.509373
2,0.898632,197.892792,7.394403
3,0.915907,191.477264,6.390359
4,0.937113,129.794434,6.254782


In [43]:
compute_model_score(lr, X_test, y_test)

Mean R² : 0.8963821928164348
Mean rmse : 0.002035491650741608
Mean mae : 0.02299086559432282


Unnamed: 0,r2,rmse,mae
0,0.856326,0.002631,0.023182
1,0.926804,0.001228,0.020285
2,0.908363,0.001959,0.025132
3,0.91212,0.002278,0.023236
4,0.878298,0.002081,0.02312


In [44]:
compute_model_score(xgb, X_test, y_test)

Mean R² : 0.8029023051261902
Mean rmse : 1200.3483276367188
Mean mae : 19.2870512008667


Unnamed: 0,r2,rmse,mae
0,0.733533,1237.517456,17.663464
1,0.758514,2084.174561,28.946243
2,0.822813,610.851624,13.185892
3,0.903798,621.112061,14.762425
4,0.795854,1448.085938,21.877232


**Choix :**  
Mon chois se tourne vers le modèle `LinearRegression` qui réalise le meilleur compromis :  
* **performance/généralisation**

In [45]:
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)

score = r2_score(y_train, y_train_pred)
score

0.9143355243889344

In [46]:
y_pred = lr.predict(X_test)

score_test = r2_score(y_pred, y_test)
score_test

0.9002530119468029

In [47]:
X_test.iloc[0]

Unnamed: 0                  361.000000
quant__gre_score              1.576604
quant__toefl_score            1.424271
quant__university_rating      0.775459
quant__sop                    0.633979
quant__lor                    0.021730
quant__cgpa                   1.597217
bin__research                 1.000000
Name: 0, dtype: float64

In [None]:
[1.576604, 1.424271, ]