In [1]:
# Plan: a function taking x, y, max_degree and cv that, for each degree, builds polynomial
# features + linear regression, evaluates the model with cross_val_score, and keeps the
# degree whose mean score is the highest seen so far.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as snb
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [18]:
# Synthetic dataset: a noisy quadratic, y = 3x^2 + 2x + 5 + noise.
np.random.seed(42)  # fixed seed so the noise draw is the same on every run
X = np.linspace(-5, 5, 100).reshape(-1, 1)  # 100 evenly spaced points as a column
noise = 5 * np.random.randn(100, 1)  # standard-normal draws scaled by 5
y = 3 * X**2 + 2 * X + 5 + noise

In [19]:
def _poly_regression(degree):
    """Build an unfitted PolynomialFeatures(degree) -> LinearRegression pipeline."""
    return Pipeline([
        ('poly', PolynomialFeatures(degree)),
        ('lin', LinearRegression()),
    ])


def best_degree(x, y, max_degree=10, cv=5):
    """Select the polynomial degree with the highest mean cross-validated R2.

    For every degree from 1 to ``max_degree`` a polynomial-regression pipeline
    is scored with ``cross_val_score(..., scoring='r2')``; the mean score per
    degree is printed. A fresh pipeline with the winning degree is then fitted
    on all of ``x``/``y`` and returned.

    Parameters:
        x : feature matrix passed straight to scikit-learn
        y : target values
        max_degree : highest polynomial degree to try (must be >= 1)
        cv : cross-validation setting forwarded to ``cross_val_score``

    Returns:
        (final_model, best_degree) : the pipeline fitted on the full data and
        the selected degree. Ties keep the lowest degree because only a
        strictly greater mean score replaces the current best.

    Raises:
        ValueError : if ``max_degree`` is less than 1 (no degree to evaluate).

    NOTE(review): with an integer ``cv`` scikit-learn's documented default for
    regressors is an unshuffled KFold. If ``x`` is sorted (e.g. built with
    ``np.linspace``) each fold is a contiguous slice, so scores measure
    extrapolation; consider passing ``KFold(shuffle=True, random_state=...)``
    as ``cv`` -- confirm against the scikit-learn docs.
    """
    if max_degree < 1:
        # Without this the loop below never runs and the function would
        # report a best score of -inf for a degree it never evaluated.
        raise ValueError(f"max_degree must be >= 1, got {max_degree}")

    top_degree = 1
    top_score = -np.inf

    for degree in range(1, max_degree + 1):
        scores = cross_val_score(_poly_regression(degree), x, y, cv=cv, scoring='r2')
        mean_score = scores.mean()
        print(f"Degree {degree}: CV R2 = {mean_score:.4f}")

        if mean_score > top_score:
            top_score, top_degree = mean_score, degree

    print("Best degree: ", top_degree)
    print("Best cv r2 score: ", top_score)

    # cross_val_score fits clones only, so refit a new pipeline on all data.
    final_model = _poly_regression(top_degree)
    final_model.fit(x, y)
    return final_model, top_degree
        
        
    

In [20]:

# Try degrees 1 through 8 with the default cv=5; the cell's value is the
# (fitted pipeline, selected degree) tuple returned by best_degree.
best_degree(X, y, max_degree=8)

Degree 1: CV R2 = -16.1381
Degree 2: CV R2 = 0.6720
Degree 3: CV R2 = 0.6140
Degree 4: CV R2 = 0.4254
Degree 5: CV R2 = -1.5278
Degree 6: CV R2 = -12.0786
Degree 7: CV R2 = -15.0556
Degree 8: CV R2 = -673.4066
Best degree:  2
Best cv r2 score:  0.6720464849100121


(Pipeline(steps=[('poly', PolynomialFeatures()), ('lin', LinearRegression())]),
 2)

In [21]:
# The above was the manual-loop approach.
# Now a better approach: using GridSearchCV.

In [37]:
from sklearn.model_selection import GridSearchCV
def find_best_degree(x,y,max_degree=8,cv=5,scoring = 'r2'):
    """
    Finds the best polynomial degree using GridSearchCV.

    A PolynomialFeatures -> LinearRegression pipeline is searched over
    ``poly__degree`` in 1..max_degree, using all available cores
    (``n_jobs=-1``). The selected degree, its score and the best model
    are printed before being returned.

    Parameters:
        x : feature matrix
        y : target vector
        max_degree : maximum polynomial degree to test (must be >= 1)
        cv : number of cross-validation folds (forwarded to GridSearchCV)
        scoring : metric to optimize ('r2', 'neg_mean_squared_error', etc.)

    Returns:
        best_model : pipeline with the best degree (``grid.best_estimator_``,
            which GridSearchCV refits on all of x, y under its default
            ``refit=True``)
        best_score : mean cross-validated score of the best degree
        best_degree : selected polynomial degree

    Raises:
        ValueError : if ``max_degree`` is less than 1.
    """
    if max_degree < 1:
        # Fail early with a clear message instead of handing GridSearchCV
        # an empty list of candidate degrees.
        raise ValueError(f"max_degree must be >= 1, got {max_degree}")

    pipeline = Pipeline([
        ('poly', PolynomialFeatures()),
        ('lin', LinearRegression()),
    ])
    param_grid = {
        "poly__degree": list(range(1, max_degree + 1)),
    }
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
    )
    grid.fit(x, y)

    degree = grid.best_params_['poly__degree']
    best_score = grid.best_score_
    best_model = grid.best_estimator_

    # Label the score with the metric actually optimized; for the default
    # scoring='r2' the printed line is unchanged.
    if isinstance(scoring, str):
        metric = scoring
    elif scoring is None:
        metric = "default"
    else:
        metric = "custom"

    print("best degree:", degree)
    print(f"best cv {metric} score:", best_score)
    print("best model:", best_model)
    return best_model, best_score, degree

In [38]:
# Grid search with the defaults (max_degree=8, cv=5, scoring='r2'); the cell's
# value is the (best_model, best_score, best_degree) tuple.
find_best_degree(X,y)

best degree: 2
best cv r2 score: 0.6720464849100121
best model: Pipeline(steps=[('poly', PolynomialFeatures()), ('lin', LinearRegression())])


(Pipeline(steps=[('poly', PolynomialFeatures()), ('lin', LinearRegression())]),
 np.float64(0.6720464849100121),
 2)