In [37]:
import numpy as np

# Seed NumPy's legacy global generator so that every random draw made through
# np.random.* in this notebook (x here, eps below) is reproducible after a
# kernel restart followed by "Run All".
np.random.seed(42)

# Create an x vector of 5000 samples drawn from a Gaussian (normal)
# distribution with mean 0 and standard deviation 1.
x = np.random.normal(0, 1, 5000)
print(x)
print(x.shape)

[-0.08192438  1.14634711 -0.29175879 ... -1.44823047  1.08804823
 -3.05490831]
(5000,)


In [38]:
# Noise term: 5000 samples from a Gaussian (normal) distribution
# with mean 0 and standard deviation 0.25.
eps = np.random.normal(loc=0, scale=0.25, size=5000)

# Show the (abbreviated) values followed by the shape, one per line.
print(eps, eps.shape, sep='\n')

[ 0.48805782 -0.26681142 -0.50254015 ...  0.16459297  0.20759637
 -0.37482473]
(5000,)


In [39]:
# Build the target as a cubic polynomial of x plus the noise term:
#   y = -1 + 0.5*x - 2*x^2 + 0.3*x^3 + eps
# The terms are named individually but combined in the same left-to-right order.
linear_term = 0.5 * x
quadratic_term = 2 * x**2
cubic_term = 0.3 * x**3
y = -1 + linear_term - quadratic_term + cubic_term + eps

# Show the (abbreviated) values followed by the shape, one per line.
print(y, y.shape, sep='\n')

[ -0.56649253  -2.86993281  -1.82611657 ...  -6.66550847  -2.229652
 -30.1201558 ]
(5000,)


# Linear Regression

### Implementation 1

In [40]:
from sklearn.linear_model import LinearRegression

# Create the linear regression model with sklearn's default settings.
# Nothing is fitted here; the bare `model` on the last line only displays the estimator.
model = LinearRegression()
model

In [41]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the linear model via sklearn's cross_val_score.
# - scoring: we ask for the mean squared error explicitly; without `scoring`
#   the estimator's own score method (R^2 for LinearRegression) would be used.
# - the metric is the *negative* MSE because sklearn scorers follow a
#   "greater is better" convention.
# - x is turned into a column matrix (n_samples, 1) because sklearn expects 2-D input.
scores = cross_val_score(
    estimator=model,
    X=x[:, np.newaxis],
    y=y,
    scoring='neg_mean_squared_error',
    cv=10,
)

In [42]:
# cross_val_score returned negated MSEs, so convert them back to positive errors.
# np.abs is used instead of multiplying by -1 so the cell is idempotent:
# an MSE is never negative, and re-running this cell can no longer flip the
# sign back to negative values.
scores = np.abs(scores)
scores

array([ 7.93456175,  6.44294515,  8.30492329, 13.57021727,  7.22516571,
        9.44092986,  8.52846606,  9.39598351,  7.57123573, 11.1110155 ])

In [43]:
# The cross-validation error is the average of the per-fold errors.
np.mean(scores)

8.952544382071316

### Implementation 2

In [44]:
from sklearn.model_selection import KFold

# Manual 10-fold cross-validation, using sklearn's KFold only to produce the
# train/validation index arrays for each fold.

# sklearn expects a 2-D feature matrix, so turn the vector x into a single column.
X = x.reshape(-1, 1)

kf = KFold(n_splits=10)
model2 = LinearRegression()

mse_scores = []
for train_index, validation_index in kf.split(X):
    # Fit on the training folds of this split (each fit replaces the previous one).
    model2.fit(X[train_index], y[train_index])

    # Mean squared error on the held-out fold.
    residuals = y[validation_index] - model2.predict(X[validation_index])
    mse_scores.append(np.mean(residuals**2))

# Average error over all folds.
avg_mse = np.mean(mse_scores)
    

In [45]:
# Per-fold mean squared errors collected by the manual K-fold loop.
mse_scores

[7.934561747768387,
 6.442945147947299,
 8.304923291645169,
 13.570217266462413,
 7.2251657052542475,
 9.440929860758432,
 8.52846605845085,
 9.395983508733146,
 7.5712357319642845,
 11.111015501728946]

In [46]:
# Mean of the per-fold errors; already computed as avg_mse at the end of the K-fold loop.
avg_mse

8.952544382071316

Using the second implementation, we can also get the coefficient of the regressor (note that `model2` holds the fit from the last fold only).

In [47]:
# Slope learned by model2; since model2 was re-fitted in every fold,
# this is the coefficient from the last fold's fit.
model2.coef_

array([1.39352142])

# Polynomial Regression

### With cross validation

In [48]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [49]:
# Inspect the shape of each piece of the split to confirm the 80/20 partition.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4000, 1), (1000, 1), (4000,), (1000,))

In [50]:
from sklearn.preprocessing import PolynomialFeatures

# Search polynomial degrees 1..10 and keep the one with the lowest
# 10-fold cross-validated MSE on the training split.
best_degree, best_score, best_model = 0, float('inf'), None

for degree in range(1, 11):
    # Expand x into polynomial features up to x**degree so that the plain
    # linear regression model can fit a polynomial.
    poly_features = PolynomialFeatures(degree=degree)
    X_poly_train = poly_features.fit_transform(X_train)

    # Fit on the whole training split so that the kept model is already fitted.
    model_poly = LinearRegression().fit(X_poly_train, y_train)

    # cross_val_score reports negated MSEs; negate the mean to get the error.
    scores = cross_val_score(
        model_poly,
        X_poly_train,
        y_train,
        cv=10,
        scoring='neg_mean_squared_error',
    )
    score = -scores.mean()

    # Strict comparison: on a tie the lower degree seen first is kept.
    if score < best_score:
        best_score, best_degree, best_model = score, degree, model_poly

print(f'Best degree: {best_degree}')

Best degree: 3


In [51]:
# Rebuild the polynomial features for the selected degree and refit the best
# model on the training split.
poly_features = PolynomialFeatures(degree=best_degree)
X_poly_train = poly_features.fit_transform(X_train)
best_model.fit(X_poly_train, y_train)

# coef_[0] belongs to the constant column added by PolynomialFeatures, so it is
# skipped; the constant term is reported through intercept_ instead.
print(f'Weights: {best_model.intercept_}, {best_model.coef_[1:]}')

Weights: -1.0036897853663835, [ 0.50856931 -2.00537092  0.29766059]


As we can see, the estimated intercept and coefficients are very close to the true ones used to generate the data (-1, 0.5, -2, 0.3).

### Without cross validation

In [52]:
# Same reproducible 80/20 split as before, recreated so this section can be
# run on its own.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [53]:
# Search polynomial degrees 1..10 and keep the one with the lowest MSE on the
# held-out split (single train/test split, no cross-validation).
best_score = float('inf')
best_degree = 0
best_model = None

for degree in range(1, 11):
    poly_features = PolynomialFeatures(degree=degree)
    # Fit the transformer on the training data only ...
    X_poly_train = poly_features.fit_transform(X_train)
    # ... and only *apply* it to the held-out data. Transformers should never
    # be re-fitted on the evaluation split.
    X_poly_test = poly_features.transform(X_test)

    model_poly = LinearRegression()
    model_poly.fit(X_poly_train, y_train)

    y_pred = model_poly.predict(X_poly_test)

    # Mean squared error on the held-out split.
    mse2 = np.mean((y_test - y_pred)**2)

    # Strict comparison: on a tie the lower degree seen first is kept.
    if mse2 < best_score:
        best_score = mse2
        best_degree = degree
        best_model = model_poly

print(f'Best degree: {best_degree}')

Best degree: 3


In [54]:
# Rebuild the polynomial features for the selected degree and refit the best
# model on the training split.
poly_features = PolynomialFeatures(degree=best_degree)
X_poly_train = poly_features.fit_transform(X_train)
best_model.fit(X_poly_train, y_train)

# coef_[0] belongs to the constant column added by PolynomialFeatures, so it is
# skipped; the constant term is reported through intercept_ instead.
print(f'Weights: {best_model.intercept_}, {best_model.coef_[1:]}')

Weights: -1.0036897853663835, [ 0.50856931 -2.00537092  0.29766059]


We see the same results both with and without cross-validation: both approaches select degree 3, so the final model and its weights are identical. Cross-validation does not change the fitted model; it only gives us a more reliable estimate of the error we are trying to measure, because the estimate is averaged over several splits and is therefore less sensitive to outliers in any one part of the data.

## References
We used the links below to help us implement the code.

In [None]:
# https://data36.com/polynomial-regression-python-scikit-learn/
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
