# Linear regression using Scikit-learn

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler

In [8]:
data = pd.read_csv('data/weatherHistory.csv').head(30)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Formatted Date            30 non-null     object 
 1   Summary                   30 non-null     object 
 2   Precip Type               30 non-null     object 
 3   Temperature (C)           30 non-null     float64
 4   Apparent Temperature (C)  30 non-null     float64
 5   Humidity                  30 non-null     float64
 6   Wind Speed (km/h)         30 non-null     float64
 7   Wind Bearing (degrees)    30 non-null     float64
 8   Visibility (km)           30 non-null     float64
 9   Loud Cover                30 non-null     float64
 10  Pressure (millibars)      30 non-null     float64
 11  Daily Summary             30 non-null     object 
dtypes: float64(8), object(4)
memory usage: 2.9+ KB


First, we start by selecting the features and the target values.

In [22]:
x_train = np.array([data['Humidity'], data['Wind Speed (km/h)'], data['Pressure (millibars)']]).transpose()
y_train = np.array(data['Temperature (C)'])


In [23]:
print(f"X is of size {x_train.shape}")
print(f"Y is of size {y_train.shape}")

X is of size (30, 3)
Y is of size (30,)


Scaling the data before feeding it to the learning algorithm.

In [24]:
scalar = StandardScaler()
x_scaled = scalar.fit_transform(x_train)
print(f" Raw        X:{np.ptp(x_train,axis=0)}")   
print(f"Normalized X : {np.ptp(x_scaled,axis=0)}")

 Raw        X:[ 0.49   18.0159  9.03  ]
Normalized X : [3.69954466 3.85855164 5.40322874]


creating and fitting the data into the linear regression model.

In [26]:
sgdr = SGDRegressor(max_iter=10000)
sgdr.fit(x_scaled, y_train)
print(sgdr)
print(f"number of iterations completed: {sgdr.n_iter_}, number of weight updates: {sgdr.t_}")

SGDRegressor(max_iter=10000)
number of iterations completed: 86, number of weight updates: 2581.0


Finding the optimized paramaters **w** & **b**.

In [28]:
b_opt = sgdr.intercept_
w_opt = sgdr.coef_
print(f"Foud optimized w = {w_opt} & b = {b_opt}")

Foud optimized w = [-3.04329507e+00  1.05557373e-03  6.82903836e-01] & b = [11.72889472]


Once finding optimal w & b, we can now plug them into the linear model.

In [36]:
y_pred_sgd = sgdr.predict(x_scaled)
y_pred_sgd
fx = np.dot(x_scaled, w_opt) + b_opt
print(f"prediction using np.dot() and sgdr.predict match: {(fx == y_pred_sgd).all()}")

print(f"Prediction on training set:\n{fx[:4]}" )
print(f"Target values \n{y_train[:4]}")


prediction using np.dot() and sgdr.predict match: True
Prediction on training set:
[7.80444131 8.69810133 8.13312371 9.70610725]
Target values 
[9.47222222 9.35555556 9.37777778 8.28888889]
