In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# scikit-learn libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score

# for saving/loading model
import joblib

In [None]:
# Load the diabetes dataset that ships with scikit-learn.
# The returned object is kept in `data`; later cells read its `.data`,
# `.target` and `.feature_names` attributes to build the feature table and target.
from sklearn.datasets import load_diabetes

data = load_diabetes()

In [None]:
# Independent variables (features): one column per name in data.feature_names.
X = pd.DataFrame(data=data.data, columns=data.feature_names)

# Dependent variable (target), held as a Series named 'target'.
y = pd.Series(data=data.target, name='target')

# Show the target Series as this cell's output.
y

In [None]:
# Preprocessing checklist
# 1 - check for null values
# 2 - check dtypes
# 3 - handle categorical values:
#       one-hot encoding for nominal categories (OneHotEncoder)
#       ordinal encoding for ordered categories (map to numbers)
# 4 - detect outliers
# 5 - scale features (linear regression doesn't require scaling to work)
# 6 - feature engineering
# 7 - avoid data leakage: transformations must be fit only on the training data

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create a linear regression estimator and fit it on the training split only.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# predict() applies the trained model to the held-out test features and
# returns one predicted target value per test row.
y_pred = model.predict(X=X_test)

In [None]:
# Score the test-set predictions against the true targets:
# mean squared error and the coefficient of determination (R^2).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics.
print("MSE=", mse)
print("R sq score=", r2)

In [None]:
# MSE: lower is better (smaller average squared prediction error).
# R^2: 1 = perfect fit; 0 = no better than always predicting the mean; < 0 = worse than predicting the mean.

In [None]:
# Persist the fitted model to disk so it can be reloaded without retraining.
joblib.dump(value=model, filename="linear_regression_model.pkl")

In [None]:
# Read the persisted model back from the file written by the save step.
loaded_model = joblib.load(filename="linear_regression_model.pkl")

In [None]:
# Predict with the reloaded model and verify the save/load round trip:
# its predictions must match those of the in-memory model on the same test set.
y_loaded_pred=loaded_model.predict(X_test)
np.testing.assert_allclose(y_loaded_pred, y_pred)

In [None]:
# Scatter of predictions against ground truth for the test set.
# x-axis: actual target values (y_test); y-axis: model predictions (y_pred).
plt.scatter(y_test,y_pred,color="blue")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")