In [None]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Simple linear regression from the closed-form least-squares formulas.
# sklearn is used only to load the dataset; the fit and the error metric
# are computed by hand with numpy.
X, Y = load_diabetes(return_X_y=True)
BMI_COLUMN = 2  # index of the BMI feature in the sklearn diabetes dataset
X = X[:, BMI_COLUMN]

# Sums needed by the closed-form solution.
sumOfX = X.sum()
sumOfY = Y.sum()
sumOfX2 = (X * X).sum()
sumOfXY = (X * Y).sum()
n = len(X)

# slope     = (n*Sxy - Sx*Sy) / (n*Sxx - Sx^2)
# intercept = (Sy - slope*Sx) / n
slope = (n * sumOfXY - sumOfX * sumOfY) / (n * sumOfX2 - sumOfX * sumOfX)
intercept = (sumOfY - slope * sumOfX) / n

y_pred = slope * X + intercept

# Root-mean-squared error (the square root of the MSE, so it is in the
# units of Y). It is reported next to its size relative to the mean observed
# target; for an in-sample fit with an intercept, mean(y_pred) == mean(Y),
# so this percentage equals the one obtained by dividing by mean(y_pred).
rmse = np.sqrt(np.mean((Y - y_pred) ** 2))
print(f'RMSE: {rmse:3.3} ({rmse / np.mean(Y) * 100:3.3}% of mean target)')

plt.plot(X, y_pred, color='blue', linewidth=3)
plt.scatter(X, Y, color='black')
plt.xlabel('BMI (standardized)')
plt.ylabel('Disease progression')
plt.title('Closed-form least-squares fit')
plt.show()

In [None]:
# Same regression using sklearn's LinearRegression, fitted and evaluated on
# the full dataset (no train/test split), so the error is in-sample.
X, Y = load_diabetes(return_X_y=True)
BMI_COLUMN = 2  # index of the BMI feature in the sklearn diabetes dataset
X = X[:, BMI_COLUMN, np.newaxis]  # sklearn expects a 2-D (n_samples, n_features) array

model = LinearRegression()
model.fit(X, Y)
y_pred = model.predict(X)

# Root-mean-squared error (sqrt of the MSE, so it is in the units of Y),
# reported next to its size relative to the mean observed target.
rmse = np.sqrt(mean_squared_error(Y, y_pred))
print(f'RMSE: {rmse:3.3} ({rmse / np.mean(Y) * 100:3.3}% of mean target)')

plt.plot(X, y_pred, color='blue', linewidth=3)
plt.scatter(X, Y, color='black')
plt.xlabel('BMI (standardized)')
plt.ylabel('Disease progression')
plt.title('LinearRegression fit on the full dataset')
plt.show()

# Result: the numbers match the hand-derived closed-form computation in the
# previous cell, as expected: both solve the same ordinary least-squares problem.

In [None]:
# Regression with sklearn, this time holding out a test set so the reported
# error is an out-of-sample estimate.
X, Y = load_diabetes(return_X_y=True)
BMI_COLUMN = 2  # index of the BMI feature in the sklearn diabetes dataset
X = X[:, BMI_COLUMN, np.newaxis]  # sklearn expects a 2-D (n_samples, n_features) array

# Fix the seed so the split, and therefore the printed metric and the plot,
# are reproducible under Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

model = LinearRegression()
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)

# Root-mean-squared error on the held-out data (sqrt of the MSE, so it is in
# the units of Y). The relative figure is normalised by the mean of the
# observed test targets rather than by the mean of the model's predictions.
rmse = np.sqrt(mean_squared_error(Y_test, y_pred))
print(f'RMSE: {rmse:3.3} ({rmse / np.mean(Y_test) * 100:3.3}% of mean target)')

plt.plot(X_test, y_pred, color='blue', linewidth=3)
plt.scatter(X_test, Y_test, color='black')
plt.xlabel('BMI (standardized)')
plt.ylabel('Disease progression')
plt.title('LinearRegression fit evaluated on the test split')
plt.show()