In [43]:
import sklearn.datasets
diabetes = sklearn.datasets.load_diabetes()

In [44]:
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [45]:
x = diabetes.data
y = diabetes.target

In [46]:
from sklearn.model_selection import cross_val_score

In [47]:
import numpy as np
import matplotlib.pyplot as plt

In [54]:
#k-nearest neighbors
import sklearn.neighbors
from sklearn.metrics import mean_squared_error

In [49]:
#split into training and testing sets
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
        x, y, test_size=0.2, random_state=42)

#calculate cross-validated MSE for k neighbors between 1 to 20
k_range = range(1, 20)
k_scores = []
for k in k_range:
    knn = sklearn.neighbors.KNeighborsRegressor(n_neighbors=k)
    loss = cross_val_score(knn,
                           x_train,
                           y_train, 
                           cv=5, 
                           scoring='neg_mean_squared_error')
    k_score = np.sqrt(-loss).mean()
    k_scores.append(np.sqrt(-loss).mean())
    print('Cross validated MSE for ' + str(k) + ' = ' + str(k_score))

Cross validated MSE for 1 = 79.60028726447867
Cross validated MSE for 2 = 69.71662373552864
Cross validated MSE for 3 = 66.3218962404476
Cross validated MSE for 4 = 63.44692657786659
Cross validated MSE for 5 = 62.925437159134425
Cross validated MSE for 6 = 61.75759380655749
Cross validated MSE for 7 = 60.55169550739739
Cross validated MSE for 8 = 60.359801521121234
Cross validated MSE for 9 = 59.94991223293763
Cross validated MSE for 10 = 60.37795939742043
Cross validated MSE for 11 = 60.18520317866974
Cross validated MSE for 12 = 59.77150116645919
Cross validated MSE for 13 = 59.217708682875525
Cross validated MSE for 14 = 59.10810275282845
Cross validated MSE for 15 = 58.863205905709926
Cross validated MSE for 16 = 58.8375903397181
Cross validated MSE for 17 = 58.743089148139816
Cross validated MSE for 18 = 58.76943286592912
Cross validated MSE for 19 = 59.07287764735884


In [50]:
#determine the lowest MSE
min(k_scores)

58.743089148139816

In [52]:
#set the number of neighbors and fit the model onto training data
kNeighborModel = sklearn.neighbors.KNeighborsRegressor(n_neighbors=17)
kNeighborModel.fit(x_train, y_train)

#predict y using testing x test group
y_kNeighborPred = kNeighborModel.predict(x_test)

In [55]:
print('MSE_knn = ', mean_squared_error(y_test, y_kNeighborPred))

MSE_knn =  3003.3901481279886


In [57]:
#Linear Regression
linearRegModel = sklearn.linear_model.LinearRegression()

In [58]:
linearRegModel.fit(x_train, y_train)

LinearRegression()

In [59]:
y_linearPred = linearRegModel.predict(x_test)

In [85]:
print('MSE_LinearReg = ', mean_squared_error(y_test, y_linearPred))

MSE_LinearReg =  2900.1732878832318


In [56]:
import sklearn.tree

In [71]:
#calculate cross-validated MSE for depths between 1 to 10
depth_range = range(1, 11)
depth_scores = []
for depth in depth_range:
    depthN = sklearn.tree.DecisionTreeRegressor(max_depth=depth)
    loss = cross_val_score(depthN,
                           x_train,
                           y_train, 
                           cv=5, 
                           scoring='neg_mean_squared_error')
    depth_score = np.sqrt(-loss).mean()
    depth_scores.append(np.sqrt(-loss).mean())
    print('Cross validated MSE for ' + str(depth) + ' = ' + str(depth_score))

Cross validated MSE for 1 = 66.77484860968592
Cross validated MSE for 2 = 61.26383483081285
Cross validated MSE for 3 = 65.03284855777811
Cross validated MSE for 4 = 65.44586519511108
Cross validated MSE for 5 = 67.26256229735742
Cross validated MSE for 6 = 71.01815859479089
Cross validated MSE for 7 = 74.96966657782475
Cross validated MSE for 8 = 78.454080034318
Cross validated MSE for 9 = 78.63336699257707
Cross validated MSE for 10 = 81.19921351368258


In [72]:
min(depth_scores)

61.26383483081285

In [73]:
decisionTree2Model = sklearn.tree.DecisionTreeRegressor(max_depth=2)

x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, y, test_size=0.2, random_state=42)

decisionTree2Model.fit(x_train, y_train)
y_decisionTree2Pred = decisionTree2Model.predict(x_test)

In [74]:
print('MSE_depth2 = ', mean_squared_error(y_test, y_decisionTree2Pred))

MSE_depth2 =  3866.038156768628


In [78]:
text_representation = sklearn.tree.export_text(decisionTree2Model)
print(text_representation)

|--- feature_2 <= 0.01
|   |--- feature_8 <= 0.01
|   |   |--- value: [100.56]
|   |--- feature_8 >  0.01
|   |   |--- value: [164.67]
|--- feature_2 >  0.01
|   |--- feature_2 <= 0.07
|   |   |--- value: [191.10]
|   |--- feature_2 >  0.07
|   |   |--- value: [271.08]



In [79]:
from sklearn.ensemble import RandomForestRegressor

In [80]:
#calculate cross-validated MSE for depths between 1 to 10

depth_range = range(1, 11)
depth_scores = []
for depth in depth_range:
    depthN = sklearn.ensemble.RandomForestRegressor(max_depth=depth)
    loss = cross_val_score(depthN,
                           x_train,
                           y_train, 
                           cv=5, 
                           scoring='neg_mean_squared_error')
    depth_score = np.sqrt(-loss).mean()
    depth_scores.append(np.sqrt(-loss).mean())
    print('Cross validated MSE for ' + str(depth) + ' = ' + str(depth_score))

Cross validated MSE for 1 = 62.58143608432808
Cross validated MSE for 2 = 58.91838011934176
Cross validated MSE for 3 = 58.227148899300175
Cross validated MSE for 4 = 58.12636031742737
Cross validated MSE for 5 = 57.939587082433434
Cross validated MSE for 6 = 57.76896867426526
Cross validated MSE for 7 = 58.54064487556832
Cross validated MSE for 8 = 58.25849163514437
Cross validated MSE for 9 = 58.7596656314178
Cross validated MSE for 10 = 58.99689767140732


In [81]:
min(depth_scores)

57.76896867426526

In [82]:
randomForest6Model = sklearn.ensemble.RandomForestRegressor(max_depth=6)

x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x, y, test_size=0.2, random_state=42)

randomForest6Model.fit(x_train, y_train)
y_randomForest6Pred = randomForest6Model.predict(x_test)

In [83]:
print('MSE_depth6 = ', mean_squared_error(y_test, y_randomForest6Pred))

MSE_depth6 =  2957.109367678497


In [84]:
text_representation = sklearn.tree.export_text(decisionTree2Model)
print(text_representation)

|--- feature_2 <= 0.01
|   |--- feature_8 <= 0.01
|   |   |--- value: [100.56]
|   |--- feature_8 >  0.01
|   |   |--- value: [164.67]
|--- feature_2 >  0.01
|   |--- feature_2 <= 0.07
|   |   |--- value: [191.10]
|   |--- feature_2 >  0.07
|   |   |--- value: [271.08]



In [None]:
#The lowest MSE that was found among the 4 models belonged to the linear regression model
#(MSE_LinearReg =  2900.1732878832318)