In [1]:
# My question for the Diabetes SkLearn dataset is : how does each attribute influence the target?
# That is, if we know the values of some attributes, can we predict the target value, which is a
# quantitative measure of the disease progression?

import numpy as np
#import relevant libraries and modules
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.linear_model import LinearRegression
import pandas as pd
import sklearn.datasets
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy import stats


In [2]:
# Load the scikit-learn diabetes dataset.
# Each of the 10 feature columns has been mean centered and scaled by its standard
# deviation times the square root of n_samples (i.e. the sum of squares of each column totals 1).
data_d = datasets.load_diabetes()
# Features: one column per attribute, labelled 0-9.
dX = pd.DataFrame(data_d.data)
# Target: wrapped in a one-column DataFrame so that it carries the column name 'target'
# (a bare Series would have only an index, no columns).
dy = pd.DataFrame(data_d.target, columns=['target'])
# Align features and target row by row on their shared index.
data = dX.merge(dy, left_index=True, right_index=True)

print("Here is how the data looks after it has been mean centered and scaled", dX.head())
print("The shape of the dataset is:", dX.shape)


Here is how the data looks after it has been mean centered and scaled           0         1         2         3         4         5         6  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005671 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

          7         8         9  
0 -0.002592  0.019908 -0.017646  
1 -0.039493 -0.068330 -0.092204  
2 -0.002592  0.002864 -0.025930  
3  0.034309  0.022692 -0.009362  
4 -0.002592 -0.031991 -0.046641  
The shape of the dataset is: (442, 10)


In [3]:
# Create summary statistics for the ten features.
# Question: are certain features more important than others?
# Column key: 0: Age, 1: Gender, 2: BMI (Body Mass Index), 3: Avg Blood Pressure,
# 4 - 9: blood serum measurements.
# target: a quantitative measure of disease progression one year after baseline.
# Open questions: how is it measured? What is the unit (percent?) and what is the baseline (100%?)
dX.describe()

# How to study these values: start with the range (min to max) of each column.
# Every feature has the same standard deviation (about 0.0476 in the output) because each
# column was centered and scaled so that its sum of squares is 1; with 442 rows the sample
# standard deviation is therefore sqrt(1 / 441) = 1 / 21.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-3.639623e-16,1.309912e-16,-8.013951e-16,1.289818e-16,-9.042540000000001e-17,1.301121e-16,-4.563971e-16,3.863174e-16,-3.848103e-16,-3.398488e-16
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123996,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260974,-0.1377672
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665645,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324879,-0.03317903
50%,0.00538306,-0.04464164,-0.007283766,-0.005670611,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947634,-0.001077698
75%,0.03807591,0.05068012,0.03124802,0.03564384,0.02835801,0.02984439,0.0293115,0.03430886,0.03243323,0.02791705
max,0.1107267,0.05068012,0.1705552,0.1320442,0.1539137,0.198788,0.1811791,0.1852344,0.133599,0.1356118


In [4]:
# Summary statistics for the target column.
dy.describe()

# No missing values: the count (442) equals the number of rows in the dataset.
# How can we best extract the data? Perhaps it is not necessary since it is a small dataset.

Unnamed: 0,target
count,442.0
mean,152.133484
std,77.093005
min,25.0
25%,87.0
50%,140.5
75%,211.5
max,346.0


In [5]:
# Pearson correlation between every pair of columns in `data` (the features plus the target).
data.corr(method='pearson')
# Correlate y (the target) with the other variables.
# Z-score normalization is not needed here: this dataset has already been normalized.

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,target
0,1.0,0.173737,0.185085,0.335427,0.260061,0.219243,-0.075181,0.203841,0.270777,0.301731,0.187889
1,0.173737,1.0,0.088161,0.241013,0.035277,0.142637,-0.37909,0.332115,0.149918,0.208133,0.043062
2,0.185085,0.088161,1.0,0.395415,0.249777,0.26117,-0.366811,0.413807,0.446159,0.38868,0.58645
3,0.335427,0.241013,0.395415,1.0,0.24247,0.185558,-0.178761,0.257653,0.393478,0.390429,0.441484
4,0.260061,0.035277,0.249777,0.24247,1.0,0.896663,0.051519,0.542207,0.515501,0.325717,0.212022
5,0.219243,0.142637,0.26117,0.185558,0.896663,1.0,-0.196455,0.659817,0.318353,0.2906,0.174054
6,-0.075181,-0.37909,-0.366811,-0.178761,0.051519,-0.196455,1.0,-0.738493,-0.398577,-0.273697,-0.394789
7,0.203841,0.332115,0.413807,0.257653,0.542207,0.659817,-0.738493,1.0,0.617857,0.417212,0.430453
8,0.270777,0.149918,0.446159,0.393478,0.515501,0.318353,-0.398577,0.617857,1.0,0.46467,0.565883
9,0.301731,0.208133,0.38868,0.390429,0.325717,0.2906,-0.273697,0.417212,0.46467,1.0,0.382483


Columns 2 and 8 of the features have the strongest correlations with the target (about 0.586 and 0.566, respectively).

In [6]:
#data[[2,8]]

In [7]:
# After the basic statistics, correlation analysis, and basic plotting, we may further explore
# our data and identify subgroups' behavior, if we can. Actually, we kind of already did that.
# WHAT IS OUR STORY for the Data Analytic report?

In [8]:
# LINEAR regression: minimize MSE.
# My hypothesis is that the BMI and the second-to-last blood serum measurement influence the
# disease progression of diabetes. We can make predictions based on these data.


# CRISP DM: Based on Data Selection Criteria, decide if one or more attributes are more important than others and
#weight the attributes accordingly. Decide, based on the context (i.e., application, tool, etc.), how to handle
#the weighting.

In [9]:
#kf = KFold(n_splits=5, random_state=3, shuffle=True) 
#kf.get_n_splits(data_d.data) # returns the number of splitting iterations in the cross-validator
#print(kf) 
#print(train_index, test_index in kf.split(dX))

In [10]:
#for train_index, test_index in kf.split(data_d.data):
    
    
 #  X_train, y_train = data_d.data[train_index, 8], data_d.target[train_index]
  #  X_test, y_test = data_d.data[test_index, 8], data_d.target[test_index]

In [11]:
# Ordinary least-squares fit of the target on two predictors:
# column 2 (BMI) and column 8 (a blood serum measurement).
# Note: the model is fitted on the full dataset; no rows are held out for evaluation.
regress = LinearRegression().fit(
    X=data.loc[:, [2, 8]],
    y=data.loc[:, 'target'],
)

In [12]:
# Held-out evaluation, kept for when a train/test split exists:
#y_pred = regress.predict(X_test)
#np.sqrt(mean_squared_error(y_pred, y_test))
# TODO: decide how to score the model.
# Show the fitted coefficients (one per predictor) and then the intercept, one per line.
print(regress.coef_, regress.intercept_, sep="\n")

[675.06977443 614.95050478]
152.1334841628967


In [16]:
# TODO: come back to cross-validation (CV).

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]])