# Introduction to sklearn

### Importing libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

### Loading the dataset

In [2]:
diabetes = datasets.load_diabetes()

In [3]:
print(diabetes['DESCR'])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - Age
      - Sex
      - Body mass index
      - Average blood pressure
      - S1
      - S2
      - S3
      - S4
      - S5
      - S6

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Brad

In [4]:
# Use only one feature
diabetes_X = diabetes.data[:, np.newaxis, 2]

### Splitting the dataset (Train y Test)

In [5]:
# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]

In [6]:
# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]

### Creating the linear regression model

In [7]:
regr = linear_model.LinearRegression()
#clf = linear_model.LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')

In [8]:
regr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

### Training the model with the training sets

In [9]:
regr.fit(diabetes_X_train, diabetes_y_train)
#clf.fit(diabetes_X_train, diabetes_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [10]:
regr.coef_

array([938.23786125])

In [11]:
regr.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}

In [12]:
regr.intercept_

152.91886182616167

### Actually making sense of the coefficients

In [13]:
# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(diabetes_y_test, diabetes_y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))

Coefficients: 
 [938.23786125]


NameError: name 'diabetes_y_pred' is not defined

### Making the predictions through inference

In [None]:
diabetes_y_pred = regr.predict(diabetes_X_test)
#diabetes_y_pred = clf.predict(diabetes_X_test)

In [None]:
diabetes_y_pred

### Printing the results

In [None]:
regr.score(diabetes_X_test, diabetes_y_test)
#clf.score(X_test,y_test)

In [None]:
# Plot outputs
plt.scatter(diabetes_X_test, diabetes_y_test,  color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()