# Preprocessing

In [1]:
from sklearn.datasets import load_iris
from sklearn import preprocessing

# load the iris dataset
iris = load_iris()
print(iris.data.shape)

# separate the data from the target attributes
X = iris.data
y = iris.target

(150, 4)


## Normalization

Normalization refers to rescaling real valued numeric attributes into the range 0 and 1. It is useful to scale the input attributes for a model that relies on the magnitude of values, such as distance measures used in k­nearest neighbors and in the preparation of coefficients in regression.

In [2]:
# normalize the data attributes
normalized_X = preprocessing.normalize(X)

## Standardization

Standardization refers to shifting the distribution of each attribute to have a mean of 0 and a standard deviation of 1. It is useful to standardize attributes for a model that relies on the distribution of attributes such as Gaussian processes. 

In [3]:
# standardize the data attributes
standardized_X = preprocessing.scale(X)

## Imputation

* Data can have missing values. 
* These are values for attributes where a measurement could not be taken or is corrupt for some reason.
* It is important to identify and mark this missing data. Once marked, replacement values can be prepared.
* This is useful because some algorithms are unable to work with or exploit missing data.  
 
The following code demonstrates marking 0 values from the Pima Indians dataset as NaN and imputing the missing values with the mean of the attribute.

In [5]:
# Mark 0 values as missing and impute with the mean
import numpy as np
import urllib.request
from sklearn.preprocessing import Imputer

# Load the Pima Indians Diabetes dataset
url = "http://goo.gl/j0Rvxq"
raw_data = urllib.request.urlopen(url)
dataset = np.loadtxt(raw_data, delimiter=",")
print(dataset.shape)

(768, 9)


In [6]:
# separate the data and target attributes
X = dataset[:,0:7]
y = dataset[:,8]

# Mark all zero values as NaN
X[X==0] = np.nan

X

array([[   6.   ,  148.   ,   72.   , ...,      nan,   33.6  ,    0.627],
       [   1.   ,   85.   ,   66.   , ...,      nan,   26.6  ,    0.351],
       [   8.   ,  183.   ,   64.   , ...,      nan,   23.3  ,    0.672],
       ..., 
       [   5.   ,  121.   ,   72.   , ...,  112.   ,   26.2  ,    0.245],
       [   1.   ,  126.   ,   60.   , ...,      nan,   30.1  ,    0.349],
       [   1.   ,   93.   ,   70.   , ...,      nan,   30.4  ,    0.315]])

In [7]:
# Impute all missing values with the mean of the attribute
imp = Imputer(missing_values='NaN', strategy='mean')
imputed_X = imp.fit_transform(X)

imputed_X

array([[   6.        ,  148.        ,   72.        , ...,  155.54822335,
          33.6       ,    0.627     ],
       [   1.        ,   85.        ,   66.        , ...,  155.54822335,
          26.6       ,    0.351     ],
       [   8.        ,  183.        ,   64.        , ...,  155.54822335,
          23.3       ,    0.672     ],
       ..., 
       [   5.        ,  121.        ,   72.        , ...,  112.        ,
          26.2       ,    0.245     ],
       [   1.        ,  126.        ,   60.        , ...,  155.54822335,
          30.1       ,    0.349     ],
       [   1.        ,   93.        ,   70.        , ...,  155.54822335,
          30.4       ,    0.315     ]])

# Modeling

## Linear Regression

* Classical Regression
* Handling Nonlinearity
* Regularization
* Dimensionality

In [8]:
# Linear Regression
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LinearRegression

# load the diabetes datasets
dataset = datasets.load_diabetes()

# fit a linear regression model to the data
model = LinearRegression()
model.fit(dataset.data, dataset.target)
print(model)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)


In [12]:
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)

# summarize the fit of the model
mse = np.mean((predicted-expected)**2)
print(mse)
print(model.score(dataset.data, dataset.target))

2859.69039877
0.517749425413


* *score* returns the *R^2* of the model (http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)
* generally, higher the *score*, the better

## Logistic Regression

* Logistic regression fits a logistic model to data and makes predictions about the probability of an event (between 0 and 1)

We show the fitting of a logistic regression algorithm to the iris dataset. Because this is a mutliclass classification problem and logistic regression makes predictions between 0 and 1, a one-vs-all scheme is used (one model is created per class).

In [14]:
# Logistic Regression
from sklearn import datasets
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# load the iris datasets
dataset = datasets.load_iris()

# fit a logistic regression model to the data
model = LogisticRegression()
model.fit(dataset.data, dataset.target)
print(model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [15]:
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)

# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       0.98      0.90      0.94        50
          2       0.91      0.98      0.94        50

avg / total       0.96      0.96      0.96       150

[[50  0  0]
 [ 0 45  5]
 [ 0  1 49]]
