In [2]:
#import some useful things

import numpy as np
import urllib

url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"

raw_data = urllib.request.urlopen(url)

dataset = np.loadtxt(raw_data, delimiter = ',')

In [5]:
X = dataset[:, 0:7]
y = dataset[:, 8]

In [6]:
from sklearn import preprocessing
normalized_X = preprocessing.normalize(X)

standardized_X = preprocessing.scale(X)

In [7]:
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X, y)

print(model.feature_importances_)

[ 0.12519484  0.26216808  0.10791575  0.09221217  0.09091399  0.18076016
  0.14083501]


In [8]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

#create rfe model and select 3 attributes

rfe = RFE(model, 3)
rfe = rfe.fit(X, y)

print(rfe.support_)
print(rfe.ranking_)

[ True False False False False  True  True]
[1 2 3 5 4 1 1]


In [9]:
#trying to use logistic regression and initialize model

model = LogisticRegression()
model.fit(X, y)
print(model)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)


In [10]:
#make predictions
expected = y
predicted = model.predict(X)

In [11]:
#summarize the fit of the model
print(metrics.classification_report(expected, predicted))

             precision    recall  f1-score   support

        0.0       0.79      0.89      0.84       500
        1.0       0.74      0.55      0.63       268

avg / total       0.77      0.77      0.77       768



In [12]:
print(metrics.confusion_matrix(expected, predicted))

[[447  53]
 [120 148]]


In [13]:
#try to use SVM

from sklearn import metrics
from sklearn.svm import SVC

model = SVC()

model.fit(X, y)
print(model)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)


In [14]:
expected = y
predicted = model.predict(X)

In [15]:
print(metrics.classification_report(expected, predicted))

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       500
        1.0       1.00      1.00      1.00       268

avg / total       1.00      1.00      1.00       768



In [16]:
print(metrics.confusion_matrix(expected, predicted))

[[500   0]
 [  0 268]]


In [17]:
#how to choose really good parameter for the model?

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV

#prepare a range of values alpha

alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])

In [18]:
model = Ridge()

In [19]:
grid = GridSearchCV(estimator = model, param_grid = dict(alpha=alphas))

In [20]:
grid.fit(X, y)
print(grid)

GridSearchCV(cv=None,
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   0.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)


In [21]:
print(grid.best_estimator_.alpha)

1.0


In [22]:
#for randon variant of parameters select

import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn.grid_search import RandomizedSearchCV

# prepare a uniform distribution to sample for the alpha parameter
param_grid = {'alpha': sp_rand()}

# create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(X, y)
print(rsearch)

# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

RandomizedSearchCV(cv=None,
          estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001),
          fit_params={}, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff83d4228d0>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=0)
0.282118853386
0.9962037431004837
