In [1]:
import numpy as np
import pandas as pd
import patsy


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.grid_search import GridSearchCV



# ![](https://ga-dash.s3.amazonaws.com/production/assets/logo-9f88ae6c9c3871690e33280fcf557f33.png) Advanced Model Tuning

Week 5 | Day 1

### LEARNING OBJECTIVES
*After this lesson, you will be able to:*
- Explain what gridsearch is and why it's useful in machine learning
- Implement auto-tuning in sklearn

## Recap

At this point we have learned about a couple of differenct classification models.


Check: Which models do we know know?

## Recap

And we've seen that each of these models have inputs that can be set upon initialization

Check: What are some of these inputs? Take a quick look...
    
[K-NN](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)<br>
[Logistic Regression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

## Given all the possible inputs, how can we fit the best model?

## Autotune!


It isn't realistic for us to attempt to construct and all the possible combinations of models by hand.

Fortunately, sklearn provides a number of built-in functions, that allow us to search the space of all possible models to find the best one. (Recall LassoCV and RidgeCV)

We're going to walk through an example of how to do that now step by step...

## We are going to build a model to predict crime in SF


## The features will included day of week, time, and district
## The target is the type of crime

## Load our data set 

In [4]:
sf_crime = pd.read_csv('./assets/datasets/sf_crime_train.csv')
sf_crime = sf_crime.dropna()

In [5]:
sf_crime.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,5/13/15 23:53,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,5/13/15 23:53,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,5/13/15 23:33,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,5/13/15 23:30,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,5/13/15 23:30,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


## Data type conversions and transformations

In [6]:
sf_crime['Dates'] = pd.to_datetime(sf_crime['Dates'])
sf_crime_dates = pd.DatetimeIndex(sf_crime['Dates'].values, dtype='datetime64[ns]', freq=None)

sf_crime['hour'] = sf_crime_dates.hour
sf_crime['month'] = sf_crime_dates.month
sf_crime['year'] = sf_crime_dates.year

## Let's see what all the listed crimes are

In [8]:
sf_crime['Category'].unique()

array(['WARRANTS', 'OTHER OFFENSES', 'LARCENY/THEFT', 'VEHICLE THEFT',
       'VANDALISM', 'NON-CRIMINAL', 'ROBBERY', 'ASSAULT', 'WEAPON LAWS',
       'BURGLARY', 'SUSPICIOUS OCC', 'DRUNKENNESS',
       'FORGERY/COUNTERFEITING', 'DRUG/NARCOTIC', 'STOLEN PROPERTY',
       'SECONDARY CODES', 'TRESPASS', 'MISSING PERSON', 'FRAUD',
       'KIDNAPPING', 'RUNAWAY', 'DRIVING UNDER THE INFLUENCE',
       'SEX OFFENSES FORCIBLE', 'PROSTITUTION', 'DISORDERLY CONDUCT',
       'ARSON', 'FAMILY OFFENSES', 'LIQUOR LAWS', 'BRIBERY',
       'EMBEZZLEMENT', 'SUICIDE', 'LOITERING', 'SEX OFFENSES NON FORCIBLE',
       'EXTORTION', 'GAMBLING', 'BAD CHECKS'], dtype=object)

## We'll select a subsection of the listed crimes

In [75]:
subset = ['BURGLARY','DRUG/NARCOTIC']
sf_crime_sub = sf_crime[sf_crime['Category'].isin(subset)]

In [76]:
sf_crime_sub.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,hour,month,year
49,2015-05-13 19:52:00,BURGLARY,"BURGLARY, VEHICLE (ARREST MADE)",Wednesday,PARK,"ARREST, BOOKED",1500 Block of HAIGHT ST,-122.447761,37.769846,19,5,2015
87,2015-05-13 18:30:00,BURGLARY,"BURGLARY OF RESIDENCE, ATTEMPTED FORCIBLE ENTRY",Wednesday,BAYVIEW,NONE,1300 Block of FELTON ST,-122.417938,37.726605,18,5,2015
97,2015-05-13 18:00:00,BURGLARY,"BURGLARY OF APARTMENT HOUSE, UNLAWFUL ENTRY",Wednesday,SOUTHERN,NONE,0 Block of 6TH ST,-122.409504,37.781526,18,5,2015
104,2015-05-13 17:55:00,BURGLARY,"BURGLARY,STORE UNDER CONSTRUCTION, UNLAWFUL ENTRY",Wednesday,SOUTHERN,"ARREST, BOOKED",1200 Block of MARKET ST,-122.415449,37.778294,17,5,2015
107,2015-05-13 17:47:00,DRUG/NARCOTIC,POSSESSION OF NARCOTICS PARAPHERNALIA,Wednesday,BAYVIEW,NONE,0 Block of WHITFIELD CT,-122.381838,37.731104,17,5,2015


## Check the total number of districts

In [77]:
sf_crime_sub['PdDistrict'].unique()

array(['PARK', 'BAYVIEW', 'SOUTHERN', 'CENTRAL', 'NORTHERN', 'INGLESIDE',
       'TARAVAL', 'MISSION', 'TENDERLOIN', 'RICHMOND'], dtype=object)

In [78]:
sf_crime_sub['PdDistrict'].nunique()

10

## Set up our design matrix and target vector with Patsy

### Patsy allows us to use R-style formulas to do this 
[Patsy Docs](http://patsy.readthedocs.io/en/latest/)

In [79]:
X = patsy.dmatrix('~ C(hour) + C(DayOfWeek) + C(PdDistrict)', sf_crime_sub)
y = sf_crime_sub['Category'].values

In [80]:
y

array(['BURGLARY', 'BURGLARY', 'BURGLARY', ..., 'DRUG/NARCOTIC',
       'DRUG/NARCOTIC', 'BURGLARY'], dtype=object)

In [81]:
X.design_info.column_names

['Intercept',
 'C(hour)[T.1]',
 'C(hour)[T.2]',
 'C(hour)[T.3]',
 'C(hour)[T.4]',
 'C(hour)[T.5]',
 'C(hour)[T.6]',
 'C(hour)[T.7]',
 'C(hour)[T.8]',
 'C(hour)[T.9]',
 'C(hour)[T.10]',
 'C(hour)[T.11]',
 'C(hour)[T.12]',
 'C(hour)[T.13]',
 'C(hour)[T.14]',
 'C(hour)[T.15]',
 'C(hour)[T.16]',
 'C(hour)[T.17]',
 'C(hour)[T.18]',
 'C(hour)[T.19]',
 'C(hour)[T.20]',
 'C(hour)[T.21]',
 'C(hour)[T.22]',
 'C(hour)[T.23]',
 'C(DayOfWeek)[T.Monday]',
 'C(DayOfWeek)[T.Saturday]',
 'C(DayOfWeek)[T.Sunday]',
 'C(DayOfWeek)[T.Thursday]',
 'C(DayOfWeek)[T.Tuesday]',
 'C(DayOfWeek)[T.Wednesday]',
 'C(PdDistrict)[T.CENTRAL]',
 'C(PdDistrict)[T.INGLESIDE]',
 'C(PdDistrict)[T.MISSION]',
 'C(PdDistrict)[T.NORTHERN]',
 'C(PdDistrict)[T.PARK]',
 'C(PdDistrict)[T.RICHMOND]',
 'C(PdDistrict)[T.SOUTHERN]',
 'C(PdDistrict)[T.TARAVAL]',
 'C(PdDistrict)[T.TENDERLOIN]']

## Let's look at our design matrix as a DataFrame

In [82]:
pdf = pd.DataFrame(X, columns=X.design_info.column_names)
pdf['Target'] = y
pdf

Unnamed: 0,Intercept,C(hour)[T.1],C(hour)[T.2],C(hour)[T.3],C(hour)[T.4],C(hour)[T.5],C(hour)[T.6],C(hour)[T.7],C(hour)[T.8],C(hour)[T.9],...,C(PdDistrict)[T.CENTRAL],C(PdDistrict)[T.INGLESIDE],C(PdDistrict)[T.MISSION],C(PdDistrict)[T.NORTHERN],C(PdDistrict)[T.PARK],C(PdDistrict)[T.RICHMOND],C(PdDistrict)[T.SOUTHERN],C(PdDistrict)[T.TARAVAL],C(PdDistrict)[T.TENDERLOIN],Target
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,BURGLARY
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BURGLARY
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,BURGLARY
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,BURGLARY
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DRUG/NARCOTIC
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BURGLARY
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,DRUG/NARCOTIC
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,BURGLARY
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,BURGLARY
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,DRUG/NARCOTIC


## Let's see how many districts are listed in our design matrix 

In [83]:
sf_crime_sub['PdDistrict'].nunique()

10

In [84]:
[x for x in pdf.columns if 'PdDistrict' in x]

['C(PdDistrict)[T.CENTRAL]',
 'C(PdDistrict)[T.INGLESIDE]',
 'C(PdDistrict)[T.MISSION]',
 'C(PdDistrict)[T.NORTHERN]',
 'C(PdDistrict)[T.PARK]',
 'C(PdDistrict)[T.RICHMOND]',
 'C(PdDistrict)[T.SOUTHERN]',
 'C(PdDistrict)[T.TARAVAL]',
 'C(PdDistrict)[T.TENDERLOIN]']

In [85]:
pd.Series([x for x in pdf.columns if 'PdDistrict' in x]).nunique()

9

## And how many hours?

In [86]:
sf_crime_sub['hour'].nunique()

24

In [87]:
pd.Series([x for x in pdf.columns if 'hour' in x]).nunique()

23

## Check: Why is there one less on both?

## Set up our training and testing sets

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=77)

## Now let's fit a standard logistic regression model

In [89]:
lr = LogisticRegression(solver='liblinear')

In [90]:
lr_model = lr.fit(X_train, y_train)

## Make our predictions

In [91]:
lr_ypred = lr_model.predict(X_test)

## Check our misclassifications with a confusion matrix

[Confusion Matrix](http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py)

In [92]:
# actual = index; predicted = columns
lr_cm = confusion_matrix(y_test, lr_ypred, labels=lr.classes_)
lr_cm = pd.DataFrame(lr_cm, columns=lr.classes_, index=lr.classes_)
lr_cm

Unnamed: 0,BURGLARY,DRUG/NARCOTIC
BURGLARY,212,37
DRUG/NARCOTIC,91,66


## Check our precision, recall, and f1

[Precision & Recall](https://www.quora.com/What-is-the-best-way-to-understand-the-terms-precision-and-recall)<br>
[F1](https://en.wikipedia.org/wiki/F1_score)

In [93]:
print classification_report(y_test, lr_ypred, labels=lr.classes_)

               precision    recall  f1-score   support

     BURGLARY       0.70      0.85      0.77       249
DRUG/NARCOTIC       0.64      0.42      0.51       157

  avg / total       0.68      0.68      0.67       406



## Check the CV Score

In [94]:
cvs1 = cross_val_score(lr, X, y, cv=3, scoring='f1_weighted')
cvs1

array([ 0.61981714,  0.66558914,  0.68167336])

In [95]:
cvs1.mean()

0.65569321229443955

## Let's now use a penalized regression - we'll use LASSO (L1)

In [101]:
lr_l1 = LogisticRegression(C=1, penalty='l1', solver='liblinear')
lr_l1_model = lr_l1.fit(X_train, y_train)
lr_l2 = LogisticRegression(C=1, penalty='l2', solver='liblinear')
lr_l2_model = lr_l1.fit(X_train, y_train)


In [102]:
lr_l1_model = lr_l1.fit(X_train, y_train)
lr_l2_model = lr_l2.fit(X_train, y_train)

In [106]:
lr_l1_ypred = lr_l1_model.predict(X_test)
lr_l2_ypred = lr_l2_model.predict(X_test)

## Get the confusion matrix

In [107]:
lr_l1_cm = confusion_matrix(y_test, lr_l1_ypred, labels=lr_l1.classes_)
lr_l1_cm = pd.DataFrame(lr_l1_cm, columns=lr_l1.classes_, index=lr_l1.classes_)
lr_l1_cm


Unnamed: 0,BURGLARY,DRUG/NARCOTIC
BURGLARY,221,28
DRUG/NARCOTIC,95,62


In [108]:
lr_l2_cm = confusion_matrix(y_test, lr_l2_ypred, labels=lr_l2.classes_)
lr_l2_cm = pd.DataFrame(lr_l2_cm, columns=lr_l2.classes_, index=lr_l2.classes_)
lr_l2_cm

Unnamed: 0,BURGLARY,DRUG/NARCOTIC
BURGLARY,212,37
DRUG/NARCOTIC,91,66


## Get the classification report

In [110]:
print classification_report(y_test, lr_l1_ypred, labels=lr_l1.classes_)
print classification_report(y_test, lr_l2_ypred, labels=lr_l2.classes_)

               precision    recall  f1-score   support

     BURGLARY       0.70      0.89      0.78       249
DRUG/NARCOTIC       0.69      0.39      0.50       157

  avg / total       0.70      0.70      0.67       406

               precision    recall  f1-score   support

     BURGLARY       0.70      0.85      0.77       249
DRUG/NARCOTIC       0.64      0.42      0.51       157

  avg / total       0.68      0.68      0.67       406



## Get mean cross val score

In [111]:
cvs2_l1 = cross_val_score(lr_l1, X, y, cv=3)
cvs2_l2 = cross_val_score(lr_l2, X, y, cv=3)

In [112]:
print cvs2_l1.mean()
print cvs2_l2.mean()

0.675100880593
0.675921839903


 ## Looks like a minimal + change with L1 penalty at 1, how about other values?

## We can build a function to test this

In [119]:
def test_penalties_l1(c_val):
    lr_l1 = LogisticRegression(C=c_val, penalty='l1', solver='liblinear')
    cvs_l1 = cross_val_score(lr_l1, X, y, cv=3, scoring='f1_weighted')
    return cvs_l1
def test_penalties_l2(c_val):
    lr_l2 = LogisticRegression(C=c_val, penalty='l2', solver='liblinear')
    cvs_l2 = cross_val_score(lr_l2, X, y, cv=3, scoring='f1_weighted')
    return cvs_l2

In [122]:
# let's test it...
test_cs_l1 = pd.Series([.001, .01, .1, 1, 1.5, 2.5, 5, 10, 100]).to_frame('c_vals')
score_frame_l1 = pd.DataFrame([test_penalties_l1(x) for x in test_cs_l1['c_vals']]).mean(axis=1).to_frame('score')

final_scores_l1 = pd.concat([test_cs, score_frame], axis=1)
print final_scores_l1

# let's test it...
test_cs_l2 = pd.Series([.001, .01, .1, 1, 1.5, 2.5, 5, 10, 100]).to_frame('c_vals')
score_frame_l2 = pd.DataFrame([test_penalties_l2(x) for x in test_cs_l2['c_vals']]).mean(axis=1).to_frame('score')

final_scores_l2 = pd.concat([test_cs, score_frame], axis=1)
print final_scores_l2

    c_vals     score
0    0.001  0.445245
1    0.010  0.445245
2    0.100  0.625001
3    1.000  0.651551
4    1.500  0.653042
5    2.500  0.652712
6    5.000  0.650600
7   10.000  0.650875
8  100.000  0.648290
    c_vals     score
0    0.001  0.445245
1    0.010  0.445245
2    0.100  0.625001
3    1.000  0.651551
4    1.500  0.653042
5    2.500  0.652712
6    5.000  0.650600
7   10.000  0.650875
8  100.000  0.648290


In [123]:
# and so the best c value...
print final_scores_l1['c_vals'][final_scores['score'].idxmax()]
print final_scores_l2['c_vals'][final_scores['score'].idxmax()]

1.5
1.5


## That wasn't too bad, but...

But that was only changing one parameter. What if we wanted to try out L2 as well as L1? Or if we had a different algorithm with numerous parameters? It can start to become a hassle to code these from scratch.

Fortunately, sk-learn has a function that will do this for us.

In [52]:
# fit model with three folds and lasso regularization
# use Cs=20 to test a grid of 20 distinct parameters
# remeber: Cs describes the inverse of regularization strength
logreg_cv = LogisticRegressionCV(Cs=20, solver='liblinear', cv=3, penalty='l1', scoring='f1')
cv_model = logreg_cv.fit(X_train, y_train)

## Find best C per class

In [53]:
print('best C for class:')
best_C = {logreg_cv.classes_[i]:x for i, (x, c) in enumerate(zip(logreg_cv.Cs_, logreg_cv.classes_))}
print(best_C)

best C for class:
{'BURGLARY': 0.0001, 'VEHICLE THEFT': 0.00069519279617756048, 'DRUG/NARCOTIC': 0.00026366508987303583}


## Get the classification report for our best model

In [54]:
print classification_report(y_test, logreg_cv.predict(X_test))

               precision    recall  f1-score   support

     BURGLARY       0.48      0.40      0.44       249
DRUG/NARCOTIC       0.69      0.25      0.36       170
VEHICLE THEFT       0.53      0.78      0.63       306

  avg / total       0.55      0.53      0.50       725



## Exercise

Using the data set (pdf):
- Fit two models to predict between "Burglary" and "Drug/Narcotic" crimes
- One model should use an L1 penalty and the other should use an L2 penalty
- Make sure to use train_test_split
- Print out a confusion matrix and a classification report for both models
- Finally, build a third model that uses LogisticRegressionCV
- Print our a confusion matrix, classification report and the best value of C

## So LogisticRegressionCV is useful for finding the best penalty, but we had to manually change our penalty from 'l1' to 'l2' to try both...What if there was a better way...?

## Introducing GridSearchCV

[GridSearchCV](http://scikit-learn.org/0.17/modules/generated/sklearn.grid_search.GridSearchCV.html)

## To start we'll select a model and penalties and some hyperparameters 

Then will pass those to GridSearchCV

In [124]:
logreg = LogisticRegression(solver='liblinear')
C_vals = [0.0001, 0.001, 0.01, 0.1, .15, .25, .275, .33, 0.5, .66, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]
penalties = ['l1','l2']

gs = GridSearchCV(logreg, {'penalty': penalties, 'C': C_vals}, verbose=False, cv=15)
gs.fit(X, y)

GridSearchCV(cv=15, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.0001, 0.001, 0.01, 0.1, 0.15, 0.25, 0.275, 0.33, 0.5, 0.66, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=False)

## Now let's find the best parameters

In [125]:
gs.best_params_

{'C': 0.1, 'penalty': 'l1'}

## Use this parameter to fit, predict, and print a classification_report for our X and Y

In [126]:
logreg = LogisticRegression(C=gs.best_params_['C'], penalty=gs.best_params_['penalty'])
cv_model = logreg.fit(vice_X_train, vice_y_train)

NameError: name 'vice_X_train' is not defined

In [183]:
cv_pred = cv_model.predict(vice_X_test)

## Now let's check our stats...

In [184]:
cm3 = confusion_matrix(vice_y_test, cv_pred, labels=logreg.classes_)
cm3 = pd.DataFrame(cm3, columns=logreg.classes_, index=logreg.classes_)

In [185]:
cm3

Unnamed: 0,BURGLARY,DRUG/NARCOTIC
BURGLARY,227,24
DRUG/NARCOTIC,95,60


In [205]:
print classification_report(vice_y_test, cv_pred, labels=logreg.classes_)

             precision    recall  f1-score   support

   BURGLARY       0.70      0.90      0.79       251
DRUG/NARCOTIC       0.71      0.39      0.50       155

avg / total       0.71      0.71      0.68       406



## Independent Practice

Use GridSearchCV with knn on the iris data set:
- Use train_test_split with a train size of .66
- Set a parameter dictionary with the number of neighbors and at least one other parameter
- Get your best estimator and print out a classification report

In [132]:
from sklearn import svm, grid_search, datasets
import statsmodels.api as sm

iris = sm.datasets.get_rdataset('iris','datasets')
y = iris.data.Species.copy()
x = iris.data.ix[:, 0:4].copy()
x['constant']=1.


Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,constant
0,5.1,3.5,1.4,0.2,1.0
1,4.9,3.0,1.4,0.2,1.0
2,4.7,3.2,1.3,0.2,1.0
3,4.6,3.1,1.5,0.2,1.0
4,5.0,3.6,1.4,0.2,1.0
5,5.4,3.9,1.7,0.4,1.0
6,4.6,3.4,1.4,0.3,1.0
7,5.0,3.4,1.5,0.2,1.0
8,4.4,2.9,1.4,0.2,1.0
9,4.9,3.1,1.5,0.1,1.0


In [138]:
from sklearn import svm, grid_search, datasets
import statsmodels.api as sm

iris = sm.datasets.get_rdataset('iris','datasets')
y = iris.data.Species.copy()
x = iris.data.ix[:, 0:4].copy()
x['constant']=1.

X = patsy.
('~ C(Sepal.Length) + C(Sepal.Width) + C(Petal.Length) + C(Petal.Width)', x)
#y = sf_crime_sub['Category'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.66, random_state=77)
# C_vals = [0.0001, 0.001, 0.01, 0.1, .15, .25, .275, .33, 0.5, .66, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]
# parameters = {'n_neighbors':3, 'C':C_vals}

# # >>> svr = svm.SVC()
# clf = grid_search.GridSearchCV(svr, parameters)
# clf.fit(iris.data, iris.target)



# # >>> from sklearn.neighbors import KNeighborsClassifier
# # >>> neigh = KNeighborsClassifier(n_neighbors=3)
# # >>> neigh.fit(X, y) 
# # KNeighborsClassifier(...)

# # logreg = LogisticRegression(solver='liblinear')
# # C_vals = [0.0001, 0.001, 0.01, 0.1, .15, .25, .275, .33, 0.5, .66, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]
# # penalties = ['l1','l2']

# # gs = GridSearchCV(logreg, {'penalty': penalties, 'C': C_vals}, verbose=False, cv=15)
# # gs.fit(X, y)



AttributeError: 'module' object has no attribute 'x'