In [1]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import metrics as mt
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

# Load the raw person-level records (a 2.5% subset, going by the file name).
person_raw = pd.read_csv('../data/person-subset-2.5percent.csv')

# Clean the data (as performed in Project 1) by running the shared script in
# this notebook's namespace. Per the original notes it provides a new dataset
# "df" and a list of "important_features"; later cells read both names, and
# neither is defined anywhere else in this notebook.
# NOTE(review): execfile() exists only in Python 2 -- on a Python 3 kernel
# this line raises NameError.
execfile('../python/clean_data_person.py')

In [2]:
def create_affluency():
    """Rebuild the global ``lr`` frame and tag every row with an income tier.

    ``lr`` is rebound to a deep copy of the ``important_features`` columns of
    ``df``. An ``affluency`` column is then added by binning ``df.PINCP`` with
    ``pd.cut``: (-1, 99999.99] is labelled 'general' and (99999.99, 1e12] is
    labelled 'rich'. Values that fall outside both bins are left missing.

    Takes no arguments and returns nothing; the result is the global ``lr``.
    """
    global lr

    income_edges = [-1, 99999.99, 1e12]
    tier_labels = ('general', 'rich')

    lr = df[important_features].copy(deep=True)
    lr['affluency'] = pd.cut(df.PINCP, income_edges, labels=tier_labels)


create_affluency()

In [3]:
# Run the shared classification-prep script in this notebook's namespace.
# NOTE(review): judging by the info() listing below, `lr` ends up with a
# boolean `wealthy` column after this runs, but the script itself is not
# visible here -- confirm what else it changes.
execfile('../python/clean_data_classification.py')
# Summarize the resulting columns, dtypes and non-null counts.
lr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60357 entries, 0 to 60356
Data columns (total 49 columns):
PINCP                                              60357 non-null float64
AGEP                                               60357 non-null int64
PWGTP                                              60357 non-null int64
PAP                                                60357 non-null float64
wealthy                                            60357 non-null bool
is_male                                            60357 non-null int64
Travel_Time__half hour                             60357 non-null uint8
Travel_Time__hour                                  60357 non-null uint8
Travel_Time__long                                  60357 non-null uint8
Travel_Time__na                                    60357 non-null uint8
Travel_Time__short                                 60357 non-null uint8
Citizen__Born Abroad)                              60357 non-null uint8
Citizen__Naturalized    

In [5]:
def get_X_y(regression=False):
    """Build the rescaled feature matrix and the target list from ``lr``.

    Side effect: rebinds the global ``lr2`` to a deep copy of ``lr`` with the
    ``wealthy`` and ``PINCP`` columns removed; the features are taken from it.

    Parameters
    ----------
    regression : bool, default False
        When False the target is the ``wealthy`` column; when True it is the
        ``PINCP`` column.

    Returns
    -------
    (X, y) : tuple
        ``X`` holds the values of every remaining column after per-column
        min-max rescaling (MinMaxScaler with its default range); ``y`` is a
        plain Python list of target values.
    """
    global lr2
    lr2 = lr.copy(deep=True)

    target_column = 'PINCP' if regression else 'wealthy'
    y = list(lr2[target_column].values)

    # Both columns are dropped in either mode, so neither can be a feature.
    # NOTE(review): presumably because `wealthy` is derived from PINCP -- confirm.
    del lr2['wealthy']
    del lr2['PINCP']

    # Min-max rescaling, not standardization: no centering or unit variance.
    # fit_transform is equivalent to the separate fit() + transform() calls.
    # NOTE(review): the scaler is fitted on all rows before any
    # cross-validation split is made -- confirm that is acceptable here.
    X = MinMaxScaler().fit_transform(lr2.values)

    return (X, y)

# Module-level accumulators appended to by fit_and_test(), one entry per call.
# Re-running this cell resets all four lists.
methodSpeeds = []       # elapsed seconds for each call
methodSpeedNames = []   # class name of the model evaluated in each call
_results = []           # per-fold cross-validation scores for each call
names = []              # title passed to each call

def print_accuracy(title, results):
    """Print a one-line summary of a collection of scores.

    Parameters
    ----------
    title : label placed at the end of the printed line.
    results : object exposing ``mean()`` and ``std()`` (for example the
        array returned by ``cross_val_score``).

    The mean is multiplied by 100 and rounded to 2 decimals. The standard
    deviation is reported on the original, unscaled scale, rounded to
    6 decimals.
    """
    mean_pct = round(results.mean() * 100, 2)
    spread = round(results.std(), 6)
    summary = '%s%% accuracy (%s std) - %s' % (mean_pct, spread, title)
    print(summary)

def fit_and_test(title, model, show_individual_accuracies=False, print_confusion=False, regression=False, scoring='accuracy'):
    """Cross-validate ``model`` on the notebook data and record the outcome.

    Parameters
    ----------
    title : label stored in ``names`` and shown in the printed summary.
    model : estimator handed to ``model_selection.cross_val_score``.
    show_individual_accuracies, print_confusion : accepted for callers, but
        not read anywhere in this function body.
    regression : forwarded to ``get_X_y`` to choose the target column.
    scoring : forwarded to ``cross_val_score``.

    Side effects: appends the fold scores to ``_results``, the title to
    ``names``, the model's class name to ``methodSpeedNames`` and the elapsed
    seconds to ``methodSpeeds``; prints one summary line. Returns nothing.
    The elapsed time also covers the ``get_X_y`` call.
    """
    started = datetime.now()

    features, target = get_X_y(regression)
    fold_scores = model_selection.cross_val_score(
        model, features, target, cv=10, scoring=scoring)

    # Keep scores and titles in step so they can be paired up later.
    _results.append(fold_scores)
    names.append(title)

    print_accuracy(title, fold_scores)

    methodSpeedNames.append(model.__class__.__name__)
    elapsed = datetime.now() - started
    methodSpeeds.append(elapsed.total_seconds())

In [6]:
# Evaluate multinomial naive Bayes at three settings of its `alpha` parameter.
# Each pair is (alpha value, title used in the printed summary). After the
# loop, `mb_clf` is left bound to the last model, as before.
for nb_alpha, nb_title in (
        (100, 'bayes multinomial (alpha=100)'),
        (1, 'bayes multinomial (alpha=1)'),
        (.001, 'bayes multinomial (alpha=.001)')):
    mb_clf = MultinomialNB(alpha=nb_alpha)
    fit_and_test(nb_title, mb_clf, show_individual_accuracies=False)

92.74% accuracy (0.001426 std) - bayes multinomial (alpha=100)
92.82% accuracy (0.001239 std) - bayes multinomial (alpha=1)
92.82% accuracy (0.001171 std) - bayes multinomial (alpha=.001)


## Now tune the same MultinomialNB model with grid search

In [9]:
# NOTE(review): this import sits far from the notebook's main import cell.
from sklearn.model_selection import GridSearchCV

# Interactive lookup of the GridSearchCV docs; prints the full class help text.
help(GridSearchCV)

Help on class GridSearchCV in module sklearn.model_selection._search:

class GridSearchCV(BaseSearchCV)
 |  Exhaustive search over specified parameter values for an estimator.
 |  
 |  Important members are fit, predict.
 |  
 |  GridSearchCV implements a "fit" and a "score" method.
 |  It also implements "predict", "predict_proba", "decision_function",
 |  "transform" and "inverse_transform" if they are implemented in the
 |  estimator used.
 |  
 |  The parameters of the estimator used to apply these methods are optimized
 |  by cross-validated grid-search over a parameter grid.
 |  
 |  Read more in the :ref:`User Guide <grid_search>`.
 |  
 |  Parameters
 |  ----------
 |  estimator : estimator object.
 |      This is assumed to implement the scikit-learn estimator interface.
 |      Either estimator needs to provide a ``score`` function,
 |      or ``scoring`` must be passed.
 |  
 |  param_grid : dict or list of dictionaries
 |      Dictionary with parameters names (string) as ke

In [26]:
# Grid of MultinomialNB settings to try: 3 alpha values x 2 fit_prior values.
params = {
    'alpha': [0.0005, 10, 100],
    'fit_prior': [True, False]
}

# cv=3 here, whereas fit_and_test() uses cv=10, so the scores are not computed
# on the same folds. No `scoring` is passed to GridSearchCV.
gs_bayes = GridSearchCV(MultinomialNB(), params, cv=3, n_jobs=-1)
# Classification target (regression defaults to False); this call also
# rebinds the global lr2 as a side effect of get_X_y().
x,y = get_X_y()
# IPython line magic: times the fit of the whole grid.
%time gs_bayes.fit(x, y)

# On the Python 2 kernel that produced the recorded output, this prints the
# two values as a tuple.
print(gs_bayes.best_score_, gs_bayes.best_params_)

CPU times: user 3.01 s, sys: 211 ms, total: 3.22 s
Wall time: 3.62 s
(0.92814420862534586, {'alpha': 0.0005, 'fit_prior': True})


(0.92814420862534586, {'alpha': 0.0005, 'fit_prior': True})
