In [75]:
import os
import numpy as np
import pandas as pd

%matplotlib inline
pd.set_option("max_rows", 10)
np.set_printoptions(suppress=True)

with open("data\\adult.names") as fin:
    notes = fin.read()

# Scikit-Learn Transformers

- all have .transform() method
- OneHotEncoder, PCA, SVD, etc.

# Scikit-Learn Estimators

- all have .predict() method
- DecisionTreeClassifier(), LogisticRegression(), LinearRegression(), RandomForestClassifier(), etc.

In [76]:
dta = pd.read_csv("data/adult.data.cleaned.csv.gz", compression="gzip")
test = pd.read_csv("data/adult.test.cleaned.csv.gz", compression="gzip")

y = dta.pop("income")
y_test = test.pop("income")

# Data Pre-processing using LabelBinarizer Transformer
- Scikit-learn estimators only take numeric data (float)

In [77]:
from sklearn.preprocessing import LabelBinarizer

binarizer = LabelBinarizer()

estimator.fit_transform() is equivalent of doing
estimator.fit(), then 
estimator.transform()

estimator.fit() mutates the estimator object according to given dataset

estimator.transform() use the mutated estimator object, apply its ruls to data input and return output

In [78]:
try:
    print binarizer.classes_
except:
    print "no classes_ attribute available"
binarizer.fit(dta.native_country.head(15))
print binarizer.classes_

no classes_ attribute available
['?' 'Cuba' 'India' 'Jamaica' 'United-States']


In [79]:
dta.native_country.head(15).values

array(['United-States', 'United-States', 'United-States', 'United-States',
       'Cuba', 'United-States', 'Jamaica', 'United-States',
       'United-States', 'United-States', 'United-States', 'India',
       'United-States', 'United-States', '?'], dtype=object)

In [80]:
# perform 1-hot encoding
binarizer.fit_transform(dta.native_country.head(15))

array([[0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0]])

In [81]:
# Pandas equivalent of LabelBinarizer for doing 1-hot encoding
X_train = pd.get_dummies(dta)

X_test = pd.get_dummies(test)

In [82]:
# Sanity check
X_train.columns.difference(X_test.columns)

Index([u'native_country_Holand-Netherlands'], dtype='object')

In [83]:
# make a column in testing set for the missing observation, then assign all zeros because of no observation
X_test[X_train.columns.difference(X_test.columns)[0]] = 0

# make sure that training set and testing set have same column ordering
X_test = X_test[X_train.columns]

# Example of a customized Transformer
-  reliably transforms DataFrames and Arrays

In [110]:
from sklearn.base import TransformerMixin, BaseEstimator


class PandasTransformer(TransformerMixin, BaseEstimator):
    def __init__(self, dataframe):
        self.columns = dataframe.columns
        self.obj_columns = self.get_obj_cols(dataframe, index=True)
        obj_index = np.zeros(dataframe.shape[1], dtype=bool)
        obj_index[self.obj_columns] = True
        self.obj_index = obj_index
        
    def get_obj_cols(self, dta, index=False):
        """
        Return all column names whose data type is not float
        
        dta : pd.DataFrame
        index : bool
                Whether to return column names or the numeric index.
                Default False, returns column names.
        """
        columns = dta.columns.tolist()
        obj_col_names = list(filter(lambda x : dta[x].dtype.kind == "O", 
                                columns))
        if not index:
            return obj_col_names
        else:
            return list(columns.index(col) for col in obj_col_names) 
        
    def fit(self, X, y=None):
        X = np.asarray(X)
        # create the binarizer transforms
        _transformers = {}
        for col in self.obj_columns:
            print "column index:\n" + str(col)
            print "column value:"
            print X[:, col]
            _transformers.update({col : LabelBinarizer().fit(X[:, col])})
        
        self._transformers = _transformers
        return self
    
    def transform(self, X, y=None):
        X = np.asarray(X)
        
        dummies = None
        for col in self.obj_columns:
            if dummies is None:
                dummies = self._transformers[col].transform(X[:, col])
            else:
                new_dummy = self._transformers[col].transform(X[:, col])
                dummies = np.column_stack((dummies, new_dummy))
            
        # remove original columns
        X = X[:, ~self.obj_index]
        X = np.column_stack((X, dummies))
        return X
    
    def demo(self):
        obj_cols = self.get_obj_cols(dta)

        for col in obj_cols:
            print col
        
pt = PandasTransformer(dta)
pt.demo()

workclass
education
marital_status
occupation
relationship
race
sex
native_country


# A Decision Tree Estimator

In [111]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

dtree = DecisionTreeClassifier(random_state = 0, max_depth = 2)

dtree.fit(X_train, y) # does the traning here for decision tree estimator

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=0, splitter='best')

#Pipeline object,  chained transformers and estimators with same Scikit-learn API
- in this case chaining custom PandasTransformer and DecisionTreeClassifier in to a pipeline object, one call does it all

In [112]:
from sklearn.pipeline import Pipeline

dtree_estimator_pipeline = Pipeline([('transformer', PandasTransformer(dta)), 
                            ('dtree', dtree)])

In [113]:
dtree_estimator_pipeline.fit(dta, y) # print outs are from PandasTransformer.fit() calls

column index:
1
column value:
['State-gov' 'Self-emp-not-inc' 'Private' ..., 'Private' 'Private'
 'Self-emp-inc']
column index:
3
column value:
['Bachelors' 'Bachelors' 'HS-grad' ..., 'HS-grad' 'HS-grad' 'HS-grad']
column index:
5
column value:
['Never-married' 'Married-civ-spouse' 'Divorced' ..., 'Widowed'
 'Never-married' 'Married-civ-spouse']
column index:
6
column value:
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' ..., 'Adm-clerical'
 'Adm-clerical' 'Exec-managerial']
column index:
7
column value:
['Not-in-family' 'Husband' 'Not-in-family' ..., 'Unmarried' 'Own-child'
 'Wife']
column index:
8
column value:
['White' 'White' 'White' ..., 'White' 'White' 'White']
column index:
9
column value:
['Male' 'Male' 'Male' ..., 'Female' 'Male' 'Female']
column index:
13
column value:
['United-States' 'United-States' 'United-States' ..., 'United-States'
 'United-States' 'United-States']


Pipeline(steps=[('transformer', PandasTransformer(dataframe=None)), ('dtree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=0, splitter='best'))])

In [117]:
# can only execute one of the steps within the pipeline

print  "Calling PandasTransformer part of pipeline"
dtree_estimator_pipeline.named_steps['transformer'].fit(dta)

print "Calling DecisionTreeClassifier part of pipeline"
dtree_estimator_pipeline.named_steps['dtree']

Calling PandasTransformer part of pipeline
column index:
1
column value:
['State-gov' 'Self-emp-not-inc' 'Private' ..., 'Private' 'Private'
 'Self-emp-inc']
column index:
3
column value:
['Bachelors' 'Bachelors' 'HS-grad' ..., 'HS-grad' 'HS-grad' 'HS-grad']
column index:
5
column value:
['Never-married' 'Married-civ-spouse' 'Divorced' ..., 'Widowed'
 'Never-married' 'Married-civ-spouse']
column index:
6
column value:
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' ..., 'Adm-clerical'
 'Adm-clerical' 'Exec-managerial']
column index:
7
column value:
['Not-in-family' 'Husband' 'Not-in-family' ..., 'Unmarried' 'Own-child'
 'Wife']
column index:
8
column value:
['White' 'White' 'White' ..., 'White' 'White' 'White']
column index:
9
column value:
['Male' 'Male' 'Male' ..., 'Female' 'Male' 'Female']
column index:
13
column value:
['United-States' 'United-States' 'United-States' ..., 'United-States'
 'United-States' 'United-States']
Calling DecisionTreeClassifier part of pipeline


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=0, splitter='best')

In [119]:
# directly make predictions using pipeline object
dtree_estimator_pipeline.predict_proba(test)

array([[ 0.95085099,  0.04914901],
       [ 0.66898258,  0.33101742],
       [ 0.66898258,  0.33101742],
       ..., 
       [ 0.28082345,  0.71917655],
       [ 0.95085099,  0.04914901],
       [ 0.28082345,  0.71917655]])

# ALL objects in Scikit-learn can be serialized and save to disk
- picklable
- joblib is preferable to using pickle, more efficient

In [46]:
import joblib

ImportError: No module named joblib

# Ensemble methods
- Boosting
    - fits series of models sequentially on modified/weighted data
    - Gradient Boosting, Ada Boosting
- Bagging
    - Bootstrap Aggregating (Bagging), NOT sequential
    - Random Forest