In [1]:
import pandas as pd
import numpy as np
from __future__ import print_function
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from get_smarties import Smarties

### Prepare Dataset

In [2]:
# Download the churn CSV and discard the 'Phone' and 'Area Code' columns
# in a single chained expression.
churn_url = 'https://raw.githubusercontent.com/yhat/demo-churn-pred/master/model/churn.csv'
df = pd.read_csv(churn_url).drop(['Phone', 'Area Code'], axis=1)

In [3]:
# Split the frame: every column except the last is a feature, and the last
# column holds the label.
feature_columns = df.columns[:-1]
label_column = df.columns[-1]
X = df[feature_columns]
# Encode the label as int8: 1 where the raw value is the string 'True.', else 0.
Y = (df[label_column] == 'True.').values.astype(np.int8)

### Setup pipeline with Smarties
The `Smarties` class from `get_smarties` implements `fit`/`transform`, which means you can use it directly as a step in your scikit-learn pipelines.

In [4]:
# Two-stage pipeline: the Smarties transformer runs first and its output
# feeds the multinomial naive Bayes classifier.
pipeline_steps = [
    ('smarties', Smarties()),
    ('clf', MultinomialNB()),
]
training_pipeline = Pipeline(pipeline_steps)

### Easily train and test with 10-fold cross-validation

In [5]:
# 10-fold stratified cross-validation: refit the pipeline on each training
# split and record its score on the corresponding held-out split.
accuracies = []
skf = StratifiedKFold(n_splits=10)

# enumerate(..., start=1) supplies the 1-based fold number used in the log
# line, replacing a hand-maintained counter.
for fold, (train_index, test_index) in enumerate(skf.split(X, Y), start=1):
    # X is a DataFrame (positional .iloc); Y is a NumPy array (plain indexing).
    x_train, y_train = X.iloc[train_index], Y[train_index]
    x_test, y_test = X.iloc[test_index], Y[test_index]

    # Fit on this fold's training split only.
    training_pipeline.fit(x_train, y_train)

    # Score on the held-out split.
    accuracy = training_pipeline.score(x_test, y_test)
    accuracies.append(accuracy)

    print('Fold', fold, 'accuracy:', accuracy)

Fold 1 accuracy: 0.571856287425
Fold 2 accuracy: 0.664670658683
Fold 3 accuracy: 0.568862275449
Fold 4 accuracy: 0.597597597598
Fold 5 accuracy: 0.678678678679
Fold 6 accuracy: 0.6996996997
Fold 7 accuracy: 0.654654654655
Fold 8 accuracy: 0.633633633634
Fold 9 accuracy: 0.618618618619
Fold 10 accuracy: 0.66966966967


In [6]:
# Average the per-fold accuracies into a single cross-validated figure.
mean_accuracy = np.average(accuracies)
print('Final k-fold test accuracy:', mean_accuracy)

Final k-fold test accuracy: 0.635794177411
