In [48]:
import pandas as pd
import numpy as np
import math
from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn import metrics 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier 
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.sklearn import EnsembleClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [49]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


Please build a classifier to predict the probability of a user purchasing from Staples.com by looking at the first 8 pages viewed by the user in the session.  Use the classifier to predict the labels for the data in the test set.



In [50]:
## Load Dara 
train = pd.read_csv('SparX/train.csv', header = None)
test = pd.read_csv('SparX/test.csv', header = None)

In [51]:
col_names_train = ['activity_1','time_1','activity_2','time_2','activity_3','time_3','activity_4','time_4','activity_5','time_5',
                 'activity_6','time_6','activity_7','time_7','activity_8','time_8','target']

col_names_test = ['activity_1','time_1','activity_2','time_2','activity_3','time_3','activity_4','time_4','activity_5','time_5',
                 'activity_6','time_6','activity_7','time_7','activity_8','time_8']

In [53]:
train.columns = col_names_train
train.head()

Unnamed: 0,activity_1,time_1,activity_2,time_2,activity_3,time_3,activity_4,time_4,activity_5,time_5,activity_6,time_6,activity_7,time_7,activity_8,time_8,target
0,OTHER_PAGE,0,SKU,53,OTHER_PAGE,442,SKU,528,SKU,541,SKU,545,OTHER_PAGE,668,OTHER_PAGE,686,0
1,OTHER_PAGE,0,OTHER_PAGE,17,CLASS,53,CLASS,96,CART,180,CART,631,CART,641,OTHER_PAGE,647,1
2,OTHER_PAGE,0,OTHER_PAGE,891,OTHER_PAGE,891,SKU,899,SKU,899,CART,937,CART,949,CART,973,0
3,OTHER_PAGE,0,SEARCH_RESULTS,7,OTHER_PAGE,726,SEARCH_RESULTS,731,SEARCH_RESULTS,740,SEARCH_RESULTS,744,SEARCH_RESULTS,753,SEARCH_RESULTS,768,0
4,OTHER_PAGE,0,OTHER_PAGE,35,OTHER_PAGE,35,OTHER_PAGE,41,OTHER_PAGE,139,OTHER_PAGE,139,SKU,247,OTHER_PAGE,263,0


In [54]:
train.columns

Index([u'activity_1', u'time_1', u'activity_2', u'time_2', u'activity_3', u'time_3', u'activity_4', u'time_4', u'activity_5', u'time_5', u'activity_6', u'time_6', u'activity_7', u'time_7', u'activity_8', u'time_8', u'target'], dtype='object')

In [55]:
test.columns = col_names_test
test.head()

Unnamed: 0,activity_1,time_1,activity_2,time_2,activity_3,time_3,activity_4,time_4,activity_5,time_5,activity_6,time_6,activity_7,time_7,activity_8,time_8
0,SKU,0,SKU,373,SKU,817,SKU,1546,SKU,1696,SKU,1706,SKU,1717,SKU,1727
1,OTHER_PAGE,0,OTHER_PAGE,43,OTHER_PAGE,68,OTHER_PAGE,111,OTHER_PAGE,111,SKU,206,SKU,206,SKU,292
2,OTHER_PAGE,0,OTHER_PAGE,0,SKU,26,OTHER_PAGE,47,OTHER_PAGE,48,SEARCH_RESULTS,89,SEARCH_RESULTS,95,SEARCH_RESULTS,106
3,SKU,0,OTHER_PAGE,7,OTHER_PAGE,7,CART,16,CART,74,CART,174,CART,241,CART,458
4,OTHER_PAGE,0,OTHER_PAGE,0,OTHER_PAGE,7,SEARCH_RESULTS,16,SKU,34,SKU,314,SKU,335,SKU,387


In [56]:
test.columns

Index([u'activity_1', u'time_1', u'activity_2', u'time_2', u'activity_3', u'time_3', u'activity_4', u'time_4', u'activity_5', u'time_5', u'activity_6', u'time_6', u'activity_7', u'time_7', u'activity_8', u'time_8'], dtype='object')

In [57]:
target = train['target']
del train['target']

### Feature Engineering Pipeline

In [47]:
def cat_features_corpus(data):
    '''
    Create a corpus form the different activities per transaction
    '''
    activity_corpus = []

    countvec = data[['activity_1','activity_2','activity_3','activity_4','activity_5','activity_6',\
                      'activity_7','activity_8']]

    for i in range(len(countvec)):
        activity_corpus.extend([(' ').join(countvec.ix[i,:])])
                  
    return activity_corpus

def count_vectorizer(train_, test_, **mode):
    '''
    Create a count vectorized version og the inpute data for all the activities
    in the original data
    '''
    count_vect = CountVectorizer()
    
    for name, value in mode.items():
        if value is 'train':
            train_data = count_vect.fit_transform(train_)
            return pd.DataFrame(train_data.A, columns=count_vect.get_feature_names())
        if value is 'test':
            train_data = count_vect.fit_transform(train_)
            test_data = count_vect.transform(test_)
            return pd.DataFrame(test_data.A, columns=count_vect.get_feature_names())
        
def activity_time_delta_features(data):
    '''
    (dataframe) -> (dataframe)
    Builds out a new set of features that shows the time delta between on activity 
    and the one before it
    '''
    time_features = data[['time_1','time_2','time_3','time_4','time_5','time_6','time_7','time_8']]

    delta_87 = time_features['time_8'] - time_features['time_7']
    delta_76 = time_features['time_7'] - time_features['time_6']
    delta_65 = time_features['time_6'] - time_features['time_5']
    delta_54 = time_features['time_5'] - time_features['time_4']
    delta_43 = time_features['time_4'] - time_features['time_3']
    delta_32 = time_features['time_3'] - time_features['time_2']
    delta_21 = time_features['time_2'] 
    total_time = time_features['time_8']
        
    cols_data = [delta_87,delta_76,delta_65,delta_54,delta_43,delta_32,delta_21,total_time]
    cols = ['delta_87','delta_76,','delta_65','delta_54','delta_43','delta_32','delta_21','total_time']

    time_delta_df = pd.DataFrame()
    for col_name, data in zip(cols, cols_data) :
        time_delta_df[col_name] = data
        
    return time_delta_df

def feature_engineering(train, test, **mode):
    '''
    (dataframe) -> (dataframe)
    Feature Engineering Pipeline 
    This function applies the same feature engineering actions to both the training data and the prediction data 
    '''    
    corpus_df_train = cat_features_corpus(train)
    corpus_df_test = cat_features_corpus(test)
    
    time_train = activity_time_delta_features(train)
    time_test = activity_time_delta_features(test)
      
    for name, value in mode.items():
        if value is 'train':
            train_vec_data = count_vectorizer(corpus_df_train, corpus_df_test, mode = 'train')
            return train_vec_data.merge(time_train, left_index=True, right_index=True, how='left')
        if value is 'test':
            test_vec_data = count_vectorizer(corpus_df_train, corpus_df_test, mode = 'test')
            return test_vec_data.merge(time_test, left_index=True, right_index=True, how='left')


In [12]:
## run test and train data through feature engineering pipeline 
train_data = feature_engineering(train, test, mode = 'train')
test_data = feature_engineering(train, test, mode = 'test')

In [13]:
train_data.head()

Unnamed: 0,account,cart,class,department,home,other_page,search_results,sku,skuset,delta_87,"delta_76,",delta_65,delta_54,delta_43,delta_32,delta_21,total_time
0,0,0,0,0,0,4,0,4,0,18,123,4,13,86,389,53,686
1,0,3,2,0,0,3,0,0,0,6,10,451,84,43,36,17,647
2,0,3,0,0,0,3,0,2,0,24,12,38,0,8,0,891,973
3,0,0,0,0,0,2,6,0,0,15,9,4,9,5,719,7,768
4,0,0,0,0,0,7,0,1,0,16,108,0,98,6,0,35,263


In [14]:
test_data.head()

Unnamed: 0,account,cart,class,department,home,other_page,search_results,sku,skuset,delta_87,"delta_76,",delta_65,delta_54,delta_43,delta_32,delta_21,total_time
0,0,0,0,0,0,0,0,8,0,10,11,10,150,729,444,373,1727
1,0,0,0,0,0,5,0,3,0,86,0,95,0,43,25,43,292
2,0,0,0,0,0,4,3,1,0,11,6,41,1,21,26,0,106
3,0,5,0,0,0,2,0,1,0,217,67,100,58,9,0,7,458
4,0,0,0,0,0,3,1,4,0,52,21,280,18,9,7,0,387


In [15]:
print len(train_data)
print len(test_data)

122907
30967


In [64]:
# split data
data_train, data_test, target_train, target_test = train_test_split(train_data, np.array(target), test_size=0.25, random_state=50)


We will be building models with several algorithmns to get a feel for what we may expect. 

- Random Forest 
- Ada Boost 
- Logistic REgression 
- KNN
- Gradient Boosting 
- Decision Tree


In [65]:
## build and test different tree based models

def build_model(data_train, target_train, data_test, target_test,  clf):
    clf.fit(data_train,target_train) 
    print(clf)
    expected = target_test 
    predicted = clf.predict(data_test) 
    predicted_probs = clf.predict_proba(data_test)
    #summarize
    print(metrics.classification_report(expected,predicted)) 
    print(metrics.confusion_matrix(expected,predicted))
    print 'ROC-AUC  : %0.3f' %(metrics.roc_auc_score (expected,predicted))
    print 'Accuracy : %0.3f' %(metrics.accuracy_score (expected,predicted))
        
    return predicted, predicted_probs, clf



def model_ensemble(*predictions, **prediction_mode):
    '''    
    Perform Model Averaging or ensembling

    (predictions instances for the different models, string) -> (array)
    Possible options prediction_modes : ['binary', 'probs_binary', 'probs_multi_class'] 
    
    binary : 0/1 binary predictions
    probs : likelihood / probabilities
    
    *prediction takes the prediction results from either a given set of instances of a prediction or    
    all the preictions for all the intances in the prediction set
    **prediction_mode takes the prediction mode
    
    binary
    ------
    bin_one
    >>> array([0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0])
    bin_two
    >>> array([0 0 0 1 0 0 1 1 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1 1 1 1 0 0 0 0])
    bin_three
    >>> array([0 0 0 1 0 1 1 1 0 1 1 1 0 0 1 1 1 0 0 1 0 1 0 0 0 1 1 1 1 0])
    bin_four
    >>> array([1 0 0 0 1 0 1 1 1 0 1 0 1 1 1 0 1 0 1 0 0 0 1 1 0 1 0 1 0 1])
    model_ensemble(bin_one, bin_two, bin_three, bin_four, mode = 'binary')
    >>> array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0])
    
    probs_binary / probs_multi_class
    --------------------------------
    probs_one
    >>>[[ 0.14130402  0.63774128]
     [ 0.50817373  0.98390654]
     [ 0.11188119  0.7557338 ]
     [ 0.22295128  0.21470534]
     [ 0.0471746   0.18134759]]
    probs_two
    >>> [[ 0.43791867  0.2953648 ]
     [ 0.17505953  0.73938007]
     [ 0.55570754  0.91328122]
     [ 0.57098714  0.9477655 ]
     [ 0.03664345  0.21077864]]
    probs_three
    >>> [[ 0.94495286  0.31086103]
     [ 0.73768356  0.97090608]
     [ 0.47792986  0.06297078]
     [ 0.12999474  0.3582617 ]
     [ 0.22364166  0.59997857]]
    model_ensemble(probs_one, probs_two, probs_three, mode = 'probs')
    >>> array([[ 0.94285177,  0.45720821],
       [ 0.57108943,  0.3851022 ],
       [ 0.52443183,  0.89947945],
       [ 0.56761841,  0.13814018],
       [ 0.58913088,  0.45394568]])
     '''
    
    for name, value in prediction_mode.items():
        if value is 'probs':
            # model averaging - Soft Ensembling
            model_avg = np.array(predictions)
            return model_avg.mean(axis = 0)
        
        majority_vote = []
        if value is 'binary':    
            # perform model averaging or majority vote - Hard Ensembling
            # advisable to have odd number of models to break ties 
            for i in xrange(len(predictions)):
                majority_vote.append(np.argmax(np.bincount((np.vstack((predictions)))[:,i])))
            return np.array(majority_vote)
        

In [18]:
# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators = 100)  
rf_predicted,rf_predicted_probs, clf_ = build_model(data_train, target_train, data_test, target_test,  rf_model)
# clf_.predict_proba(data_test)  

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
             precision    recall  f1-score   support

          0       0.79      0.89      0.84     22509
          1       0.53      0.36      0.43      8218

avg / total       0.72      0.74      0.73     30727

[[19923  2586]
 [ 5269  2949]]
ROC-AUC  : 0.622
Accuracy : 0.744


In [19]:
# AdaBoost Classifier
ada_model=AdaBoostClassifier() 
ada_predicted,ada_predicted_probs, clf_ = build_model(data_train, target_train, data_test, target_test,  ada_model)
# clf_.predict_proba(test)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
             precision    recall  f1-score   support

          0       0.79      0.89      0.84     22509
          1       0.54      0.36      0.43      8218

avg / total       0.72      0.75      0.73     30727

[[20031  2478]
 [ 5277  2941]]
ROC-AUC  : 0.624
Accuracy : 0.748


In [20]:
# Decision Tree Classifier
tree_model=DecisionTreeClassifier() 
tree_predicted,tree_predicted_probs, clf_ = build_model(data_train, target_train, data_test, target_test,  tree_model)
# clf_.predict_proba(test)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')
             precision    recall  f1-score   support

          0       0.79      0.77      0.78     22509
          1       0.41      0.42      0.41      8218

avg / total       0.68      0.68      0.68     30727

[[17420  5089]
 [ 4746  3472]]
ROC-AUC  : 0.598
Accuracy : 0.680


In [26]:
# K Neighbors Classifier
knn_model=KNeighborsClassifier()
knn_predicted,knn_predicted_probs, clf_ =  build_model(data_train, target_train, data_test, target_test, knn_model)
# clf_.predict_proba(test)    

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform')
             precision    recall  f1-score   support

          0       0.74      0.87      0.80     22509
          1       0.29      0.15      0.20      8218

avg / total       0.62      0.67      0.64     30727

[[19500  3009]
 [ 6987  1231]]
ROC-AUC  : 0.508
Accuracy : 0.675


In [27]:
# Gradient Boosting Classifier
gbrt_model=GradientBoostingClassifier() 
gradient_predicted,gradient_predicted_probs, clf_=  build_model(data_train, target_train, data_test, target_test,  gbrt_model)
# clf_.predict_proba(test)  

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
             precision    recall  f1-score   support

          0       0.79      0.89      0.84     22509
          1       0.55      0.36      0.43      8218

avg / total       0.73      0.75      0.73     30727

[[20104  2405]
 [ 5277  2941]]
ROC-AUC  : 0.626
Accuracy : 0.750


In [28]:
# Logistic Regresssion Classifier
logit_model=LogisticRegression() 
logit_predicted,logit_predicted_probs, clf_=  build_model(data_train, target_train, data_test, target_test,logit_model)
# clf_.predict_proba(test)  

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)
             precision    recall  f1-score   support

          0       0.77      0.91      0.83     22509
          1       0.51      0.26      0.34      8218

avg / total       0.70      0.74      0.70     30727

[[20457  2052]
 [ 6090  2128]]
ROC-AUC  : 0.584
Accuracy : 0.735


In [None]:
def threshold_rebalancer(dt):
    if dt > 0.495:
        return 1
    else:
        return 0

    
## testing different class thresholds
prob_check = pd.DataFrame(ada_predicted_probs) 
prob_check['target_test'] =  target_test
prob_check['ada_predicted'] =  ada_predicted

prob_check.columns = ['neg','pos','target_test','ada_predicted']
x = prob_check[prob_check.target_test == 1]
# x[x.ada_predicted == 0]


prob_check['rebalance'] = prob_check.pos.apply(threshold_rebalancer)


prob_check.head()

print metrics.confusion_matrix(prob_check.target_test,prob_check.rebalance)
print 'ROC-AUC  : %0.3f' %(metrics.roc_auc_score (prob_check.target_test,prob_check.rebalance))
print 'Accuracy : %0.3f' %(metrics.accuracy_score (prob_check.target_test,prob_check.rebalance))
   

In [58]:
from collections import Counter
Counter(target)

Counter({0: 89960, 1: 32947})

Here, we are using the cross validated models to make predictions on the test data

In [36]:

# target = np.array(target_train)
# train = np.array(data_train)


# target = np.array(shuffle_target)
# train = np.array(shuffle_data)

target = target
train =  np.array(train_data)
test =  np.array(test_data)


test_data_results = {}
# Iterate throught the models
clfs = ['rf_model', 'ada_model', 'logit_model', 'knn_model', 'gbrt_model','tree_model']
for clf in clfs:
    print 'Building', clf
    # K-Fold cross validation. 10 folds.
    cv = StratifiedKFold(target, n_folds=5, indices=None, shuffle=True, random_state=23)
    print cv

    #iterate through the training and test cross validation segments, run the classifier on each one and aggregating the resultss
    results = []

    for traincv_indx, testcv_indx in cv:
        model_probas = eval(clf).fit(train[traincv_indx], target[traincv_indx])
        results.append(model_probas.predict_proba(test))
    
    # store the probabilities for each cross validated model
    probs = model_ensemble(*results, mode = 'probs')
    test_data_results[clf] = probs

Building rf_model
sklearn.cross_validation.StratifiedKFold(labels=[0 1 0 ..., 0 0 0], n_folds=5, shuffle=True, random_state=23)
Building ada_model
sklearn.cross_validation.StratifiedKFold(labels=[0 1 0 ..., 0 0 0], n_folds=5, shuffle=True, random_state=23)
Building logit_model
sklearn.cross_validation.StratifiedKFold(labels=[0 1 0 ..., 0 0 0], n_folds=5, shuffle=True, random_state=23)
Building knn_model
sklearn.cross_validation.StratifiedKFold(labels=[0 1 0 ..., 0 0 0], n_folds=5, shuffle=True, random_state=23)
Building gbrt_model
sklearn.cross_validation.StratifiedKFold(labels=[0 1 0 ..., 0 0 0], n_folds=5, shuffle=True, random_state=23)
Building tree_model
sklearn.cross_validation.StratifiedKFold(labels=[0 1 0 ..., 0 0 0], n_folds=5, shuffle=True, random_state=23)


In [37]:
# generate probabilities ofr belonging to either class by combining the results of the diffrent models.
predictions = pd.DataFrame(model_ensemble(*test_data_results.values(), mode = 'probs'))

In [39]:
predictions.columns= ['Prob_No_Purchase','Prob_Purchase']
predictions.head(10)

Unnamed: 0,Prob_No_Purchase,Prob_Purchase
0,0.855428,0.144572
1,0.790917,0.209083
2,0.709249,0.290751
3,0.519834,0.480166
4,0.762681,0.237319
5,0.800037,0.199963
6,0.804325,0.195675
7,0.787923,0.212077
8,0.904841,0.095159
9,0.609814,0.390186


In [41]:
predictions_bool = pd.DataFrame()

In [46]:
def get_predictions(dt):
    '''
    Get Predictions using a threshold of 0.5
    '''
    if dt > 0.5:
        return 1
    else:
        return 0
    
predictions_bool['predictions'] = predictions['Prob_Purchase'].apply(get_predictions)
predictions_bool.to_csv('predictions_bool.csv', index=False, header=False)

### Further Improvements

There are probably a few more things we could to improve our model. Some of them include 
- Rebalancing the classes in the training data by upsampling the minority class via bootstraping 
- Since one class is severly under-represented, we can assign higher importance weights to that class.
- We could probably always add on more enhanced features to our feature vectors to enable us extract more signals for each of the classes
- We could build feature from the frequent item set miniing of the activities
- We couldalso explore sequence mining to see if certain sequences give us more distinct signatures for each class
- We could also use support / confidence scores from doing associative mining on the activities
- Hyper-parameter tuning

Doing the things above might enable our training algorithm to essentially see more instances / examples of the minority class and hence it will be able to extract those signals better