In [61]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression as LR
from matplotlib import pyplot as plt
%matplotlib inline

The `TreeStruct` and `RefinedRandomForest` classes below are copied from dune_dweller.

In [62]:

class TreeStruct():
    """Bookkeeping wrapper around a tree's node arrays that supports merging sibling leaves.

    The wrapper keeps references (not copies) to ``tree.children_left``,
    ``tree.children_right`` and ``tree.value``, so every merge is written
    straight into those arrays.  Whether that also changes the estimator the
    arrays came from depends on them being views of its storage -- presumably
    true for scikit-learn trees; verify against the installed version.

    Attributes refreshed by ``update_leaves``:
      leaves         -- node ids whose left child equals ``TREE_LEAF``
      leaf_siblings  -- for every entry of ``leaves``, the id of its sibling node
      leaf_pos       -- per node: position inside ``leaves``, or -1 for non-leaves
    """
    TREE_LEAF = -1      # child marker identifying a leaf node
    DELETED_LEAF = -5   # child marker written into nodes removed by a merge

    def __init__(self, tree):
        """Store the node arrays of ``tree`` and build the leaf indexes."""
        self.children_left = tree.children_left
        self.children_right = tree.children_right
        self.value = tree.value
        self.update_leaves()

    def update_leaves(self):
        """Recompute the leaf indexes from the current child arrays.

        Returns True when a single leaf is left (the tree has been pruned down
        to its root and should be dropped from the list of estimators).  In that
        case ``leaf_siblings`` and ``leaf_pos`` are NOT refreshed.  Returns
        False otherwise.
        """
        leaf_mask = self.children_left == TreeStruct.TREE_LEAF
        self.leaves = np.nonzero(leaf_mask)[0]
        leaf_count = self.leaves.shape[0]
        if leaf_count == 1:
            return True

        sibling_ids = [self.find_sibling_node(node) for node in self.leaves]
        self.leaf_siblings = np.array(sibling_ids)

        # -1 everywhere, then overwrite the slots that belong to leaves.
        node_count = self.children_left.shape[0]
        self.leaf_pos = np.zeros(node_count, dtype=np.int32)
        self.leaf_pos -= 1
        self.leaf_pos[self.leaves] = np.arange(leaf_count)
        return False

    def find_sibling_node(self, node):
        """Return the id of the node that shares a parent with ``node``.

        Raises IndexError when ``node`` is no one's child (e.g. the root).
        """
        parents_via_left = np.nonzero(self.children_left == node)[0]
        if parents_via_left.shape[0] > 0:
            # ``node`` is a left child, so the sibling hangs on the right.
            return self.children_right[parents_via_left[0]]
        parents_via_right = np.nonzero(self.children_right == node)[0]
        return self.children_left[parents_via_right[0]]

    def is_leaf(self, node):
        """True when ``node`` is currently marked as a leaf."""
        return self.children_left[node] == TreeStruct.TREE_LEAF

    def is_pruned(self, node):
        """True when ``node`` has been removed by an earlier merge."""
        return self.children_left[node] == TreeStruct.DELETED_LEAF

    def sibling_leaf_positions(self):
        """Position in ``leaves`` of each leaf's sibling; -1 where the sibling is not a leaf."""
        return self.leaf_pos[self.leaf_siblings]

    def merge_leaves(self, leaf):
        """Collapse ``leaf`` and its sibling into their parent.

        Returns True when the merge happened.  Returns False when ``leaf`` was
        already removed by a previous merge, or when its sibling is not a leaf.
        The leaf indexes are left stale; call ``update_leaves`` afterwards.
        """
        if self.is_pruned(leaf):
            return False  # already merged
        assert self.is_leaf(leaf)

        sibling = self.leaf_siblings[self.leaf_pos[leaf]]
        if not self.is_leaf(sibling):
            return False  # can't merge leaf with branch

        removed = [leaf, sibling]
        self.children_left[removed] = TreeStruct.DELETED_LEAF
        self.children_right[removed] = TreeStruct.DELETED_LEAF

        # The parent still references ``leaf`` by id, so it can be located
        # even after the two children were marked as deleted.
        points_to_leaf = (self.children_left == leaf) | (self.children_right == leaf)
        parent = np.nonzero(points_to_leaf)[0][0]
        self.children_left[parent] = TreeStruct.TREE_LEAF
        self.children_right[parent] = TreeStruct.TREE_LEAF
        return True


class RefinedRandomForest():
    """Refit the leaves of a fitted forest with one logistic regression, optionally pruning.

    Every sample is encoded as a sparse indicator vector with one column per
    leaf of every tree (exactly one active column per tree).  A multinomial
    logistic regression without intercept is fitted on that encoding.  Between
    fits, sibling leaf pairs with the smallest coefficient score are merged.

    Parameters
    ----------
    rf : fitted forest exposing ``estimators_`` (each with a ``tree_``) and ``apply``.
        It is modified in place: tree node arrays are rewritten by pruning,
        leaf values are overwritten by ``fit`` and trees pruned down to their
        root are deleted from ``rf.estimators_``.
    C : inverse regularisation strength passed to the logistic regression.
    prune_pct : fraction of the current leaf count to merge in each pruning round.
    n_prunings : number of pruning rounds; the regression is fitted ``n_prunings + 1`` times.
    criterion : ``'sumnorm'`` (sum of squared coefficients of a leaf and its
        sibling) or ``'normdiff'`` (squared difference between them).
    """

    _CRITERIA = ('sumnorm', 'normdiff')

    def __init__(self, rf, C = 1.0, prune_pct = 0.1, n_prunings = 1, criterion = 'sumnorm'):
        self.rf_ = rf
        self.C = C
        self.prune_pct = prune_pct
        self.n_prunings = n_prunings
        self.criterion = criterion
        self.trees_ = [TreeStruct(tree.tree_) for tree in rf.estimators_]
        self.leaves()

    def _leaf_span(self, tree_ind):
        """Return the half-open ``(start, end)`` column range owned by tree ``tree_ind``."""
        start = self.offsets_[tree_ind]
        if tree_ind + 1 < len(self.trees_):
            end = self.offsets_[tree_ind + 1]
        else:
            end = self.M
        return start, end

    def leaves(self):
        """Rebuild the global leaf numbering from the current state of every tree.

        Sets ``n_leaves_`` (leaves per tree), ``M`` (total leaf count),
        ``offsets_`` (first global column of each tree) and, per global column,
        the owning tree (``ind_trees_``) and the node id (``ind_leaves_``).
        """
        self.n_leaves_ = [tree.leaves.shape[0] for tree in self.trees_]
        self.M = np.sum(self.n_leaves_)
        self.offsets_ = np.zeros_like(self.n_leaves_)
        self.offsets_[1:] = np.cumsum(self.n_leaves_)[:-1]
        self.ind_trees_ = np.zeros(self.M,dtype=np.int32)
        self.ind_leaves_ = np.zeros(self.M,dtype=np.int32)
        for tree_ind, tree in enumerate(self.trees_):
            start, end = self._leaf_span(tree_ind)
            self.ind_trees_[start:end] = tree_ind
            self.ind_leaves_[start:end] = tree.leaves

    def get_indicators(self, X):
        """Encode ``X`` as a CSR matrix of shape ``(n_samples, M)`` with one 1 per tree."""
        leaf = self.rf_.apply(X)  # node id reached by every sample in every tree
        sample_ind = np.arange(X.shape[0])
        row_ind = []
        col_ind = []
        for tree_ind, tree in enumerate(self.trees_):
            X_leaves = leaf[:,tree_ind]
            row_ind.append(sample_ind)
            # local leaf position shifted into the tree's global column range
            col_ind.append(self.offsets_[tree_ind]+tree.leaf_pos[X_leaves])
        row_ind = np.concatenate(row_ind)
        col_ind = np.concatenate(col_ind)
        data = np.ones_like(row_ind)
        indicators = csr_matrix((data, (row_ind, col_ind)), shape=(X.shape[0],self.M))
        return indicators

    def prune_trees(self):
        """Merge the lowest-scoring sibling leaf pairs, then renumber the leaves.

        Up to ``floor(M * prune_pct)`` merges are performed, visiting leaves in
        ascending order of the criterion computed from ``self.lr.coef_``.  If
        the candidates run out first, pruning stops early instead of failing.
        Trees reduced to a single leaf are removed from ``trees_`` and from
        ``rf_.estimators_``.

        Raises
        ------
        ValueError
            If ``self.criterion`` is not one of the supported names.
        """
        if self.criterion not in self._CRITERIA:
            raise ValueError("Unknown criterion {!r}; expected one of {}".format(
                self.criterion, self._CRITERIA))

        # Global column of every leaf's sibling; stays negative where the sibling is a branch.
        ind_siblings = np.zeros_like(self.ind_leaves_)
        for tree_ind, tree in enumerate(self.trees_):
            start, end = self._leaf_span(tree_ind)
            sibl_ind = tree.sibling_leaf_positions()
            sibl_ind[sibl_ind>=0] += start
            ind_siblings[start:end] = sibl_ind

        coef = self.lr.coef_
        sibl_coef = coef[:,ind_siblings]  # fancy indexing -> a copy, safe to overwrite
        sibl_coef[:,ind_siblings < 0] = np.inf # so that we don't merge leaf with branch
        if self.criterion == 'sumnorm':
            sum_coef = np.sum(coef**2 + sibl_coef**2,axis=0)
        else:
            # 'normdiff': little difference between adjacent leaves. Also gives good results.
            sum_coef = np.sum((coef - sibl_coef)**2,axis=0)

        ind = np.argsort(sum_coef)
        n_prunings = int(np.floor(coef.shape[1] * self.prune_pct))
        pruned = 0
        i = 0
        # Bounded by the candidate count: a merge consumes two leaves and pairs
        # with a branch sibling are refused, so the quota may be unreachable.
        while pruned < n_prunings and i < ind.shape[0]:
            tree_ind = self.ind_trees_[ind[i]]
            leaf_ind = self.ind_leaves_[ind[i]]
            if self.trees_[tree_ind].merge_leaves(leaf_ind):
                pruned += 1
            i += 1

        # update_leaves() must run on every tree; it reports trees collapsed to the root.
        collapsed = [tree_ind for tree_ind, tree in enumerate(self.trees_)
                     if tree.update_leaves()]
        for tree_ind in reversed(collapsed):  # back to front keeps remaining indexes valid
            del self.rf_.estimators_[tree_ind]
            del self.trees_[tree_ind]
        self.leaves()

    def fit(self, X, y):
        """Alternate logistic-regression fits and pruning rounds, ending on a fit.

        The final coefficients are stored in ``self.lr`` and also written into
        each tree's ``value`` array at the leaf nodes.
        """
        n_pruned = 0
        while n_pruned <= self.n_prunings:
            indicators = self.get_indicators(X)
            self.lr = LR(C=self.C,
                            fit_intercept=False,
                            solver='lbfgs',
                            max_iter=100,
                            multi_class='multinomial', n_jobs=-1)
            self.lr.fit(indicators,y)
            if n_pruned < self.n_prunings:
                self.prune_trees()
            n_pruned += 1
        for tree_ind, tree in enumerate(self.trees_):
            offset = self.offsets_[tree_ind]
            tree.value[tree.leaves,0,:] = self.lr.coef_[:,offset:offset + tree.leaves.shape[0]].T

    def predict_proba(self, X):
        """Class probabilities from the fitted logistic regression on the leaf encoding of ``X``."""
        return self.lr.predict_proba(self.get_indicators(X))

In [63]:
def removestringprefix(string):
    """Return the integer in the second space-separated field, e.g. ``'location 118'`` -> 118."""
    fields = string.split(" ")
    return int(fields[1])
def load_data(data_dir='data'):
    """Read the competition CSV files and return them as a dict of DataFrames.

    Parameters
    ----------
    data_dir : str, default ``'data'``
        Directory holding ``train.csv``, ``test.csv``, ``event_type.csv``,
        ``log_feature.csv``, ``resource_type.csv`` and ``severity_type.csv``.

    Returns
    -------
    dict with keys
        ``train``, ``test`` -- indexed by ``id``, missing values replaced by 0
        ``loc``             -- the ``location`` column of train and test stacked
        ``events``, ``log``, ``res`` -- long tables as read from disk
        ``sev``             -- severity table indexed by ``id``

    In every file the named categorical column (e.g. ``'location 118'``) is
    converted to its integer suffix with ``removestringprefix``.
    """
    import os  # local import so the function does not rely on another cell

    def _read(filename, column, **kwargs):
        # One place for the path handling and the "<name> <int>" conversion.
        return pd.read_csv(os.path.join(data_dir, filename),
                           converters={column: removestringprefix},
                           **kwargs)

    train = _read('train.csv', 'location', index_col='id').fillna(0)
    test = _read('test.csv', 'location', index_col='id').fillna(0)
    event_type = _read('event_type.csv', 'event_type')
    log_feature = _read('log_feature.csv', 'log_feature')
    resource_type = _read('resource_type.csv', 'resource_type')
    severity_type = _read('severity_type.csv', 'severity_type', index_col='id')
    loc = pd.concat((train[['location']],test[['location']]),axis=0)
    return dict(train=train,test=test,loc=loc,events=event_type,log=log_feature,res=resource_type,sev=severity_type)
def build_features(data):
    """Build one feature row per id from the tables returned by ``load_data``.

    Parameters
    ----------
    data : dict
        Must provide the keys ``sev``, ``loc``, ``events``, ``log``, ``res``
        and ``train``.  None of the frames is modified.

    Returns
    -------
    pandas.DataFrame indexed like ``data['sev']``, with missing values
    replaced by 0.  It also contains the ``fault_severity`` target column;
    ids absent from ``data['train']`` get 0 there.

    The resulting shape is printed before returning.
    """
    sev = data['sev']
    loc = data['loc']
    events = data['events']
    res = data['res']
    # Copy: a derived 'logvolume' column is added below and must not leak
    # into the caller's frame.
    log = data['log'].copy()

    # The severity table has the same length as train + test together, so it
    # maps 1:1 onto ids and serves as the backbone of the feature frame.
    df = pd.DataFrame(index=sev.index)
    df['severity_type'] = sev.severity_type
    df['fault_severity'] = data['train'].fault_severity
    df['location'] = loc.location

    # Running number of each row within its location (in file order), and the
    # same number scaled by (rows in location + 1).
    df['num'] = df.groupby('location')['severity_type'].transform(lambda x: np.arange(x.shape[0]) + 1)
    df['normalizedsevcount'] = df.groupby('location')['num'].transform(lambda x: x / (x.max() + 1))

    # Number of ids observed at each location.  Series.map avoids relying on
    # the column name value_counts() produces, which differs across pandas versions.
    df['loc_count'] = df['location'].map(loc['location'].value_counts())
    df = df.fillna(0)

    # Number of event rows per id.
    event_count = events['id'].value_counts().to_frame('event_count')
    df = pd.merge(df, event_count, how='left', left_index=True, right_index=True)

    # Events, log features and resources are pivoted so that each distinct
    # type becomes its own column.  Here: frequency of each event type per id.
    eventsfreqperid = (events.groupby(['id', 'event_type']).size()
                       .unstack().fillna(0).add_prefix('event_'))
    df = pd.merge(df, eventsfreqperid, how='left', right_index=True, left_index=True).fillna(0)

    # As per dune_dweller: use log(volume + 1) instead of the raw volume to
    # compress its range.
    log['logvolume'] = np.log(log.volume + 1)
    df['volsumlog'] = np.log1p(log.groupby('id')['volume'].agg('sum'))
    logvol = (log.groupby('id')['logvolume']
              .agg(['count', 'min', 'mean', 'max', 'std', 'sum'])
              .fillna(0).add_prefix('logvolume_'))
    df = pd.merge(df, logvol, how='left', right_index=True, left_index=True).fillna(0)

    # Mean log-volume of each log feature per id.
    logvolfreqperid = (log.groupby(['id', 'log_feature'])['logvolume'].mean()
                       .unstack().fillna(0).add_prefix('logfeatvol_'))
    df = pd.merge(df, logvolfreqperid, how='left', left_index=True, right_index=True)

    # Number of resource rows per id, then frequency of each resource type.
    nresources = res['id'].value_counts().to_frame('nresources')
    df = pd.merge(df, nresources, how='left', left_index=True, right_index=True).fillna(0)
    resfreq = (res.groupby(['id', 'resource_type']).size()
               .unstack().fillna(0).add_prefix('resourcetype_'))
    df = pd.merge(df, resfreq, how='left', left_index=True, right_index=True).fillna(0)

    print("shape",df.shape)
    return df
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss
    Parameters
    ----------
    y_true : array, shape = [n_samples]
        true class, intergers in [0, n_classes - 1)
    y_pred : array, shape = [n_samples, n_classes]
    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    n_samples = actual.shape[0]
    actual[np.arange(n_samples), y_true.astype(int)] = 1
    vectsum = np.sum(actual * np.log(predictions))
    loss = -1.0 / n_samples * vectsum
    return loss

In [64]:
# Load the raw tables, build the feature frame and compare three models on
# the first fold of a stratified 10-fold split of the training ids.
data=load_data()
df = build_features(data)
y=data['train'].fault_severity
train = data['train']
test = data['test']
kf = StratifiedKFold(y.values, n_folds=10, shuffle=True, random_state = 1234)
for itrain, itest in kf:
    clf = RandomForestClassifier(n_estimators=300, min_samples_leaf=15, max_features=45, random_state=1)
    clf1 = RandomForestClassifier(n_estimators=300, max_depth=7, max_features=5, random_state=1)

    # The fold iterator yields positions into the training set; df is indexed
    # by id, so translate positions into id labels first.
    itrain = train.index[itrain]
    itest = train.index[itest]
    Xtr = df.loc[itrain].drop(['fault_severity'], axis=1)
    Xte = df.loc[itest].drop(['fault_severity'], axis=1)
    ytr = df.loc[itrain, 'fault_severity']
    yte = df.loc[itest,'fault_severity']

    clf.fit(Xtr, ytr)
    loss2tr = multiclass_log_loss(ytr.values, clf.predict_proba(Xtr))
    loss2te = multiclass_log_loss(yte.values, clf.predict_proba(Xte))
    # Report the value the model was actually built with instead of a hard-coded number.
    print("Random Forest when max features is {}: train loss {:.4f}, test loss {:.4f}".format(clf.max_features, loss2tr, loss2te))

    clf1.fit(Xtr, ytr)
    loss5tr = multiclass_log_loss(ytr.values, clf1.predict_proba(Xtr))
    loss5te = multiclass_log_loss(yte.values, clf1.predict_proba(Xte))
    print("Random Forest when max features is {}: train loss {:.4f}, test loss {:.4f}".format(clf1.max_features, loss5tr, loss5te))

    # NOTE: refining rewrites the leaf values of clf's trees in place, so clf
    # must have been scored (above) before this point.
    rrf = RefinedRandomForest(clf, C = 0.01, n_prunings = 0)
    rrf.fit(Xtr, ytr)
    loss3tr = multiclass_log_loss(ytr.values, rrf.predict_proba(Xtr))
    loss3te = multiclass_log_loss(yte.values, rrf.predict_proba(Xte))
    print("Refined Random Forest: train loss {:.4f}, test loss {:.4f}".format(loss3tr, loss3te))

    # List the features whose importance exceeds the mean importance.
    print ('Most important features:')
    importances = clf.feature_importances_
    mean_importance = np.average(importances)
    for column, importance in zip(Xtr.columns, importances):
        if importance > mean_importance:
            print(column)
    break  # only the first fold is evaluated
print("The refined forest and the random forest is overfitting")
    

shape (18552, 464)
Random Forest when max features is 40: train loss 0.4877, test loss 0.5203
Random Forest when max features is 5: train loss 0.7008, test loss 0.7123
Refined Random Forest: train loss 0.2164, test loss 0.4273
Most important features:
severity_type
location
num
normalizedsevcount
loc_count
event_count
event_11
event_13
event_15
event_20
event_34
event_35
volsumlog
logvolume_count
logvolume_min
logvolume_mean
logvolume_max
logvolume_std
logvolume_sum
logfeatvol_54
logfeatvol_68
logfeatvol_70
logfeatvol_71
logfeatvol_73
logfeatvol_82
logfeatvol_134
logfeatvol_170
logfeatvol_193
logfeatvol_195
logfeatvol_203
logfeatvol_232
logfeatvol_233
logfeatvol_273
logfeatvol_291
logfeatvol_312
logfeatvol_313
logfeatvol_315
logfeatvol_368
nresources
resourcetype_2
resourcetype_6
resourcetype_8
The refined forest and the random forest is overfitting
