In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification

In [40]:
X,y=make_classification(n_samples=600, n_features=500, n_informative=10, n_redundant=490, n_classes=2, random_state=0)
df=pd.DataFrame(X)
rename_cols={col:'col_'+str(col) for col in df.columns}
df.rename(columns=rename_cols,inplace=True)
df['y']=y

In [41]:
df.shape

(600, 501)

### Encode target

In [5]:
from sklearn.cluster import KMeans
from sklearn.model_selection  import train_test_split,GridSearchCV,StratifiedKFold,KFold
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

In [95]:
km=KMeans(n_clusters=50,random_state=56)
m=km.fit_predict(df.iloc[:,:-1])
df['cluster']=m

In [68]:
m=np.random.randint(0,20,df.shape[0])
df['cluster']=m

### The wrong way: target encode before spliting

In [98]:
df_train,df_test=train_test_split(df,test_size=0.2,random_state=20)
map_col=df_train.groupby('cluster')['y'].mean().to_dict()
df_train.loc[:,'cluster']=df_train['cluster'].map(map_col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [99]:
map_col

{0: 1.0,
 1: 0.5,
 2: 0.0,
 3: 0.7368421052631579,
 4: 0.4,
 5: 0.0,
 6: 0.7333333333333333,
 7: 1.0,
 8: 0.9375,
 9: 0.6666666666666666,
 10: 0.5238095238095238,
 11: 1.0,
 12: 0.14285714285714285,
 13: 0.0,
 14: 0.5,
 15: 0.7333333333333333,
 16: 0.3333333333333333,
 17: 0.1111111111111111,
 18: 1.0,
 19: 0.0,
 20: 1.0,
 21: 0.0,
 22: 1.0,
 23: 0.9166666666666666,
 24: 0.07692307692307693,
 25: 0.0,
 26: 0.42857142857142855,
 27: 0.16666666666666666,
 28: 0.75,
 29: 1.0,
 30: 0.9,
 31: 0.4,
 32: 1.0,
 33: 0.14285714285714285,
 34: 0.8125,
 35: 0.2,
 36: 0.0,
 37: 0.2,
 38: 0.0,
 39: 0.125,
 40: 0.0,
 41: 1.0,
 42: 0.2,
 43: 0.25,
 44: 0.0,
 45: 0.2222222222222222,
 46: 0.0,
 48: 1.0,
 49: 0.9166666666666666}

In [100]:
X=df_train.drop('y',axis=1)
y=df_train['y']

In [101]:
rf=RandomForestClassifier(n_estimators=150,max_depth=20,random_state=70,n_jobs=-1,min_samples_leaf=10)
kf=KFold(n_splits=5,random_state=0,shuffle=True)

scores=[]
for train_id,val_id in kf.split(X):
    x_train,xval=X.iloc[train_id],X.iloc[val_id]
    y_train,yval=y.iloc[train_id],y.iloc[val_id]

    rf.fit(x_train,y_train)
    scores.append(rf.score(xval,yval))
scores=np.array(scores)
scores.mean()

0.8958333333333334

In [102]:
scores

array([0.88541667, 0.91666667, 0.89583333, 0.86458333, 0.91666667])

In [105]:
df_test['cluster']=df_test['cluster'].map(map_col)
Xtest=df_test.drop('y',axis=1)
Ytest=df_test['y']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['cluster']=df_test['cluster'].map(map_col)


In [None]:
rf.score(Xtest,Ytest)

In [68]:
rf.feature_importances_

array([0.0906415 , 0.05884074, 0.02685866, 0.02052799, 0.02598829,
       0.01227632, 0.04605283, 0.01435457, 0.00903182, 0.02182733,
       0.00645207, 0.04398979, 0.03613963, 0.02715003, 0.10551366,
       0.0195239 , 0.04028528, 0.01522972, 0.10386326, 0.00778623,
       0.26766637])

### The right way: first split then target encode

In [78]:
kf=KFold(n_splits=5,random_state=0,shuffle=True)
rf=RandomForestClassifier(n_estimators=100,max_depth=20,random_state=0,n_jobs=-1,min_samples_leaf=10)

for train_id,val_id in kf.split(df):
    xtrain,xval=df.iloc[train_id].copy(),df.iloc[val_id].copy()

    map_col=xtrain.groupby('cluster')['y'].mean().to_dict()
    xtrain['cluster']=xtrain['cluster'].map(map_col)
    map_col=xval.groupby('cluster')['y'].mean().to_dict()
    xval['cluster']=xval['cluster'].map(map_col)
    ytrain,yval=xtrain.loc[:,'y'],xval.loc[:,'y']
    xtrain,xval=xtrain.drop('y',axis=1),xval.drop('y',axis=1)

    rf.fit(xtrain,ytrain)
    print(rf.score(xval,yval))

0.925
0.8916666666666667
0.875
0.8833333333333333
0.8666666666666667


In [9]:
rf.feature_importances_

array([0.058017  , 0.05197257, 0.02116001, 0.01423254, 0.02679718,
       0.01227533, 0.03368493, 0.00607589, 0.00238761, 0.00801643,
       0.00181852, 0.04417946, 0.00906545, 0.00710951, 0.0656683 ,
       0.00409355, 0.02129129, 0.00814754, 0.09315664, 0.00304109,
       0.50780915])

In [76]:
kf=KFold(n_splits=10,random_state=0,shuffle=True)
rf=RandomForestClassifier(n_estimators=100,max_depth=6,random_state=0,n_jobs=-1)

for train_id,val_id in kf.split(df):
    xtrain,xval=df.iloc[train_id].copy(),df.iloc[val_id].copy()

    map_col=xtrain.groupby('col_20')['y'].mean().to_dict()
    xtrain['col_20']=xtrain['col_20'].map(map_col)
    xval['col_20']=xval['col_20'].map(map_col)
    ytrain,yval=xtrain.loc[:,'y'],xval.loc[:,'y']
    xtrain,xval=xtrain.iloc[:,:-1],xval.iloc[:,:-1]

    rf.fit(xtrain,ytrain)
    print(rf.score(xval,yval))

0.9235
0.925
0.9206
0.9219
0.9276
0.9268
0.9228
0.9242
0.9207
0.9218


In [73]:
xtrain.col_20.unique()

array([0.86070165, 0.18921643, 0.67510937, 0.89111253, 0.15115025,
       0.02135765, 0.8048486 , 0.10600352, 0.03658537, 0.98106236])

In [50]:
rf=RandomForestClassifier(n_estimators=100,max_depth=20,random_state=0)

In [51]:
rf.fit(xtrain,ytrain)

RandomForestClassifier(max_depth=20, random_state=0)

In [53]:
rf.score(xtest,ytest),rf.score(xtrain,ytrain)

(0.9617, 0.9934375)

## RFECV

In [79]:
from sklearn.feature_selection import RFECV

In [82]:
X=df.drop('y',axis=1)
y=df['y']

In [83]:
rfe=RFECV(estimator=rf,cv=5,scoring='accuracy')
rfe.fit(X,y)

RFECV(cv=5,
      estimator=RandomForestClassifier(max_depth=20, min_samples_leaf=10,
                                       n_jobs=-1, random_state=0),
      scoring='accuracy')

In [89]:
X_transformed = rfe.transform(X)

In [92]:
X_transformed

array([[ -4.60986106,  -4.99112217,   2.87487539, ...,   1.28606805,
         -8.25057439,   0.69973963],
       [ -1.64376471,  -1.11287986,   1.54645669, ...,   2.84063189,
          1.89108362,  -0.8285572 ],
       [ -1.04336582,   1.4244628 ,  -2.96323135, ...,  -3.34389493,
         -2.85111595,  -0.78496269],
       ...,
       [ -0.11275432,  -2.43382332,  -2.10411208, ...,  -4.91974894,
          1.16214469,   2.81247204],
       [ -4.21653541,  -2.3341965 ,  -2.45475625, ...,  -4.61342604,
         -4.70474029,   3.50355184],
       [-10.04963747,   5.648913  ,  -0.75871803, ...,  -3.2046927 ,
         -5.46153899,   1.84151251]])

## Feature Importance

In [122]:
from eli5 import show_weights, show_prediction, explain_prediction
from eli5.formatters import format_as_text

In [80]:
rf.feature_importances_

array([0.0906415 , 0.05884074, 0.02685866, 0.02052799, 0.02598829,
       0.01227632, 0.04605283, 0.01435457, 0.00903182, 0.02182733,
       0.00645207, 0.04398979, 0.03613963, 0.02715003, 0.10551366,
       0.0195239 , 0.04028528, 0.01522972, 0.10386326, 0.00778623,
       0.26766637])

In [79]:
show_weights(rf,feature_names=xtrain.columns.to_list())

Weight,Feature
0.2677  ± 0.4988,col_20
0.1055  ± 0.3098,col_14
0.1039  ± 0.3435,col_18
0.0906  ± 0.2547,col_0
0.0588  ± 0.1867,col_1
0.0461  ± 0.0759,col_6
0.0440  ± 0.2028,col_11
0.0403  ± 0.0682,col_16
0.0361  ± 0.0413,col_12
0.0272  ± 0.0727,col_13


In [134]:
pred=explain_prediction(rf, xtest.iloc[5,:], feature_names=xtrain.columns.to_list(),top=(4,3))

In [135]:
print(format_as_text(pred))

Explained as: decision path

Features with largest coefficients.

Feature weights are calculated by following decision paths in trees
of an ensemble (or a single tree for DecisionTreeClassifier).
Each node of the tree has an output score, and contribution of a feature
on the decision path is how much the score changes from parent to child.
Weights of all features sum to the output score or proba of the estimator.

Caveats:
1. Feature weights just show if the feature contributed positively or
   negatively to the final score, and does not show how increasing or
   decreasing the feature value will change the prediction.
2. In some cases, feature weight can be close to zero for an important feature.
   For example, in a single tree that computes XOR function, the feature at the
   top of the tree will have zero weight because expected scores for both
   branches are equal, so decision at the top feature does not change the
   expected score. For an ensemble predicting XOR functions it mi

In [133]:
rf.predict_proba(xtest)[:6]

array([[0.5875753 , 0.4124247 ],
       [0.94080996, 0.05919004],
       [0.94168722, 0.05831278],
       [0.91478625, 0.08521375],
       [0.97470156, 0.02529844],
       [0.57834607, 0.42165393]])