## Permutation Importance

In [2]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE, RFECV, VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import export_graphviz

import eli5
from eli5.sklearn import PermutationImportance

In [3]:
# Load the house-price train/test CSVs.
# The data directory is defined once and can be overridden with the
# HOUSE_PRICE_DATA_DIR environment variable so the notebook is not tied to a
# single machine; the default is the original local folder.
DATA_DIR = Path(os.environ.get('HOUSE_PRICE_DATA_DIR', r'C:\WORK\Kaggle\house_price'))

train = pd.read_csv(DATA_DIR / 'train.csv', header=0)
test = pd.read_csv(DATA_DIR / 'test.csv', header=0)

# Quick sanity check on the number of rows/columns read from each file.
print(train.shape)
print(test.shape)

(1460, 81)
(1459, 80)


In [4]:
# preprocess data
# Reduce `train` to numeric columns with no missing values.
# Each step rebinds `train` instead of mutating it in place, and the 'Id'
# drop tolerates the column already being absent, so this cell can be
# re-executed on an already-processed frame without raising KeyError.

# Missing-value count per column, most-missing first, restricted to the
# columns that actually contain gaps.
nulls = train.isnull().sum().sort_values(ascending=False)
nulls = nulls[nulls > 0]

# Drop the (up to) five columns with the most missing values.
to_drop = nulls.head(5).index.tolist()
train = train.drop(columns=to_drop)

# Drop every remaining object-dtype column so only numeric data is left.
obj_cols = train.select_dtypes('object').columns
train = train.drop(columns=obj_cols)

# Remove 'Id'; errors='ignore' keeps a re-run from failing once it is gone.
train = train.drop(columns=['Id'], errors='ignore')

# Fill the remaining gaps with each column's mean.
# NOTE(review): the means are computed on the full frame, i.e. before the
# train/validation split performed later in the notebook, so validation rows
# contribute to the imputed values. Consider imputing after the split.
train = train.fillna(train.mean())

In [5]:
# Preview the first three rows of the preprocessed frame.
train.head(n=3)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,61,0,0,0,0,0,2,2008,208500
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,298,0,0,0,0,0,0,5,2007,181500
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,42,0,0,0,0,0,9,2008,223500


In [8]:
# prepare model
# Target is 'SalePrice'; every other remaining column is a predictor.
y = train['SalePrice']
X = train.drop(columns=['SalePrice'])

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit a seeded random forest on the training split; `fit` returns the
# estimator itself, so `model` is the fitted forest.
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=0,
).fit(X_train, y_train)
model

RandomForestRegressor(max_depth=10, n_estimators=200, random_state=0)

In [12]:
# permutation importance
# Score the fitted model on the validation split while shuffling one feature
# at a time (5 shuffles per feature, fixed seed).
# NOTE(review): with 'neg_mean_squared_error' the reported weights are
# presumably in squared target units, which is why they are so large.
perm = PermutationImportance(
    model,
    scoring='neg_mean_squared_error',
    n_iter=5,
    random_state=0,
)
perm.fit(X_val, y_val)

# Render the importance table, labelled with the validation column names.
val_feature_names = X_val.columns.tolist()
eli5.show_weights(perm, feature_names=val_feature_names)

Weight,Feature
3337689161.3432  ± 502271995.5509,OverallQual
813158066.8437  ± 131040417.0489,GrLivArea
171829373.8920  ± 53708664.3422,TotalBsmtSF
150738978.5180  ± 26114206.8861,BsmtFinSF1
89731204.2758  ± 27285086.7410,LotArea
81175072.5734  ± 4914248.3929,2ndFlrSF
63673678.6731  ± 26826195.3023,GarageCars
56219926.7863  ± 15645964.3926,YearBuilt
52278200.4794  ± 28174687.0049,YearRemodAdd
50690135.2880  ± 26621532.8437,1stFlrSF
