In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the priority dataset; the file uses ';' as its field separator.
priority_csv_path = '/kaggle/input/priority-and-all-values/priority_data_with_all_values.csv'
priority_data = pd.read_csv(priority_csv_path, sep=';')

In [None]:
from sklearn.model_selection import train_test_split

# Split the frame into the prediction target and the predictor columns
y = priority_data['Priority']
X = priority_data.drop(columns=['Priority'])

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Sort the predictor columns into two groups, judged on the training rows:
#   - categorical: object-dtype columns with fewer than 12 distinct values
#                  (the threshold is convenient but arbitrary)
#   - numerical:   int64 / float64 columns
# Columns that fit neither group are left out of the model inputs.
categorical_cols = []
numerical_cols = []
for cname, train_column in X_train_full.items():
    if train_column.dtype == "object":
        if train_column.nunique() < 12:
            categorical_cols.append(cname)
    elif train_column.dtype in ['int64', 'float64']:
        numerical_cols.append(cname)

# Restrict both subsets to the selected columns (categorical first, then numerical)
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()


In [None]:
from sklearn.preprocessing import LabelEncoder

# Imported here so this encoding step is self-contained.
from sklearn.preprocessing import OrdinalEncoder

# Work on copies so X_train / X_valid keep their original category values
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Encode every categorical column as numeric codes learned from the training data only.
# LabelEncoder.transform raises ValueError for a validation category that never appeared
# in training; OrdinalEncoder with handle_unknown="use_encoded_value" maps such
# categories to -1 instead, so an unlucky split no longer aborts the notebook.
# The variable keeps the name `label_encoder` so any later reference still resolves.
label_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
if categorical_cols:  # the encoder cannot be fitted on zero columns
    encoded_train = label_encoder.fit_transform(X_train[categorical_cols])
    encoded_valid = label_encoder.transform(X_valid[categorical_cols])
    # Write the codes back one column at a time so each column takes the numeric dtype
    for position, col in enumerate(categorical_cols):
        label_X_train[col] = encoded_train[:, position]
        label_X_valid[col] = encoded_valid[:, position]

In [None]:

from sklearn.impute import SimpleImputer

# Fill missing values with a SimpleImputer using its default settings, fitted on the
# training data only and then applied unchanged to the validation data.
my_imputer = SimpleImputer()

# fit_transform / transform hand back bare arrays. Rebuild the DataFrames with both the
# original column names AND the original row index: previously only the column names were
# restored, so the frames fell back to a fresh 0..n-1 index that no longer matched the
# row labels of y_train / y_valid / X_valid.
label_imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(label_X_train),
    columns=label_X_train.columns,
    index=label_X_train.index,
)
label_imputed_X_valid = pd.DataFrame(
    my_imputer.transform(label_X_valid),
    columns=label_X_valid.columns,
    index=label_X_valid.index,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# Regressor used for the rest of the notebook: a 400-tree random forest with a fixed seed.
# Commented-out alternatives (not executed):
#   XGBRegressor(n_estimators=20)
#   LGBMRegressor(random_state=5)
#   DecisionTreeRegressor(random_state=1)
model = RandomForestRegressor(
    random_state=0,
    n_estimators=400,
)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# Train the regressor on the encoded + imputed training features
model.fit(label_imputed_X_train, y_train)

# Predict on the validation features (already encoded and imputed in earlier cells)
preds = model.predict(label_imputed_X_valid)

# Per-row absolute error, followed by the aggregate mean absolute error
errors = (preds - y_valid).abs()
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)


In [None]:
import matplotlib.pyplot as plt
import shap
# Compute SHAP values for every training row with the tree explainer,
# then draw the bar-type summary plot
tree_explainer = shap.TreeExplainer(model)
shap_values = tree_explainer.shap_values(label_imputed_X_train)
shap.summary_plot(shap_values, features=label_imputed_X_train, plot_type="bar")


In [None]:
# Default (non-bar) SHAP summary plot over the same training rows
shap.summary_plot(shap_values, features=label_imputed_X_train)

In [None]:
def ABS_SHAP(df_shap,df):
    """Plot the mean absolute SHAP value per feature as a horizontal bar chart.

    Bars are sorted by mean absolute SHAP value and colored by direction: red when
    a feature's values correlate positively with its SHAP values, blue otherwise
    (negative, zero, or undefined correlation).

    Parameters
    ----------
    df_shap : array-like
        SHAP values with one row per row of ``df`` and one column per column of
        ``df``, in the same order.
    df : pandas.DataFrame
        Feature values the SHAP values were computed for.

    Returns
    -------
    None
        The chart is drawn as a side effect.
    """
    feature_list = df.columns
    shap_v = pd.DataFrame(df_shap)
    shap_v.columns = feature_list
    # drop=True discards the old index whatever it is called. The previous
    # reset_index().drop('index', axis=1) raised KeyError for a named index.
    df_v = df.reset_index(drop=True)

    # Correlation between each feature's values and its SHAP values decides the bar color.
    # A constant column has undefined correlation (NaN); silence the resulting
    # floating-point warnings and let fillna(0) below turn it into "not positive".
    with np.errstate(divide='ignore', invalid='ignore'):
        corr_list = [np.corrcoef(shap_v[name], df_v[name])[1][0] for name in feature_list]
    corr_df = pd.concat([pd.Series(feature_list), pd.Series(corr_list)], axis=1).fillna(0)
    corr_df.columns = ['Variable', 'Corr']
    corr_df['Sign'] = np.where(corr_df['Corr'] > 0, 'red', 'blue')

    # Mean absolute SHAP value per feature, joined with its color and sorted ascending
    mean_abs = shap_v.abs().mean().reset_index()
    mean_abs.columns = ['Variable', 'SHAP_abs']
    k2 = mean_abs.merge(corr_df, on='Variable', how='inner')
    k2 = k2.sort_values(by='SHAP_abs', ascending=True)

    # Pass the colors as a plain list so they are applied by bar position,
    # not looked up through the shuffled index left by the sort.
    colorlist = k2['Sign'].tolist()
    ax = k2.plot.barh(x='Variable', y='SHAP_abs', color=colorlist, figsize=(5,6), legend=False)
    ax.set_xlabel("SHAP Value (Red = Positive Impact)")
    
# Draw the direction-colored importance chart for the training-set SHAP values
ABS_SHAP(df_shap=shap_values, df=label_imputed_X_train)

In [None]:
# Validation features plus a 'predict' column holding the model's prediction for each
# row, rounded to 2 decimals. model.predict runs before the column is added, so the
# model only sees the features it was trained on.
X_output = label_imputed_X_valid.copy()
X_output.loc[:, 'predict'] = np.round(model.predict(X_output), 2)

# Evenly spaced row positions: every 20th row starting at position 1. Despite the name,
# the picks are deterministic. The upper bound is capped at the number of available rows,
# because the fixed bound of 330 made .iloc raise IndexError on smaller validation sets.
random_picks = np.arange(1, min(330, len(X_output)), 20)

# Sample of encoded rows (with predictions), displayed as the cell output
S = X_output.iloc[random_picks]
S

In [None]:
# Same row positions as the sample S, shown with the original (un-encoded) feature values
X_valid.iloc[random_picks, :]

In [None]:
def shap_plot(j):
    """Build a SHAP force plot explaining the prediction for one row of the sample ``S``.

    Parameters
    ----------
    j : int
        Position of the row within ``S`` (as accepted by ``S.iloc``).

    Returns
    -------
    The object returned by ``shap.force_plot`` for that row.
    """
    # S carries the appended 'predict' column, which is not a model input. Drop it so
    # the explainer only receives the columns the model was fitted on
    # (errors='ignore' keeps this working if S has no such column).
    feature_rows = S.drop(columns=['predict'], errors='ignore')
    row = feature_rows.iloc[[j]]

    explainerModel = shap.TreeExplainer(model)
    # Only the requested row is explained; there is no need to compute values for all of S
    shap_values_Model = explainerModel.shap_values(row)
    p = shap.force_plot(explainerModel.expected_value, shap_values_Model[0], row)
    return p

In [None]:
# Initialise SHAP's notebook JavaScript support (needed for the interactive force plot),
# then show the explanation for the row at position 10 of the sample S
shap.initjs()
shap_plot(j=10)