In [145]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from lightgbm import LGBMClassifier

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import shap

from econml.solutions.causal_analysis import CausalAnalysis

In [151]:
placement_df = pd.read_csv('/Users/jaydeepchakraborty/JC/git-projects/model_util/DataSets/Clg_Placement/CollegePlacement.csv')

In [152]:
placement_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2966 entries, 0 to 2965
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                2966 non-null   int64 
 1   Gender             2966 non-null   object
 2   Stream             2966 non-null   object
 3   Internships        2966 non-null   int64 
 4   CGPA               2966 non-null   int64 
 5   Hostel             2966 non-null   int64 
 6   HistoryOfBacklogs  2966 non-null   int64 
 7   PlacedOrNot        2966 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 185.5+ KB


In [153]:
placement_df.head(3)

Unnamed: 0,Age,Gender,Stream,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
0,22,Male,Electronics And Communication,1,8,1,1,1
1,21,Female,Computer Science,0,7,1,1,1
2,22,Female,Information Technology,1,6,0,0,1


In [154]:
placement_df.describe()

Unnamed: 0,Age,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot
count,2966.0,2966.0,2966.0,2966.0,2966.0,2966.0
mean,21.48584,0.703641,7.073837,0.269049,0.192178,0.552596
std,1.324933,0.740197,0.967748,0.44354,0.394079,0.49731
min,19.0,0.0,5.0,0.0,0.0,0.0
25%,21.0,0.0,6.0,0.0,0.0,0.0
50%,21.0,1.0,7.0,0.0,0.0,1.0
75%,22.0,1.0,8.0,1.0,0.0,1.0
max,30.0,3.0,9.0,1.0,1.0,1.0


In [155]:
placement_df=pd.get_dummies(placement_df)

In [157]:
placement_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2966 entries, 0 to 2965
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype
---  ------                                --------------  -----
 0   Age                                   2966 non-null   int64
 1   Internships                           2966 non-null   int64
 2   CGPA                                  2966 non-null   int64
 3   Hostel                                2966 non-null   int64
 4   HistoryOfBacklogs                     2966 non-null   int64
 5   PlacedOrNot                           2966 non-null   int64
 6   Gender_Female                         2966 non-null   uint8
 7   Gender_Male                           2966 non-null   uint8
 8   Stream_Civil                          2966 non-null   uint8
 9   Stream_Computer Science               2966 non-null   uint8
 10  Stream_Electrical                     2966 non-null   uint8
 11  Stream_Electronics And Communication  2966 

In [156]:
placement_df.head(5)

Unnamed: 0,Age,Internships,CGPA,Hostel,HistoryOfBacklogs,PlacedOrNot,Gender_Female,Gender_Male,Stream_Civil,Stream_Computer Science,Stream_Electrical,Stream_Electronics And Communication,Stream_Information Technology,Stream_Mechanical
0,22,1,8,1,1,1,0,1,0,0,0,1,0,0
1,21,0,7,1,1,1,1,0,0,1,0,0,0,0
2,22,1,6,0,0,1,1,0,0,0,0,0,1,0
3,21,0,8,0,1,1,0,1,0,0,0,0,1,0
4,22,0,8,1,0,1,0,1,0,0,0,0,0,1


In [160]:
y_col = ['PlacedOrNot']
X_col = placement_df.columns.difference(y_col).tolist()

In [161]:
print(X_col, y_col)

['Age', 'CGPA', 'Gender_Female', 'Gender_Male', 'HistoryOfBacklogs', 'Hostel', 'Internships', 'Stream_Civil', 'Stream_Computer Science', 'Stream_Electrical', 'Stream_Electronics And Communication', 'Stream_Information Technology', 'Stream_Mechanical'] ['PlacedOrNot']


In [116]:
# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.005, random_state=0, stratify=y)

In [117]:
# As the data is successfully split into train and test, let us now segregate the numerical and categorical features from the data as shown below.
categorical = []
for col, value in X_train.iteritems():
    if value.dtype == "object":
        categorical.append(col)

# Store the numerical columns in a list numerical
numerical = X_train.columns.difference(categorical).tolist()

print("categorical features:- ", categorical)
print("numerical features:- ", numerical)

categorical features:-  ['Gender', 'Stream']
numerical features:-  ['Age', 'CGPA', 'HistoryOfBacklogs', 'Hostel', 'Internships']


In [118]:
# Let us now build a pipeline that majorly involves two stages, 
# first stage is transformations stage to apply necessary transformations to the variables, 
# and the second stage is the machine learning model.

numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median")), 
                                      ("scaler", StandardScaler())])

categorical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
                                          ("onehot", OneHotEncoder(handle_unknown="error", drop="first")),])

transformations = ColumnTransformer(transformers=[("num", numeric_transformer, numerical),
                                                  ("cat", categorical_transformer, categorical)])

clf = Pipeline(steps=[("preprocessor", transformations), 
                      ("classifier", LGBMClassifier())])

In [119]:
# model training
param_grid = {"classifier__learning_rate": [0.001, 0.05, 0.01],}
search = GridSearchCV(clf, param_grid, n_jobs=1, error_score='raise')
search.fit(X_train, y_train)

GridSearchCV(error_score='raise',
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['Age',
                                                                          'CGPA',
                                                                          'HistoryOfBacklogs',
                                                                          'Hostel',
                                                   

In [90]:
best_estimator = search.best_estimator_
print(best_estimator)
fitted_transformer = best_estimator['preprocessor']
fitted_model = best_estimator['classifier']

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['Age', 'CGPA',
                                                   'HistoryOfBacklogs',
                                                   'Hostel', 'Internships']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
       

In [103]:
background = shap.maskers.Independent(fitted_transformer.transform(X_train), max_samples=1000)
explainer = shap.TreeExplainer(fitted_model, data=background, feature_names=X_train.columns)
shap_values = explainer(fitted_transformer.transform(X_test))

In [105]:
shap.summary_plot(shap_values, fitted_transformer.transform(X_test))

IndexError: index 7 is out of bounds for axis 0 with size 7

In [99]:
X_train.columns

Index(['Age', 'Gender', 'Stream', 'Internships', 'CGPA', 'Hostel',
       'HistoryOfBacklogs'],
      dtype='object')

In [100]:
X_test.columns

Index(['Age', 'Gender', 'Stream', 'Internships', 'CGPA', 'Hostel',
       'HistoryOfBacklogs'],
      dtype='object')

In [139]:
fitted_transformer.transform(X_test)

array([[-0.36717234,  0.95531779, -0.48768388,  1.65248922, -0.95021757,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.89646049, -2.14377653, -0.48768388, -0.60514767, -0.95021757,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [-0.36717234,  0.95531779, -0.48768388,  1.65248922,  0.40180367,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [-0.36717234, -0.07771365,  2.0505086 , -0.60514767,  0.40180367,
         1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-0.36717234, -1.11074509, -0.48768388,  1.65248922, -0.95021757,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.89646049, -0.07771365, -0.48768388,  1.65248922,  0.40180367,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.   

In [121]:
top_features = X_train.columns.tolist()

ca = CausalAnalysis(
    feature_inds=top_features,
    categorical=categorical,
    heterogeneity_inds=None,
    classification=True,
    nuisance_models="automl",
    heterogeneity_model="forest",
    n_jobs=1,
    random_state=123,
    upper_bound_on_cat_expansion=6
)

ca.fit(X_train, y_train.values)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please cha

<econml.solutions.causal_analysis._causal_analysis.CausalAnalysis at 0x7fb2ae90d8d0>

In [133]:
global_summ = ca.global_causal_effect(alpha=0.05)
global_summ.sort_values(by="p_value")

Unnamed: 0_level_0,Unnamed: 1_level_0,point,stderr,zstat,p_value,ci_lower,ci_upper
feature,feature_value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Stream,Computer SciencevCivil,0.122475,0.007065,17.335339,2.545618e-67,0.108628,0.136323
Stream,Information TechnologyvCivil,0.116797,0.007107,16.434687,1.079806e-60,0.102868,0.130726
CGPA,num,0.313591,0.021772,14.403599,4.911756e-47,0.270919,0.356263
Stream,Electronics And CommunicationvCivil,0.09765,0.007527,12.973247,1.7352039999999999e-38,0.082897,0.112403
Stream,ElectricalvCivil,0.091591,0.007308,12.532815,4.9378599999999995e-36,0.077268,0.105915
Internships,num,0.117061,0.030249,3.869961,0.0001088526,0.057775,0.176348
Stream,MechanicalvCivil,0.040728,0.015796,2.578348,0.009927401,0.009768,0.071688
Age,num,0.041162,0.020613,1.996865,0.04583981,0.000761,0.081564
HistoryOfBacklogs,num,-0.023322,0.054732,-0.426107,0.6700301,-0.130595,0.083951
Gender,MalevFemale,0.008091,0.04171,0.193976,0.8461943,-0.073659,0.089841


In [137]:
# helper function to plot error bar
def errorbar(res):
    xticks = res.index.get_level_values(0)
    lowererr = res["point"] - res["ci_lower"]
    uppererr = res["ci_upper"] - res["point"]
    xticks = [
        "{}***".format(t)
        if p < 1e-6
        else ("{}**".format(t) if p < 1e-3 else ("{}*".format(t) if p < 1e-2 else t))
        for t, p in zip(xticks, res["p_value"])
    ]
    print(xticks)
    plot_title = "Direct Causal Effect of Each Feature with 95% Confidence Interval, "
    plt.figure(figsize=(15, 5))
    plt.errorbar(
        np.arange(len(xticks)),
        res["point"],
        yerr=[lowererr, uppererr],
        fmt="o",
        capsize=5,
        capthick=1,
        barsabove=True,
    )
    plt.xticks(np.arange(len(xticks)), xticks, rotation=45)
    plt.title(plot_title)
    plt.axhline(0, color="r", linestyle="--", alpha=0.5)
    plt.ylabel("Average Treatment Effect")

In [138]:
errorbar(global_summ)

['Age', 'Gender', 'Stream***', 'Stream***', 'Stream***', 'Stream***', 'Stream*', 'Internships**', 'CGPA***', 'Hostel', 'HistoryOfBacklogs']


In [136]:
plt.figure(figsize=(12, 8))
ca.plot_heterogeneity_tree(X_test, 
                           "Stream", 
                           max_depth=2,
                           min_impurity_decrease=1e-7) 

Criterion 'mse' was deprecated in v1.0 and will be removed in version 1.2. Use `criterion='squared_error'` which is equivalent.
Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
