# Explainable machine learning for project control

# Model selection

## Dataset

In [11]:
import pandas as pd
import numpy as np

In [12]:
# Simulation dataset
# Null model (5-rand) used for comparison
data = pd.read_csv('./data/simulation_EV0.75_5-rand.csv', index_col=0)
# store critical_path as strings, whatever dtype read_csv inferred for it
data = data.astype({'critical_path': 'str'})

## Regression models

### Model selection
We use nested cross-validation on the null-model (5-rand) simulation dataset for the backward regression problem: DBAC ~ {duration of activity i at 75% EV}, i = 1, ..., 8.

In [13]:
# Regression models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost.sklearn import XGBRegressor

In [14]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error

In [18]:
# Random seed, passed as random_state to the outer cross-validation splitters
seed = 1123

In [19]:
# DBAC regression: the target is the 'duration' column and the predictors are
# the eight 'duration@1' ... 'duration@8' columns
y = data['duration']
X = data[['duration@1', 'duration@2', 'duration@3', 'duration@4',
          'duration@5', 'duration@6', 'duration@7', 'duration@8']]

In [20]:
# Outer folds of the nested cross-validation: 5 shuffled splits, seeded so
# every model below is evaluated on the same partitions
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=seed,
)

In [21]:
# save outer kfolds
# One row per (fold, sample): 'kn' is the outer fold number, 'type' is
# 'train' or 'test', and 'i' is the positional row index yielded by
# kfold.split(X).
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on
# every call, so the per-fold pieces are collected and concatenated once.
fold_frames = []
for k, (train_index, test_index) in enumerate(kfold.split(X)):
  for split_type, row_positions in (('train', train_index), ('test', test_index)):
    fold_frames.append(pd.DataFrame({'kn': k, 'type': split_type, 'i': row_positions}))
kfs = pd.concat(fold_frames, ignore_index=True)
kfs.to_csv('./data/regression_model_selection_kf_index.csv')

In [23]:
# Reload the saved outer-fold assignment (first CSV column is the row index)
kfs = pd.read_csv(
    './data/regression_model_selection_kf_index.csv',
    index_col=0,
)

In [24]:
# Empty table that collects one row per (model, outer fold) with its test MSE
results = pd.DataFrame(data=[], columns=['model', 'kf', 'MSE'])

In [25]:
# RandomForestRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# Nested cross-validation: for every outer fold, GridSearchCV tunes the
# hyper-parameters on the training part (its default inner cross-validation)
# and the refitted best estimator is scored once on the held-out part.
model='RFR'
fold_rows = []
for k, (train_index, test_index) in enumerate(kfold.split(X)):

  # parameter optimization in the inner folds
    # n_estimators: number of trees
    # max_depth of the trees
  # random_state=seed makes the fitted forests (and the scores) reproducible
  mdr = GridSearchCV(RandomForestRegressor(random_state=seed),
                    param_grid={"n_estimators": np.linspace(100,1000,5).astype('int'),
                                "max_depth": np.linspace(1,10,5).astype('int')},
                    n_jobs=6)
  mdr.fit(X.iloc[train_index,:], y.iloc[train_index])

  # score of the best model in the outer fold
  y_pred = mdr.best_estimator_.predict(X.iloc[test_index,:])
  fold_rows.append([model, k, mean_squared_error(y.iloc[test_index], y_pred)])

# DataFrame.append was removed in pandas 2.0: build this model's rows once and
# concatenate. Rows already stored for this model are dropped first so that
# re-running the cell does not duplicate them.
fold_scores = pd.DataFrame(fold_rows, columns=['model','kf','MSE'])
previous = results[results['model'] != model]
results = fold_scores if previous.empty else pd.concat([previous, fold_scores], ignore_index=True)

In [26]:
# GradientBoostingRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
# Nested cross-validation: for every outer fold, GridSearchCV tunes the
# hyper-parameters on the training part (its default inner cross-validation)
# and the refitted best estimator is scored once on the held-out part.
model='GradientBoostingR'
fold_rows = []
for k, (train_index, test_index) in enumerate(kfold.split(X)):

  # parameter optimization in the inner folds
    # n_estimators: number of  boosting stages
    # max_depth of the regression estimators
  # random_state=seed makes the fitted ensembles (and the scores) reproducible
  mdr = GridSearchCV(GradientBoostingRegressor(random_state=seed),
                    param_grid={"n_estimators": np.linspace(100,1000,5).astype('int'),
                                "max_depth": np.linspace(1,10,5).astype('int')},
                    n_jobs=6)
  mdr.fit(X.iloc[train_index,:], y.iloc[train_index])

  # score of the best model in the outer fold
  y_pred = mdr.best_estimator_.predict(X.iloc[test_index,:])
  fold_rows.append([model, k, mean_squared_error(y.iloc[test_index], y_pred)])

# DataFrame.append was removed in pandas 2.0: build this model's rows once and
# concatenate. Rows already stored for this model are dropped first so that
# re-running the cell does not duplicate them.
fold_scores = pd.DataFrame(fold_rows, columns=['model','kf','MSE'])
previous = results[results['model'] != model]
results = fold_scores if previous.empty else pd.concat([previous, fold_scores], ignore_index=True)

In [27]:
# AdaBoostRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html
# Nested cross-validation: for every outer fold, GridSearchCV tunes the
# hyper-parameters on the training part (its default inner cross-validation)
# and the refitted best estimator is scored once on the held-out part.
model='AdaBoostR'
fold_rows = []
for k, (train_index, test_index) in enumerate(kfold.split(X)):

  # parameter optimization in the inner folds
    # n_estimators: maximum number of estimators at which boosting is terminated
    # learning_rate: weight applied to each regressor at each boosting iteration
  # random_state=seed makes the fitted ensembles (and the scores) reproducible
  mdr = GridSearchCV(AdaBoostRegressor(random_state=seed),
                    param_grid={"n_estimators": np.linspace(100,1000,5).astype('int'),
                                "learning_rate": np.linspace(0.1,1,5)},
                    n_jobs=6)
  mdr.fit(X.iloc[train_index,:], y.iloc[train_index])

  # score of the best model in the outer fold
  y_pred = mdr.best_estimator_.predict(X.iloc[test_index,:])
  fold_rows.append([model, k, mean_squared_error(y.iloc[test_index], y_pred)])

# DataFrame.append was removed in pandas 2.0: build this model's rows once and
# concatenate. Rows already stored for this model are dropped first so that
# re-running the cell does not duplicate them.
fold_scores = pd.DataFrame(fold_rows, columns=['model','kf','MSE'])
previous = results[results['model'] != model]
results = fold_scores if previous.empty else pd.concat([previous, fold_scores], ignore_index=True)

In [28]:
# XGBRegressor
# https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn
# Nested cross-validation: for every outer fold, GridSearchCV tunes the
# hyper-parameters on the training part (its default inner cross-validation)
# and the refitted best estimator is scored once on the held-out part.
model='XGBoostR'
fold_rows = []
for k, (train_index, test_index) in enumerate(kfold.split(X)):

  # parameter optimization in the inner folds
    # n_estimators: number of gradient boosted trees
    # max_depth: maximum tree depth for base learners
  # random_state=seed makes the fitted boosters (and the scores) reproducible
  mdr = GridSearchCV(XGBRegressor(verbosity = 0, random_state=seed),
                    param_grid={"n_estimators": np.linspace(100,1000,5).astype('int'),
                                "max_depth": np.linspace(1,10,5).astype('int')},
                    n_jobs=6)
  mdr.fit(X.iloc[train_index,:], y.iloc[train_index])

  # score of the best model in the outer fold
  y_pred = mdr.best_estimator_.predict(X.iloc[test_index,:])
  fold_rows.append([model, k, mean_squared_error(y.iloc[test_index], y_pred)])

# DataFrame.append was removed in pandas 2.0: build this model's rows once and
# concatenate. Rows already stored for this model are dropped first so that
# re-running the cell does not duplicate them.
fold_scores = pd.DataFrame(fold_rows, columns=['model','kf','MSE'])
previous = results[results['model'] != model]
results = fold_scores if previous.empty else pd.concat([previous, fold_scores], ignore_index=True)

In [30]:
# Persist the per-fold regression scores with a clean 0..n-1 index
results = results.reset_index(drop=True)
results.to_csv('./data/regression_model_selection_results.csv')

In [None]:
# Reload the saved regression scores (first CSV column is the row index)
results = pd.read_csv(
    './data/regression_model_selection_results.csv',
    index_col=0,
)

In [31]:
# Mean and standard deviation of the MSE over the 5 outer folds,
# best (lowest mean MSE) model first
(
    results
    .groupby('model')
    .agg({'MSE': ['mean', 'std']})
    .sort_values(by=('MSE', 'mean'))
)

Unnamed: 0_level_0,MSE,MSE
Unnamed: 0_level_1,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2
GradientBoostingR,9.193957,0.273381
RFR,9.205236,0.251368
XGBoostR,9.216796,0.268691
AdaBoostR,11.061972,0.123189


### Hyper-parameters selection of Gradient Boosting

#### Forward analysis
DBAC~ {activity i's duration} i=1,...,8

In [53]:
# Forward regression: the target is the 'duration' column and the predictors
# are the eight 'duration1' ... 'duration8' columns
y = data['duration']
X = data[['duration1', 'duration2', 'duration3', 'duration4',
          'duration5', 'duration6', 'duration7', 'duration8']]

In [54]:
# GradientBoostingRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
# Grid search over the number of boosting stages and the tree depth,
# fitted on the whole dataset
mdr = GridSearchCV(
    GradientBoostingRegressor(),
    {
        "n_estimators": np.linspace(100, 1000, 5).astype('int'),
        "max_depth": np.linspace(1, 10, 5).astype('int'),
    },
    n_jobs=8,
).fit(X, y)
mdr.best_params_  # recorded run: 'max_depth': 10, 'n_estimators': 1000

{'max_depth': 10, 'n_estimators': 1000}

#### Backward analysis
DBAC~ {activity i's duration at 75%EV} i=1,...,8

In [None]:
# Backward regression (DBAC): the target is the 'duration' column and the
# predictors are the eight 'duration@1' ... 'duration@8' columns
y = data['duration']
X = data[['duration@1', 'duration@2', 'duration@3', 'duration@4',
          'duration@5', 'duration@6', 'duration@7', 'duration@8']]

In [33]:
# GradientBoostingRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
# Grid search over the number of boosting stages and the tree depth,
# fitted on the whole dataset
mdr = GridSearchCV(
    GradientBoostingRegressor(),
    {
        "n_estimators": np.linspace(100, 1000, 5).astype('int'),
        "max_depth": np.linspace(1, 10, 5).astype('int'),
    },
    n_jobs=8,
).fit(X, y)
mdr.best_params_  # recorded run: 'max_depth': 3, 'n_estimators': 100

{'max_depth': 3, 'n_estimators': 100}

#### Backward analysis
TB~ {activity i's duration at 75%EV} i=1,...,8

In [55]:
# Backward regression (TB): the target is the 'duration@' column and the
# predictors are the eight 'duration@1' ... 'duration@8' columns
y = data['duration@']
X = data[['duration@1', 'duration@2', 'duration@3', 'duration@4',
          'duration@5', 'duration@6', 'duration@7', 'duration@8']]

In [56]:
# GradientBoostingRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
# Grid search over the number of boosting stages and the tree depth,
# fitted on the whole dataset
mdr = GridSearchCV(
    GradientBoostingRegressor(),
    {
        "n_estimators": np.linspace(100, 1000, 5).astype('int'),
        "max_depth": np.linspace(1, 10, 5).astype('int'),
    },
    n_jobs=8,
).fit(X, y)
mdr.best_params_  # recorded run: 'max_depth': 5, 'n_estimators': 1000

{'max_depth': 5, 'n_estimators': 1000}

## Classifier models

In [57]:
# Classifier models
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from xgboost.sklearn import XGBClassifier

In [58]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [36]:
# Binary class label: True when the project duration exceeds 13,
# the expected time of the project
data['delay'] = data['duration'].gt(13)

In [37]:
# Classification target ('delay') and the eight 'duration@i' predictors
y = data['delay']
X = data[['duration@1', 'duration@2', 'duration@3', 'duration@4',
          'duration@5', 'duration@6', 'duration@7', 'duration@8']]

In [38]:
# Outer folds for the classifiers: 5 shuffled, seeded splits that are
# stratified on the class label
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=seed,
)

In [39]:
# Empty table that collects one row per (model, outer fold) with its test accuracy
results = pd.DataFrame(data=[], columns=['model', 'kf', 'Accuracy'])

In [44]:
# RandomForestClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# Nested cross-validation: for every outer fold, GridSearchCV tunes the
# hyper-parameters on the training part (its default inner cross-validation)
# and the refitted best estimator is scored once on the held-out part.
model='RFC'
fold_rows = []
for k, (train_index, test_index) in enumerate(kfold.split(X,y)):

  # parameter optimization in the inner folds
    # n_estimators: number of trees
    # max_depth of the trees
  # random_state=seed makes the fitted forests (and the scores) reproducible
  mdc = GridSearchCV(RandomForestClassifier(random_state=seed),
                    param_grid={"n_estimators": np.linspace(100,1000,5).astype('int'),
                                "max_depth": np.linspace(1,10,5).astype('int')},
                    n_jobs=6)
  mdc.fit(X.iloc[train_index,:], y.iloc[train_index])

  # score of the best model in the outer fold
  y_pred = mdc.best_estimator_.predict(X.iloc[test_index,:])
  fold_rows.append([model, k, accuracy_score(y.iloc[test_index], y_pred)])

# DataFrame.append was removed in pandas 2.0: build this model's rows once and
# concatenate. Rows already stored for this model are dropped first so that
# re-running the cell does not duplicate them.
fold_scores = pd.DataFrame(fold_rows, columns=['model','kf','Accuracy'])
previous = results[results['model'] != model]
results = fold_scores if previous.empty else pd.concat([previous, fold_scores], ignore_index=True)

In [45]:
# GradientBoostingClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# Nested cross-validation: for every outer fold, GridSearchCV tunes the
# hyper-parameters on the training part (its default inner cross-validation)
# and the refitted best estimator is scored once on the held-out part.
model='GradientBoostingC'
fold_rows = []
for k, (train_index, test_index) in enumerate(kfold.split(X,y)):

  # parameter optimization in the inner folds
    # n_estimators: number of  boosting stages
    # max_depth of the regression estimators
  # random_state=seed makes the fitted ensembles (and the scores) reproducible
  mdc = GridSearchCV(GradientBoostingClassifier(random_state=seed),
                    param_grid={"n_estimators": np.linspace(100,1000,5).astype('int'),
                                "max_depth": np.linspace(1,10,5).astype('int')},
                    n_jobs=6)
  mdc.fit(X.iloc[train_index,:], y.iloc[train_index])

  # score of the best model in the outer fold
  y_pred = mdc.best_estimator_.predict(X.iloc[test_index,:])
  fold_rows.append([model, k, accuracy_score(y.iloc[test_index], y_pred)])

# DataFrame.append was removed in pandas 2.0: build this model's rows once and
# concatenate. Rows already stored for this model are dropped first so that
# re-running the cell does not duplicate them.
fold_scores = pd.DataFrame(fold_rows, columns=['model','kf','Accuracy'])
previous = results[results['model'] != model]
results = fold_scores if previous.empty else pd.concat([previous, fold_scores], ignore_index=True)

In [46]:
# AdaBoostClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
# Nested cross-validation: for every outer fold, GridSearchCV tunes the
# hyper-parameters on the training part (its default inner cross-validation)
# and the refitted best estimator is scored once on the held-out part.
model='AdaBoostC'
fold_rows = []
for k, (train_index, test_index) in enumerate(kfold.split(X,y)):

  # parameter optimization in the inner folds
    # n_estimators: maximum number of estimators at which boosting is terminated
    # learning_rate: weight applied to each classifier at each boosting iteration
  # random_state=seed makes the fitted ensembles (and the scores) reproducible
  mdc = GridSearchCV(AdaBoostClassifier(random_state=seed),
                    param_grid={"n_estimators": np.linspace(100,1000,5).astype('int'),
                                "learning_rate": np.linspace(0.1,1,5)},
                    n_jobs=6)
  mdc.fit(X.iloc[train_index,:], y.iloc[train_index])

  # score of the best model in the outer fold
  y_pred = mdc.best_estimator_.predict(X.iloc[test_index,:])
  fold_rows.append([model, k, accuracy_score(y.iloc[test_index], y_pred)])

# DataFrame.append was removed in pandas 2.0: build this model's rows once and
# concatenate. Rows already stored for this model are dropped first so that
# re-running the cell does not duplicate them.
fold_scores = pd.DataFrame(fold_rows, columns=['model','kf','Accuracy'])
previous = results[results['model'] != model]
results = fold_scores if previous.empty else pd.concat([previous, fold_scores], ignore_index=True)

In [None]:
# XGBClassifier
# https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn
# Nested cross-validation: for every outer fold, GridSearchCV tunes the
# hyper-parameters on the training part (its default inner cross-validation)
# and the refitted best estimator is scored once on the held-out part.
model='XGBoostC'
fold_rows = []
for k, (train_index, test_index) in enumerate(kfold.split(X,y)):

  # parameter optimization in the inner folds
    # n_estimators: number of gradient boosted trees
    # max_depth: maximum tree depth for base learners
  # random_state=seed makes the fitted boosters (and the scores) reproducible
  mdc = GridSearchCV(XGBClassifier(verbosity = 0, random_state=seed),
                    param_grid={"n_estimators": np.linspace(100,1000,5).astype('int'),
                                "max_depth": np.linspace(1,10,5).astype('int')},
                    n_jobs=6)
  mdc.fit(X.iloc[train_index,:], y.iloc[train_index])

  # score of the best model in the outer fold
  y_pred = mdc.best_estimator_.predict(X.iloc[test_index,:])
  fold_rows.append([model, k, accuracy_score(y.iloc[test_index], y_pred)])

# DataFrame.append was removed in pandas 2.0: build this model's rows once and
# concatenate. Rows already stored for this model are dropped first so that
# re-running the cell does not duplicate them.
fold_scores = pd.DataFrame(fold_rows, columns=['model','kf','Accuracy'])
previous = results[results['model'] != model]
results = fold_scores if previous.empty else pd.concat([previous, fold_scores], ignore_index=True)

In [49]:
# Persist the per-fold classification scores with a clean 0..n-1 index
results = results.reset_index(drop=True)
results.to_csv('./data/classification_model_selection_results.csv')

In [None]:
# Reload the saved classification scores (first CSV column is the row index)
results = pd.read_csv(
    './data/classification_model_selection_results.csv',
    index_col=0,
)

In [52]:
# Mean and standard deviation of the accuracy over the 5 outer folds,
# best (highest mean accuracy) model first
(
    results
    .groupby('model')
    .agg({'Accuracy': ['mean', 'std']})
    .sort_values(by=('Accuracy', 'mean'), ascending=False)
)

Unnamed: 0_level_0,Accuracy,Accuracy
Unnamed: 0_level_1,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2
GradientBoostingC,0.85704,0.002016
XGBoostC,0.85642,0.002279
RFC,0.85444,0.003512
AdaBoostC,0.85368,0.002894


### Hyper-parameters selection

#### Backward analysis
C_dbac~ {activity i's duration at 75%EV} i=1,...,8

In [None]:
# Classification target ('delay') and the eight 'duration@i' predictors
y = data['delay']
X = data[['duration@1', 'duration@2', 'duration@3', 'duration@4',
          'duration@5', 'duration@6', 'duration@7', 'duration@8']]

In [51]:
# GradientBoostingClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# Grid search over the number of boosting stages and the tree depth,
# fitted on the whole dataset
mdc = GridSearchCV(
    GradientBoostingClassifier(),
    {
        "n_estimators": np.linspace(100, 1000, 5).astype('int'),
        "max_depth": np.linspace(1, 10, 5).astype('int'),
    },
    n_jobs=8,
).fit(X, y)
mdc.best_params_  # recorded run: {'max_depth': 5, 'n_estimators': 100}

{'max_depth': 5, 'n_estimators': 100}

#### Backward analysis
C_tb~ {activity i's duration at 75%EV} i=1,...,8

In [60]:
# Binary class label at 75% EV: True when 'duration@' exceeds 9.1763,
# the expected time of the project at 75% EV
data['delay@'] = data['duration@'].gt(9.1763)

In [61]:
# Classification target ('delay@') and the eight 'duration@i' predictors
y = data['delay@']
X = data[['duration@1', 'duration@2', 'duration@3', 'duration@4',
          'duration@5', 'duration@6', 'duration@7', 'duration@8']]

In [62]:
# GradientBoostingClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# Grid search over the number of boosting stages and the tree depth,
# fitted on the whole dataset
mdc = GridSearchCV(
    GradientBoostingClassifier(),
    {
        "n_estimators": np.linspace(100, 1000, 5).astype('int'),
        "max_depth": np.linspace(1, 10, 5).astype('int'),
    },
    n_jobs=8,
).fit(X, y)
mdc.best_params_  # recorded run: {'max_depth': 7, 'n_estimators': 100}

{'max_depth': 7, 'n_estimators': 100}