# SHEVA Shapley values for Earned VAlue management

# Model selection

## Dataset

In [3]:
import pandas as pd
import numpy as np

In [5]:
# Simulation dataset: null model (5-rand) used for comparison.
# The first CSV column becomes the row index and `critical_path` is cast to
# string so it is handled as a label rather than as a number.
data = (
    pd.read_csv('./data/simulation_EV0.75_5-rand.csv', index_col=0)
      .assign(critical_path=lambda frame: frame['critical_path'].astype('str'))
)

## Regression models

In [1]:
# Regression models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost.sklearn import XGBRegressor

In [2]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error

In [8]:
# Random seed, passed as `random_state` to the cross-validation splitters
seed = 1123

In [7]:
# DBAC regression
# Target: the `duration` column; predictors: the eight `duration@k` columns.
y = data['duration']
X = data[['duration@1', 'duration@2', 'duration@3', 'duration@4',
          'duration@5', 'duration@6', 'duration@7', 'duration@8']]

In [9]:
# AdaBoostRegressor: grid search
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html
# Grid: n_estimators in 100, 200, ..., 1000 and learning_rate in 0.1, 0.2, ..., 1.0,
# evaluated with GridSearchCV's default cross-validation and default scorer.
# The estimator is seeded with the notebook-level `seed`: AdaBoost is stochastic,
# so without a fixed random_state a re-run may select different hyper-parameters.
mdr = GridSearchCV(
    AdaBoostRegressor(random_state=seed),
    param_grid={"n_estimators": np.linspace(100, 1000, 10).astype('int'),
                "learning_rate": np.linspace(0.1, 1, 10)},
    n_jobs=8,
)
mdr.fit(X, y)
mdr.best_params_  # original (unseeded) run: 'learning_rate': 0.1, 'n_estimators': 100

{'learning_rate': 0.1, 'n_estimators': 100}

In [10]:
# GradientBoostingRegressor: grid search
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
# Grid: n_estimators in 100, 250, ..., 1000 (7 values) and max_depth in 1, 3, 5, 7, 10,
# evaluated with GridSearchCV's default cross-validation and default scorer.
# The estimator is seeded with the notebook-level `seed` so that re-running the
# search reproduces the same selection.
mdr = GridSearchCV(
    GradientBoostingRegressor(random_state=seed),
    param_grid={"n_estimators": np.linspace(100, 1000, 7).astype('int'),
                "max_depth": np.linspace(1, 10, 5).astype('int')},
    n_jobs=8,
)
mdr.fit(X, y)
mdr.best_params_  # original (unseeded) run: 'max_depth': 3, 'n_estimators': 100

{'max_depth': 3, 'n_estimators': 100}

In [11]:
# RandomForestRegressor: grid search
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# Grid: n_estimators in 100, 250, ..., 1000 (7 values) and max_depth in 1, 3, 5, 7, 10,
# evaluated with GridSearchCV's default cross-validation and default scorer.
# The forest is seeded with the notebook-level `seed`: bootstrap sampling makes an
# unseeded forest differ from run to run, which can change the selected parameters.
mdr = GridSearchCV(
    RandomForestRegressor(random_state=seed),
    param_grid={"n_estimators": np.linspace(100, 1000, 7).astype('int'),
                "max_depth": np.linspace(1, 10, 5).astype('int')},
    n_jobs=8,
)
mdr.fit(X, y)
mdr.best_params_  # original (unseeded) run: 'max_depth': 10, 'n_estimators': 850

{'max_depth': 10, 'n_estimators': 850}

In [12]:
# XGBRegressor: grid search over the number of trees and the tree depth
# https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn
# Grid: n_estimators in 100, 250, ..., 1000 (7 values) and max_depth in 1, 3, 5, 7, 10.
mdr = GridSearchCV(
    XGBRegressor(verbosity=0),
    param_grid=dict(
        n_estimators=np.linspace(100, 1000, 7).astype('int'),
        max_depth=np.linspace(1, 10, 5).astype('int'),
    ),
    n_jobs=8,
)
mdr.fit(X, y)
mdr.best_params_  # 'max_depth': 1, 'n_estimators': 250

{'max_depth': 1, 'n_estimators': 250}

In [13]:
# Selection of models by 10-fold CV
# Every candidate regressor, configured with the hyper-parameters recorded from
# its grid search, is refit on each training fold and scored by MSE on the
# held-out fold. `results` holds one row per (fold, model).
kfold = KFold(n_splits=10, random_state=seed, shuffle=True)
# Each model is seeded with the same `seed` as the splitter so the whole
# comparison is reproducible, not only the fold assignment.
models = [('AdaB', AdaBoostRegressor(learning_rate=0.1, n_estimators=100, random_state=seed)),
          ('GBR', GradientBoostingRegressor(max_depth=3, n_estimators=100, random_state=seed)),
          ('RF', RandomForestRegressor(max_depth=10, n_estimators=850, random_state=seed)),
          ('XGB', XGBRegressor(max_depth=1, n_estimators=250, random_state=seed))]
# Scores are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copies the whole frame on every iteration.
fold_scores = []
for k, (train_index, test_index) in enumerate(kfold.split(X)):
    # Slice the fold once; every model is trained and tested on the same split.
    X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]
    for name, mdr in models:
        mdr.fit(X_train, y_train)
        fold_scores.append({'model': name,
                            'kf': k,
                            'MSE': mean_squared_error(y_test, mdr.predict(X_test))})
# Built in one step, so the index is already 0..n-1 and needs no reset.
results = pd.DataFrame(fold_scores, columns=['model', 'kf', 'MSE'])

In [14]:
# 10-fold CV results: mean MSE per model across the folds, lowest (best) first.
# The MSE column is selected explicitly so the fold index `kf` is never averaged
# into the table, whatever its dtype or the pandas version.
results.groupby('model')[['MSE']].mean().sort_values('MSE', ascending=True)

Unnamed: 0_level_0,MSE
model,Unnamed: 1_level_1
GBR,9.182019
RF,9.188798
XGB,9.207503
AdaB,11.076624


## Classifier models

In [15]:
# Classifier models
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from xgboost.sklearn import XGBClassifier

In [16]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [17]:
# Classes
# The expected time of the project is 13: a row is labelled as delayed
# (True) when its duration is strictly greater than that value.
data['delay'] = data['duration'].gt(13)

In [18]:
# Classification target: the boolean `delay` label.
# Predictors: the same eight `duration@k` columns used for the regression.
y = data['delay']
X = data[['duration@1', 'duration@2', 'duration@3', 'duration@4',
          'duration@5', 'duration@6', 'duration@7', 'duration@8']]

In [19]:
# AdaBoostClassifier: grid search
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
# Grid: n_estimators in 100, 200, ..., 1000 and learning_rate in 0.1, 0.2, ..., 1.0,
# evaluated with GridSearchCV's default cross-validation and default scorer.
# The estimator is seeded with the notebook-level `seed` so that re-running the
# search reproduces the same selection.
mdr = GridSearchCV(
    AdaBoostClassifier(random_state=seed),
    param_grid={"n_estimators": np.linspace(100, 1000, 10).astype('int'),
                "learning_rate": np.linspace(0.1, 1, 10)},
    n_jobs=8,
)
mdr.fit(X, y)
mdr.best_params_  # original (unseeded) run: 'learning_rate': 0.9, 'n_estimators': 300

{'learning_rate': 0.9, 'n_estimators': 300}

In [20]:
# GradientBoostingClassifier: grid search
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# Grid: n_estimators in 100, 250, ..., 1000 (7 values) and max_depth in 1, 3, 5, 7, 10,
# evaluated with GridSearchCV's default cross-validation and default scorer.
# The estimator is seeded with the notebook-level `seed` so that re-running the
# search reproduces the same selection.
mdr = GridSearchCV(
    GradientBoostingClassifier(random_state=seed),
    param_grid={"n_estimators": np.linspace(100, 1000, 7).astype('int'),
                "max_depth": np.linspace(1, 10, 5).astype('int')},
    n_jobs=8,
)
mdr.fit(X, y)
mdr.best_params_  # original (unseeded) run: 'max_depth': 5, 'n_estimators': 100

{'max_depth': 5, 'n_estimators': 100}

In [21]:
# RandomForestClassifier: grid search
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# Grid: n_estimators in 100, 250, ..., 1000 (7 values) and max_depth in 1, 3, 5, 7, 10,
# evaluated with GridSearchCV's default cross-validation and default scorer.
# The forest is seeded with the notebook-level `seed`: bootstrap sampling makes an
# unseeded forest differ from run to run, which can change the selected parameters.
mdr = GridSearchCV(
    RandomForestClassifier(random_state=seed),
    param_grid={"n_estimators": np.linspace(100, 1000, 7).astype('int'),
                "max_depth": np.linspace(1, 10, 5).astype('int')},
    n_jobs=8,
)
mdr.fit(X, y)
mdr.best_params_  # original (unseeded) run: 'max_depth': 10, 'n_estimators': 850

{'max_depth': 10, 'n_estimators': 850}

In [22]:
# XGBClassifier: grid search over the number of trees and the tree depth
# https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn
# Grid: n_estimators in 100, 250, ..., 1000 (7 values) and max_depth in 1, 3, 5, 7, 10.
mdr = GridSearchCV(
    XGBClassifier(verbosity=0),
    param_grid=dict(
        n_estimators=np.linspace(100, 1000, 7).astype('int'),
        max_depth=np.linspace(1, 10, 5).astype('int'),
    ),
    n_jobs=8,
)
mdr.fit(X, y)
mdr.best_params_  # 'max_depth': 3, 'n_estimators': 100



{'max_depth': 3, 'n_estimators': 100}

In [None]:
# Selection of models by 10-fold CV
# Every candidate classifier, configured with the hyper-parameters recorded from
# its grid search, is refit on each stratified training fold and scored by
# accuracy on the held-out fold. `results2` holds one row per (fold, model).
kfold = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)
# Each model is seeded with the same `seed` as the splitter so the whole
# comparison is reproducible, not only the fold assignment.
models = [('AdaB', AdaBoostClassifier(learning_rate=0.9, n_estimators=300, random_state=seed)),
          ('GBR', GradientBoostingClassifier(max_depth=5, n_estimators=100, random_state=seed)),
          ('RF', RandomForestClassifier(max_depth=10, n_estimators=850, random_state=seed)),
          ('XGB', XGBClassifier(max_depth=3, n_estimators=100, random_state=seed))]
# Scores are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copies the whole frame on every iteration.
fold_scores = []
for k, (train_index, test_index) in enumerate(kfold.split(X, y)):
    # Slice the fold once; every model is trained and tested on the same split.
    X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index, :], y.iloc[test_index]
    for name, mdr in models:
        mdr.fit(X_train, y_train)
        fold_scores.append({'model': name,
                            'kf': k,
                            'Accuracy': accuracy_score(y_test, mdr.predict(X_test))})
# Built in one step, so the index is already 0..n-1 and needs no reset.
results2 = pd.DataFrame(fold_scores, columns=['model', 'kf', 'Accuracy'])

In [25]:
# 10-fold CV results: mean accuracy per model across the folds.
# Sorted ascending, so the most accurate model is the LAST row.
# The Accuracy column is selected explicitly so the fold index `kf` is never
# averaged into the table, whatever its dtype or the pandas version.
results2.groupby('model')[['Accuracy']].mean().sort_values('Accuracy', ascending=True)

Unnamed: 0_level_0,Accuracy
model,Unnamed: 1_level_1
AdaB,0.85314
RF,0.85482
GBR,0.85646
XGB,0.85694
