In [33]:
import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
import os as os
import sys
sys.path.append('../../../../')
import Constants as c
import Utilities as utils
from Scaler import Scaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
%matplotlib inline
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from fbprophet import Prophet
from fbprophet.diagnostics import performance_metrics
from fbprophet.diagnostics import cross_validation
from fbprophet.plot import plot_cross_validation_metric
from scipy.special import inv_boxcox
from scipy.stats import boxcox
sns.set_style("darkgrid")

In [34]:
# train effort model
# validate effort model
# train forecasting model on 5 years of data
# test on 1 year of data
# calculate ESP for that 1 year (observed data)
# calculate ESP for that 1 year (forecasted data)
# compare

In [35]:
def forecast_variable(df, variable):
    """Forecast one column of ``df`` with Prophet and return its cross-validated predictions.

    The series is Box-Cox transformed (after a +1 shift) before fitting, and the
    cross-validated forecasts are mapped back to the original scale before being
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``c.DATE`` column and the column named by ``variable``.
    variable : str
        Name of the column to forecast.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``c.DATE`` and ``variable`` (the back-transformed ``yhat``
        produced by Prophet's ``cross_validation``).
    """
    # Explicit copy: the columns are renamed and extended below, which must not
    # act on (or warn about) a view of the caller's frame.
    y_df = df[[c.DATE, variable]].copy()
    y_df.columns = ['ds', 'y']

    # Keep the untransformed values next to the transformed ones.
    y_df['y_orig'] = y_df['y']
    # Box-Cox requires strictly positive data, hence the +1 shift.
    y_df['y'], lam = boxcox(y_df['y'] + 1)

    m_y_df = Prophet(uncertainty_samples=0, growth='linear')
    m_y_df.fit(y_df)

    m_y_df_cv = cross_validation(m_y_df, initial='1825 days', period='365 days', horizon='365 days')

    # Undo the transform in reverse order: invert Box-Cox, then remove the +1
    # shift that was applied before fitting (previously omitted, which left
    # every forecast one unit too high).
    m_y_df_cv['yhat'] = inv_boxcox(m_y_df_cv['yhat'].values, lam) - 1

    m_y_df_cv = m_y_df_cv[['ds', 'yhat']].copy()
    m_y_df_cv.columns = [c.DATE, variable]
    return m_y_df_cv

In [36]:
def predict_effort(df, X, Y):
    """Return out-of-fold predictions of ``Y`` from ``X`` using K-fold cross-validation.

    The estimator is built from the notebook-level ``regressor`` and
    ``transformer`` objects: the transformer scales the features inside a
    pipeline and is also applied to the target through
    ``TransformedTargetRegressor``.

    Parameters
    ----------
    df : pandas.DataFrame
        Unused; kept so existing calls keep working.
    X : array-like
        Feature matrix, one row per record.
    Y : array-like
        Target values aligned with ``X``.

    Returns
    -------
    numpy.ndarray
        One cross-validated prediction per row of ``X``.

    Raises
    ------
    ValueError
        If fewer than two records are supplied (K-fold needs at least 2 splits).
    """
    max_splits = 10
    num_records = len(X)

    if num_records < 2:
        raise ValueError(
            "predict_effort needs at least 2 records for K-fold cross-validation, got {0}".format(num_records)
        )

    # Never ask for more folds than there are records.
    splits = min(max_splits, num_records)

    pipeline = Pipeline(steps=[('scaler', transformer), ('predictor', regressor)])
    model = TransformedTargetRegressor(regressor=pipeline, transformer=transformer)

    # No explicit model.fit() here: cross_val_predict clones and fits the
    # estimator on each training fold, so a full-data fit would be discarded.
    kfold = model_selection.KFold(n_splits=splits)
    return cross_val_predict(model, X, Y, cv=kfold)

In [37]:
# Candidate effort models, keyed by class name.
regressors = {
  "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0, max_depth=10, min_samples_split=10, max_features="sqrt"),
  "RandomForestRegressor": RandomForestRegressor(random_state=0, max_depth=10, min_samples_split=10, max_features="sqrt", n_estimators=10),
  "LinearRegression": LinearRegression(),
  "RidgeCV": RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1])
}

# Candidate scalers, keyed by class name. The chosen one is used both on the
# features and on the target (see predict_effort), so it must be invertible.
transformers = {
  "RobustScaler": RobustScaler(),
  "StandardScaler": StandardScaler(),
  "MinMaxScaler": MinMaxScaler(),
  "QuantileTransformer": QuantileTransformer(),
  # Without inverse_func the inverse defaults to the identity, which would leave
  # target predictions in log space; expm1 is the exact inverse of log1p.
  "FunctionTransformer": FunctionTransformer(np.log1p, inverse_func=np.expm1)
}

# Active configuration used by predict_effort.
regressor = regressors["DecisionTreeRegressor"]
transformer = transformers["QuantileTransformer"]

In [38]:
# Where the exported dataset lives and which project/task to analyse.
directoryPath = "../../../../exports"
project_name = "angular.js"
task = "BUG"
# Path template: <exports dir>/<project>/<project>_dataset_<task>.csv
file = "{0}/{1}/{2}_dataset_{3}.csv"

dataset_path = file.format(directoryPath, project_name, project_name, task)
df = pd.read_csv(dataset_path)

In [39]:
# Parse the date column into timestamps.
df[c.DATE] = pd.to_datetime(df[c.DATE])

# Add lagged copies (shifted down by one row) of the T_MODULE and T_LINE columns.
for lagged_name, source_name in ((c.T_MODULE_P, c.T_MODULE), (c.T_LINE_P, c.T_LINE)):
    df[lagged_name] = df[source_name].shift()

# shift() leaves a NaN in the first row; replace every missing value in the frame with 0.
has_missing = df.isna().values.any()
if has_missing:
    df.fillna(0, inplace=True)

In [40]:
# Core Contributors
df_nt_cc, df_no_cc, df_t_cc, df_t_line_p = [
    forecast_variable(df, column)
    for column in (c.NT_CC, c.NO_CC, c.T_CC, c.T_LINE_P)
]

# External Contributors
df_nt_ec, df_no_ec, df_t_ec = [
    forecast_variable(df, column)
    for column in (c.NT_EC, c.NO_EC, c.T_EC)
]

INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Making 4 forecasts with cutoffs between 2015-11-20 00:00:00 and 2018-11-19 00:00:00
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Making 4 forecasts with cutoffs between 2015-11-20 00:00:00 and 2018-11-19 00:00:00
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Making 4 forecasts with cutoffs between 2015-11-20 00:00:00 and 2018-11-19 00:00:00
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Making 4 forecasts with cutoffs between 2015-11-20 00:00:00 and 2018-11-19 00:00:00
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Making 4 forecasts with cutoffs between 2015-11-20 00:00:00 and 2018-11-19 00:0

In [41]:
# Attach each forecast to the observed data by date. Every forecast column has
# the same name as an observed column, so rsuffix renames it to "<name>_PRED";
# the inner join keeps only dates present on both sides.
forecast_frames = (df_nt_cc, df_no_cc, df_t_cc, df_t_line_p, df_nt_ec, df_no_ec, df_t_ec)
for forecast_frame in forecast_frames:
    df = df.join(forecast_frame.set_index(c.DATE), on=c.DATE, rsuffix='_PRED', how='inner')

In [42]:
# Keep only the date, the observed targets and the forecasted features.
# .copy() makes the selection an independent frame, because the following cells
# add prediction columns to it.
df = df[[c.DATE, c.MODULE_CC, c.MODULE_EC, c.LINE_CC, c.LINE_EC, c.NT_CC, "NT_CC_PRED", "NO_CC_PRED", "T_CC_PRED", "NT_EC_PRED", "NO_EC_PRED", "T_EC_PRED", "T_Line_P_PRED"]].copy()
# df.head()

In [43]:
# Cross-validated prediction of c.MODULE_CC from the forecasted *_CC features.
module_cc_features = ["NT_CC_PRED", "NO_CC_PRED", "T_CC_PRED", "T_Line_P_PRED"]
X, Y = df[module_cc_features], df[c.MODULE_CC]
df["MODULE_CC_PRED"] = predictions = predict_effort(df, X, Y)

In [44]:
# Cross-validated prediction of c.MODULE_EC from the forecasted *_EC features.
module_ec_features = ["NT_EC_PRED", "NO_EC_PRED", "T_EC_PRED", "T_Line_P_PRED"]
X, Y = df[module_ec_features], df[c.MODULE_EC]
df["MODULE_EC_PRED"] = predictions = predict_effort(df, X, Y)

In [45]:
# Cross-validated prediction of c.LINE_CC from the forecasted *_CC features.
line_cc_features = ["NT_CC_PRED", "NO_CC_PRED", "T_CC_PRED", "T_Line_P_PRED"]
X, Y = df[line_cc_features], df[c.LINE_CC]
df["LINE_CC_PRED"] = predictions = predict_effort(df, X, Y)

In [46]:
# Cross-validated prediction of c.LINE_EC from the forecasted *_EC features.
line_ec_features = ["NT_EC_PRED", "NO_EC_PRED", "T_EC_PRED", "T_Line_P_PRED"]
X, Y = df[line_ec_features], df[c.LINE_EC]
df["LINE_EC_PRED"] = predictions = predict_effort(df, X, Y)

In [47]:
# df.head()

In [48]:
# Calendar year of each record (c.DATE was parsed with pd.to_datetime earlier),
# taken with the vectorised .dt accessor instead of a per-row lambda.
df['Year'] = df[c.DATE].dt.year

# Yearly totals of the observed values next to their cross-validated predictions.
results = pd.pivot_table(
    df,
    index=["Year"],
    values=[c.MODULE_CC, c.MODULE_EC, c.LINE_CC, c.LINE_EC, "MODULE_CC_PRED", "MODULE_EC_PRED", "LINE_CC_PRED", "LINE_EC_PRED"],
    aggfunc="sum",
)

In [49]:
# Display the first rows of the per-year sums of observed and predicted values.
results.head()

Unnamed: 0_level_0,LINE_CC_PRED,LINE_EC_PRED,Line_CC,Line_EC,MODULE_CC_PRED,MODULE_EC_PRED,Module_CC,Module_EC
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015,54.0,0.0,5555.0,352.0,4.0,0.0,209.0,40.0
2016,497964900.0,520672193.0,397368225.0,472909023.0,10975766.5,12702205.0,11471523.0,10301438.0
2017,255604.0,628640.0,21057.0,95583.0,7582.0,10985.0,1010.0,2784.0
2018,493361.0,34380.0,21516.0,170873.0,12694.0,2070.0,338.0,4485.0
2019,58436.0,15280.0,861.0,707.0,1936.0,920.0,41.0,47.0
