In [1]:
import os as os
import sys

import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import Pipeline
from fbprophet import Prophet
from fbprophet.diagnostics import performance_metrics
from fbprophet.diagnostics import cross_validation
from fbprophet.plot import plot_cross_validation_metric

# Project-local modules live four directories up; extend the search path
# before importing them.
sys.path.append('../../../../')
import Constants as c
import Utilities as utils
from Scaler import Scaler

%matplotlib inline
sns.set_style("darkgrid")

In [2]:
# train effort model
# validate effort model
# train forecasting model on 5 years of data
# test on 1 year of data
# calculate ROI for that 1 year (observed data)
# calculate ROI for that 1 year (forecasted data)
# compare

In [3]:
def forecast_variable(df, variable, initial='1825 days', period='365 days', horizon='365 days'):
    """Produce cross-validated Prophet forecasts for one column of ``df``.

    A linear-growth Prophet model is fitted on the ``c.DATE`` / ``variable``
    pair and then run through Prophet's ``cross_validation`` diagnostic.
    Only the point forecast (``yhat``) of each cross-validation row is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``c.DATE`` column and ``variable``.
    variable : str
        Name of the column to forecast.
    initial : str, optional
        Size of the initial training window handed to ``cross_validation``.
        Defaults to ``'1825 days'``.
    period : str, optional
        Spacing between cross-validation cutoffs. Defaults to ``'365 days'``.
    horizon : str, optional
        Forecast horizon evaluated after each cutoff. Defaults to
        ``'365 days'``.

    Returns
    -------
    pandas.DataFrame
        Two columns named ``c.DATE`` and ``variable``; the latter holds the
        forecasted (not the observed) values.
    """
    # Prophet expects the history in columns named exactly 'ds' and 'y'.
    # Work on an explicit copy so the caller's frame is never touched.
    history = df[[c.DATE, variable]].copy()
    history.columns = ['ds', 'y']

    model = Prophet(uncertainty_samples=0, growth='linear')
    model.fit(history)

    cv_result = cross_validation(model, initial=initial, period=period, horizon=horizon)

    # Keep only date + point forecast and map them back to the project's names.
    forecast = cv_result[['ds', 'yhat']].copy()
    forecast.columns = [c.DATE, variable]
    return forecast

In [4]:
# Candidate estimators, keyed by class name.
regressors = {
  "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0, max_depth=10, min_samples_split=10, max_features="sqrt"),
  "RandomForestRegressor": RandomForestRegressor(random_state=0, max_depth=10, min_samples_split=10, max_features="sqrt", n_estimators=10),
  "LinearRegression": LinearRegression(),
  "RidgeCV": RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1])
}

# Candidate scalers, keyed by class name. The selected transformer is used
# both as the feature scaler and as the *target* transformer of
# TransformedTargetRegressor, so every entry must be invertible.
transformers = {
  "RobustScaler": RobustScaler(),
  "StandardScaler": StandardScaler(),
  "MinMaxScaler": MinMaxScaler(),
  "QuantileTransformer": QuantileTransformer(),
  # Without inverse_func the inverse transform is the identity, which would
  # leave target predictions in log1p space; expm1 maps them back.
  "FunctionTransformer": FunctionTransformer(np.log1p, inverse_func=np.expm1)
}

# Estimator / scaler pair used by the model cells below.
regressor = regressors["DecisionTreeRegressor"]
transformer = transformers["QuantileTransformer"]

In [5]:
# Where the exported datasets live, and which project / task to load.
directoryPath = "../../../../exports"
project_name = "angular.js"
task = "BUG"

# Path template: <exports dir>/<project>/<project>_dataset_<task>.csv
file = "{0}/{1}/{2}_dataset_{3}.csv"
dataset_path = file.format(directoryPath, project_name, project_name, task)

df = pd.read_csv(dataset_path)

In [6]:
# Parse the date column into real timestamps.
df[c.DATE] = pd.to_datetime(df[c.DATE])

# Add copies of T_MODULE / T_LINE shifted down by one row.
for lagged_column, source_column in ((c.T_MODULE_P, c.T_MODULE), (c.T_LINE_P, c.T_LINE)):
    df[lagged_column] = df[source_column].shift()

# Zero-fill any missing values in place (shift() leaves a NaN in its first row).
has_missing = df.isna().any().any()
if has_missing:
    df.fillna(0, inplace=True)

In [7]:
# One cross-validated forecast frame per column, produced in the order listed.

# Core Contributors (plus the shifted T_LINE column)
df_nt_cc, df_no_cc, df_t_cc, df_t_line_p = [
    forecast_variable(df, column)
    for column in (c.NT_CC, c.NO_CC, c.T_CC, c.T_LINE_P)
]

# External Contributors
df_nt_ec, df_no_ec, df_t_ec = [
    forecast_variable(df, column)
    for column in (c.NT_EC, c.NO_EC, c.T_EC)
]

INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Making 4 forecasts with cutoffs between 2015-11-20 00:00:00 and 2018-11-19 00:00:00
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Making 4 forecasts with cutoffs between 2015-11-20 00:00:00 and 2018-11-19 00:00:00
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Making 4 forecasts with cutoffs between 2015-11-20 00:00:00 and 2018-11-19 00:00:00
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Making 4 forecasts with cutoffs between 2015-11-20 00:00:00 and 2018-11-19 00:00:00
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Making 4 forecasts with cutoffs between 2015-11-20 00:00:00 and 2018-11-19 00:0

In [8]:
# Attach every forecast to the observed frame by date. A forecast column that
# shares its name with an observed column receives the '_PRED' suffix, and
# the inner join keeps only dates for which a forecast exists.
forecast_frames = (
    df_nt_cc,
    df_no_cc,
    df_t_cc,
    df_t_line_p,
    df_nt_ec,
    df_no_ec,
    df_t_ec,
)
for forecast_frame in forecast_frames:
    df = df.join(forecast_frame.set_index(c.DATE), on=c.DATE, rsuffix='_PRED', how='inner')

In [9]:
# Keep the date, the observed columns and the forecasted ('_PRED') drivers.
model_columns = [
    c.DATE,
    c.MODULE_CC,
    c.MODULE_EC,
    c.NT_CC,
    "NT_CC_PRED",
    "NO_CC_PRED",
    "T_CC_PRED",
    "NT_EC_PRED",
    "NO_EC_PRED",
    "T_EC_PRED",
    "T_Line_P_PRED",
]
df = df[model_columns]
df.head()

Unnamed: 0,Date,Module_CC,Module_EC,NT_CC,NT_CC_PRED,NO_CC_PRED,T_CC_PRED,NT_EC_PRED,NO_EC_PRED,T_EC_PRED,T_Line_P_PRED
155,2015-12-09,209.0,40.0,109.0,238.661717,483.746395,9.451922,80.382702,355.086716,5.081087,427316.979243
156,2016-01-15,78.0,60.0,43.0,71.185603,183.255222,6.506013,50.669099,132.854525,3.668762,432571.832739
157,2016-01-21,243.0,88.0,128.0,15.340032,17.017543,4.98371,-1.227262,12.885198,1.85268,441421.65007
158,2016-01-28,343.0,106.0,178.0,24.419402,27.320723,5.074618,3.402674,24.979169,1.784622,444964.704107
159,2016-02-05,8.0,38.0,5.0,85.64131,188.051982,6.814857,54.382125,146.591886,3.661353,438023.987957


In [10]:
# Core-contributor model: forecasted drivers -> observed c.MODULE_CC.
cc_feature_columns = ["NT_CC_PRED", "NO_CC_PRED", "T_CC_PRED", "T_Line_P_PRED"]
X = df[cc_feature_columns]
Y = df[c.MODULE_CC]

# Use 10 folds, or one fold per row when there are 10 rows or fewer.
num_records = len(X)
splits = min(10, num_records)
kfold = model_selection.KFold(n_splits=splits)

# The same transformer scales the features (inside the pipeline) and the target.
pipeline = Pipeline([('scaler', transformer), ('predictor', regressor)])
model_cc = TransformedTargetRegressor(regressor=pipeline, transformer=transformer)
model_cc.fit(X, Y)

# Cross-validated predictions, stored next to the observed values.
predictions_cc = cross_val_predict(model_cc, X, Y, cv=kfold)
df["MODULE_CC_PRED"] = predictions_cc

In [11]:
# External-contributor model: forecasted drivers -> observed c.MODULE_EC.
ec_feature_columns = ["NT_EC_PRED", "NO_EC_PRED", "T_EC_PRED", "T_Line_P_PRED"]
X = df[ec_feature_columns]
Y = df[c.MODULE_EC]

# Use 10 folds, or one fold per row when there are 10 rows or fewer.
num_records = len(X)
splits = min(10, num_records)
kfold = model_selection.KFold(n_splits=splits)

# The same transformer scales the features (inside the pipeline) and the target.
pipeline = Pipeline([('scaler', transformer), ('predictor', regressor)])
model_ec = TransformedTargetRegressor(regressor=pipeline, transformer=transformer)
model_ec.fit(X, Y)

# Cross-validated predictions, stored next to the observed values.
predictions_ec = cross_val_predict(model_ec, X, Y, cv=kfold)
df["MODULE_EC_PRED"] = predictions_ec

In [12]:
# Preview the first rows, now including both MODULE_*_PRED columns.
df.head(n=5)

Unnamed: 0,Date,Module_CC,Module_EC,NT_CC,NT_CC_PRED,NO_CC_PRED,T_CC_PRED,NT_EC_PRED,NO_EC_PRED,T_EC_PRED,T_Line_P_PRED,MODULE_CC_PRED,MODULE_EC_PRED
155,2015-12-09,209.0,40.0,109.0,238.661717,483.746395,9.451922,80.382702,355.086716,5.081087,427316.979243,1594.0,350.0
156,2016-01-15,78.0,60.0,43.0,71.185603,183.255222,6.506013,50.669099,132.854525,3.668762,432571.832739,522.0,350.0
157,2016-01-21,243.0,88.0,128.0,15.340032,17.017543,4.98371,-1.227262,12.885198,1.85268,441421.65007,522.0,350.0
158,2016-01-28,343.0,106.0,178.0,24.419402,27.320723,5.074618,3.402674,24.979169,1.784622,444964.704107,522.0,350.0
159,2016-02-05,8.0,38.0,5.0,85.64131,188.051982,6.814857,54.382125,146.591886,3.661353,438023.987957,522.0,350.0
