In [36]:
import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
import os as os
sys.path.append('../../')
import Constants as c
import Utilities as utils
from Scaler import Scaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
%matplotlib inline
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from fbprophet import Prophet
from fbprophet.diagnostics import performance_metrics
from fbprophet.diagnostics import cross_validation
from fbprophet.plot import plot_cross_validation_metric
sns.set_style("darkgrid")

In [37]:
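# Experiment plan:
#   1. Train the effort model.
#   2. Validate the effort model.
#   3. Train the forecasting model on 5 years of data.
#   4. Test the forecasting model on 1 year of data.
#   5. Calculate the ROI for that 1 year using the observed data.
#   6. Calculate the ROI for that 1 year using the forecasted data.
#   7. Compare the observed and forecasted results.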

In [38]:
# Candidate estimators, keyed by class name so the experiment can be switched
# by changing the single lookup further down. The tree-based models are
# seeded (random_state=0) so repeated runs give the same result.
regressors = {
  "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0, max_depth=10, min_samples_split=10, max_features="sqrt"),
  "RandomForestRegressor": RandomForestRegressor(random_state=0, max_depth=10, min_samples_split=10, max_features="sqrt", n_estimators=10),
  "LinearRegression": LinearRegression(),
  "RidgeCV": RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1])
}

# Candidate scalers, keyed the same way. The selected one is used both as the
# feature scaler inside the Pipeline and as the target transformer of the
# TransformedTargetRegressor, so every entry must be invertible.
transformers = {
  "RobustScaler": RobustScaler(),
  "StandardScaler": StandardScaler(),
  "MinMaxScaler": MinMaxScaler(),
  # Seeded for consistency with the regressors above (the seed only matters
  # when the transformer subsamples its input).
  "QuantileTransformer": QuantileTransformer(random_state=0),
  # log1p must be paired with its inverse (expm1). Without inverse_func the
  # inverse transform is the identity, so predictions coming back through
  # TransformedTargetRegressor would remain on the log scale.
  "FunctionTransformer": FunctionTransformer(np.log1p, inverse_func=np.expm1)
}

# Active configuration for the model cells below.
regressor = regressors["DecisionTreeRegressor"]
transformer = transformers["QuantileTransformer"]
# df = utils.remove_outlier(df, c.LINE)
# df = utils.remove_outlier(df, c.MODULE)


In [39]:
# Dataset selection: which project and which task type to analyse.
project_name = "angular.js"
task = "BUG"

# Exported datasets are laid out as
#   <directoryPath>/<project_name>/<project_name>_dataset_<task>.csv
directoryPath = "../../exports"
file = "{0}/{1}/{2}_dataset_{3}.csv"

# Resolve the concrete CSV path first, then load it.
dataset_path = file.format(directoryPath, project_name, project_name, task)
df = pd.read_csv(dataset_path)
# df = utils.isRegularVersion(df)

In [40]:
# Parse the date column into datetime values.
df[c.DATE] = pd.to_datetime(df[c.DATE])

# Lagged copies of the totals: every row receives the value of the row
# before it (shift by one), leaving the first row without a value.
for lagged_col, source_col in ((c.T_MODULE_P, c.T_MODULE), (c.T_LINE_P, c.T_LINE)):
    df[lagged_col] = df[source_col].shift()

# df[c.DATE_P] = df[c.DATE].shift()
# df[c.DATE_P].fillna(df[c.DATE].min(), inplace=True)

# Replace every missing value in the frame (including the gaps introduced by
# the shifts above) with 0, but only touch the frame if something is missing.
has_missing = df.isna().values.any()
if has_missing:
    df.fillna(0, inplace=True)

In [41]:
# Effort model for the CC series: CC features -> CC module count.
X = df[[c.NT_CC, c.NO_CC, c.T_LINE_P, c.T_CC]]
# The target has to be the CC column to match the CC features above and the
# model_cc / predictions_cc names below. The cell previously used
# c.MODULE_EC, which made it fit the EC target from CC features.
# NOTE(review): inferred from the naming in this notebook -- confirm intent.
Y = df[c.MODULE_CC]

# Use 10 folds, or one fold per record when the dataset has 10 rows or fewer.
num_records = len(X)
splits = min(10, num_records)

# The selected transformer scales the features inside the pipeline and is
# also applied to the target by TransformedTargetRegressor.
pipeline = Pipeline(steps=[('scaler', transformer), ('predictor', regressor)])
model_cc = TransformedTargetRegressor(regressor=pipeline, transformer=transformer)

# Fit on the full dataset so model_cc is usable after this cell.
model_cc.fit(X, Y)

# Out-of-fold predictions over unshuffled K-fold splits.
kfold = KFold(n_splits=splits)
predictions_cc = cross_val_predict(model_cc, X, Y, cv=kfold)
# df["MODULE_CC_PRED"] = predictions_cc
# scores = cross_val_score(model_cc, X, Y, cv=splits, scoring=utils.pred_25_scorer)
# print(scores)  

In [42]:
# Effort model for the EC series: EC features -> EC module count.
ec_feature_columns = [c.NT_EC, c.NO_EC, c.T_LINE_P, c.T_EC]
X = df[ec_feature_columns]
Y = df[c.MODULE_EC]

# Use 10 folds, or one fold per record when the dataset has 10 rows or fewer.
num_records = len(X)
splits = min(10, num_records)

# The selected transformer scales the features inside the pipeline and is
# also applied to the target by TransformedTargetRegressor.
pipeline = Pipeline(steps=[('scaler', transformer), ('predictor', regressor)])
model_ec = TransformedTargetRegressor(regressor=pipeline, transformer=transformer)

# Fit on the full dataset so model_ec is usable after this cell.
model_ec.fit(X, Y)

# Out-of-fold predictions over unshuffled K-fold splits.
kfold = KFold(n_splits=splits)
predictions_ec = cross_val_predict(model_ec, X, Y, cv=kfold)
# df["MODULE_EC_PRED"] = predictions_ec
# scores = cross_val_score(model_ec, X, Y, cv=splits, scoring=utils.pred_25_scorer)
# print(scores)  

In [43]:
# Prophet expects a two-column frame named 'ds' (timestamp) and 'y' (value).
nt_cc = df[[c.DATE, c.NT_CC]].rename(columns={c.DATE: 'ds', c.NT_CC: 'y'})

# Linear-growth model; uncertainty sampling is switched off.
m_nt_cc = Prophet(growth='linear', uncertainty_samples=0)
m_nt_cc.fit(nt_cc)

# Rolling-origin evaluation: 1825 days of initial training data, then a
# 365-day forecast horizon with a new cutoff every 365 days.
m_nt_cc_cv = cross_validation(
    m_nt_cc,
    initial='1825 days',
    period='365 days',
    horizon='365 days',
)
# m_nt_cc_cv.head()

# Keep only the timestamp and the forecast, renamed back to the dataset's
# own column names so the result can be joined onto df.
m_nt_cc_cv = m_nt_cc_cv[['ds', 'yhat']].rename(columns={'ds': c.DATE, 'yhat': c.NT_CC})
m_nt_cc_cv.columns 

INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Making 4 forecasts with cutoffs between 2015-11-20 00:00:00 and 2018-11-19 00:00:00


Index(['Date', 'NT_CC'], dtype='object')

In [44]:
# Attach the forecast to the observed data. The forecast column has the same
# name as the observed one (c.NT_CC), so the join stores it under the
# '_PRED' suffix; the inner join keeps only dates that have a forecast.
nt_cc_pred_column = c.NT_CC + '_PRED'
df = (
    df
    # Re-running this cell would otherwise collide with the prediction column
    # left by the previous run; dropping it first makes the cell idempotent
    # (errors='ignore' turns this into a no-op on the first run).
    .drop(columns=[nt_cc_pred_column], errors='ignore')
    .join(m_nt_cc_cv.set_index(c.DATE), on=c.DATE, rsuffix='_PRED', how='inner')
)

In [45]:
# df[[c.DATE, c.MODULE_CC, c.NT_CC, "NT_CC_PRED"]]

In [46]:
# m_nt_cc_p = performance_metrics(m_nt_cc_cv)
# m_nt_cc_p.head()

In [47]:
# plot_cross_validation_metric(m_nt_cc_cv, metric='mdape')