## Estimate Return on Investment for Recommended Content

Estimates how completing additional recommended content in the current month would have changed the prediction for the following month. It adds five columns to the existing predictions from the random_forest_time script: one for completing 1 recommended activity, a second for 2 activities, and so on up to 5.

## Load Dependencies

In [2]:
import subscript.config as cn
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import preprocessing
from sklearn.model_selection import ShuffleSplit
from sklearn import metrics
import pickle
import joblib
import matplotlib.pyplot as plt

## Read in the original test set

In [3]:
# Load the test set stored under the random_forest_time output directory.
test_set_path = os.path.join(
    cn.clean_dir, 'random_forest_time', 'whole_test_set.csv'
)
df = pd.read_csv(test_set_path)

# Independent copy of the frame as read from disk; the prediction columns
# are attached to this copy so the exported file keeps the original values.
dfo = df.copy()


## Read in the trained time series model

In [4]:
# Despite the name, `features` lists the columns that are dropped from the
# frame before it is passed to the model (it includes the `engagement`
# target). Presumably the rest are identifiers/metadata -- confirm against
# the training script.
features = [
    'player', 'realm', 'last_login', 'time_since_login',
    'engagement', 'status',
]

# NOTE(review): joblib.load unpickles the file, so only load model artifacts
# from a trusted location.
model = joblib.load(
    os.path.join(cn.clean_dir, 'random_forest_time', 'final_time_model.sav')
)



## Prepare the test set

In [5]:
# Target column, kept so it can be written next to the predictions.
y = df['engagement']

# Model input: every column not listed in `features`, with missing values
# replaced by 0.
X = df.drop(columns=features).fillna(0)

## Add the actual and predicted values for June 2020 to the original dataset

In [6]:
# Baseline prediction on the unmodified feature matrix, stored in the output
# frame alongside the observed target ('pred' first, then 'actual').
pred = model.predict(X)
dfo['pred'], dfo['actual'] = pred, y

# Columns of interest when inspecting the baseline prediction.
print_cols = [*features, 'pred']

## Repeat the predictions as if the player had completed 1-5 recommended activities in May 2020

In [7]:
# Counterfactual predictions: re-score the test set as if each player had
# completed 1..5 additional activities in the '2020-05' column, storing the
# results in the output columns 'pred1' .. 'pred5'.
#
# The scenarios are built from the same NaN-filled matrix as the baseline
# prediction (fillna(0), then drop the non-feature columns). The previous
# version predicted on the raw frame without fillna(0), so any missing value
# made the scenario inputs differ from the baseline inputs (or made predict
# fail on NaN), and it incremented df['2020-05'] in place on every pass.
# Here `df` is left unmodified and the only difference between 'pred' and
# 'pred<i>' is the i added activities.
base_inputs = df.fillna(0).drop(features, axis=1)
may_baseline = base_inputs['2020-05'].astype(float)

for i in range(1, 6):
    # assign() returns a new frame and keeps '2020-05' in its original
    # position, so the column order seen by the model does not change.
    scenario = base_inputs.assign(**{'2020-05': may_baseline + i})
    dfo['pred' + str(i)] = model.predict(scenario)

# Persist the original rows plus baseline and counterfactual predictions.
dfo.to_csv(os.path.join(cn.clean_dir, 'random_forest_time',
                        'test_predictions.csv'), index=False)