In [1]:
import itertools
import pickle
import warnings
from itertools import product

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as scs
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
from dateutil.relativedelta import relativedelta
from scipy.optimize import minimize
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestClassifier,  # not used by the cells visible here; kept in scope
    RandomForestRegressor,
    VotingRegressor,
)
from sklearn.linear_model import ElasticNet, LassoCV, RidgeCV
from sklearn.model_selection import TimeSeriesSplit
# Time-ordered 5-split cross-validator; passed as `cv=` to the RidgeCV / LassoCV
# estimators built in the modelling cells below.
tscv = TimeSeriesSplit(n_splits=5)

In [2]:
def preprocessing_data(lag_range, data, weekend_days=(4, 5)):
    """Build a lag-feature matrix and target vector from a date-indexed series.

    Parameters
    ----------
    lag_range : sequence of two ints
        ``(first_lag, stop_lag)``. One column ``lag_k`` is created for every
        ``k`` in ``range(first_lag, stop_lag)``; the upper bound is exclusive,
        so ``[7, 17]`` yields ``lag_7`` ... ``lag_16``.
    data : pandas.Series or single-column pandas.DataFrame
        Observations whose index can be converted with ``pd.to_datetime``.
        The caller's object is not modified.
    weekend_days : iterable of int, optional
        ``DatetimeIndex.weekday`` codes (Monday=0 ... Sunday=6) that are
        flagged in the ``is_weekend*`` columns. The default ``(4, 5)`` keeps
        the codes this function has always used.
        NOTE(review): 4 and 5 are Friday and Saturday; confirm that this, and
        not Saturday/Sunday ``(5, 6)``, is the intended weekend definition.

    Returns
    -------
    X : pandas.DataFrame
        Columns ``lag_*``, ``weekday``, ``is_weekend``, ``weekday_to_predict``
        and ``is_weekend_to_predict`` for every row without missing values.
    y : pandas.Series
        The ``y`` column for the same rows as ``X``.

    Warns
    -----
    UserWarning
        When consecutive index entries are not exactly one day apart. The lag
        columns are shifted by row position while ``weekday_to_predict`` is
        shifted by calendar days, so the two only agree on a gap-free,
        duplicate-free, sorted daily index.
    """
    first_lag, stop_lag = lag_range[0], lag_range[1]
    weekend_days = list(weekend_days)

    # Explicit copy so renaming the column / replacing the index below can
    # never leak back into the object the caller passed in.
    p_data = pd.DataFrame(data).copy()
    p_data.columns = ['y']
    p_data.index = pd.to_datetime(p_data.index)

    day_steps = p_data.index.to_series().diff().iloc[1:]
    if not (day_steps == pd.Timedelta(days=1)).all():
        warnings.warn(
            "preprocessing_data: rows are not one per consecutive day "
            "(duplicate, missing or unsorted dates); lag_* columns are "
            "shifted by row position and will not match calendar offsets.",
            stacklevel=2,
        )

    # Positional shifts: lag_k holds the value k rows earlier.
    for lag in range(first_lag, stop_lag):
        p_data['lag_{}'.format(lag)] = p_data['y'].shift(lag)

    # Calendar features of the row's own date.
    p_data['weekday'] = p_data.index.weekday
    p_data['is_weekend'] = p_data.index.weekday.isin(weekend_days) * 1

    # Calendar features of the date `first_lag` days after the row's date.
    # When first_lag is a multiple of 7 these repeat the two columns above.
    p_data['weekday_to_predict'] = p_data.index.shift(periods=first_lag, freq='D').weekday
    p_data['is_weekend_to_predict'] = p_data['weekday_to_predict'].isin(weekend_days) * 1

    # The first `stop_lag - 1` rows have at least one undefined lag; drop them
    # once and split the remainder into features and target.
    complete = p_data.dropna()
    X = complete.drop(['y'], axis=1)
    y = complete['y']

    return X, y

# Global data

In [3]:
# Global uninstall counts; the first spreadsheet column is parsed into the
# DatetimeIndex. Alternative input kept for reference:
# 'gl_uninstall_20190904.xlsx'.
df = pd.read_excel(
    'gl_uninstall_20190831.xlsx',
    index_col=0,
    parse_dates=True,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 608 entries, 2018-01-01 to 2019-08-31
Data columns (total 1 columns):
uninstall    608 non-null int64
dtypes: int64(1)
memory usage: 9.5 KB


In [4]:
# Single 'uninstall' column as a Series; this is what gets passed to
# preprocessing_data as the series to model.
data = df['uninstall']

## --Global 7 days

In [5]:
# Lags 7..16: preprocessing_data iterates range(lag_range[0], lag_range[1]),
# so the upper bound is exclusive.
lag_range = [7, 17]
g_7days_X, g_7days_y = preprocessing_data(lag_range=lag_range, data=data)

In [6]:
# Weighted voting ensemble fitted on the global 7-day feature set.
# All estimator classes come from the import cell at the top of the notebook.
#
# NOTE(review): the saved output of this cell ends in a truncated warning
# ("positive)"), which looks like scikit-learn's coordinate-descent
# ConvergenceWarning -- confirm, and consider scaling the features or raising
# max_iter on the Lasso / ElasticNet members.
reg1 = RidgeCV(alphas=[0.3, 0.5, 0.7, 0.9], cv=tscv)
# NOTE(review): n_alphas=1 appears to leave a single candidate alpha, so the
# cross-validation has nothing to choose between -- confirm this is intended.
reg2 = LassoCV(eps=0.01, n_alphas=1, cv=tscv)
reg3 = GradientBoostingRegressor(random_state=1)
reg4 = ElasticNet(alpha=0.4, l1_ratio=0.8)
# Seeded like the gradient-boosting member so that re-running the cell
# reproduces the model that is pickled afterwards.
reg5 = RandomForestRegressor(
    max_features=0.7,
    min_samples_leaf=3,
    n_estimators=2000,
    bootstrap=True,
    random_state=1,
)

vr = VotingRegressor(
    estimators=[('ridge', reg1), ('lasso', reg2), ('gbr', reg3), ('elastic', reg4), ('randomf', reg5)],
    weights=[2, 1, 1, 1, 3],
)
vr = vr.fit(g_7days_X, g_7days_y)

  positive)


In [8]:
# Persist the fitted global 7-day ensemble (`pickle` is imported in the
# notebook's top import cell).
# `vr` is rebound by the "Global 5 days" cell further down, so this cell must
# run directly after the 7-day fit or the wrong model ends up in gl_7.pkl.
with open('gl_7.pkl', 'wb') as handle:
    pickle.dump(vr, handle, protocol=pickle.HIGHEST_PROTOCOL)

## --Global 5 days

In [9]:
# Lags 5..14: preprocessing_data iterates range(lag_range[0], lag_range[1]),
# so the upper bound is exclusive.
lag_range = [5, 15]
g_5days_X, g_5days_y = preprocessing_data(lag_range=lag_range, data=data)

In [10]:
# Weighted voting ensemble fitted on the global 5-day feature set.
# All estimator classes come from the import cell at the top of the notebook.
#
# NOTE(review): the saved output of this cell ends in a truncated warning
# ("positive)"), which looks like scikit-learn's coordinate-descent
# ConvergenceWarning -- confirm, and consider scaling the features or raising
# max_iter on the Lasso / ElasticNet members.
reg1 = RidgeCV(alphas=[0.3, 0.5, 0.7, 0.9], cv=tscv)
# NOTE(review): n_alphas=1 appears to leave a single candidate alpha, so the
# cross-validation has nothing to choose between -- confirm this is intended.
reg2 = LassoCV(eps=0.01, n_alphas=1, cv=tscv)
reg3 = GradientBoostingRegressor(random_state=1)
reg4 = ElasticNet(alpha=0.4, l1_ratio=0.8)
# Seeded like the gradient-boosting member so that re-running the cell
# reproduces the model that is pickled afterwards.
reg5 = RandomForestRegressor(
    max_features=0.7,
    min_samples_leaf=3,
    n_estimators=2000,
    bootstrap=True,
    random_state=1,
)

vr = VotingRegressor(
    estimators=[('ridge', reg1), ('lasso', reg2), ('gbr', reg3), ('elastic', reg4), ('randomf', reg5)],
    weights=[0.15, 0.35, 0.10, 0.3, 0.10],
)
vr = vr.fit(g_5days_X, g_5days_y)

  positive)


In [11]:
# Persist the global 5-day ensemble, i.e. whatever `vr` currently refers to.
with open('gl_5.pkl', 'wb') as handle:
    pickle.dump(vr, handle, protocol=pickle.HIGHEST_PROTOCOL)

# US data

In [12]:
# US uninstall counts; the first spreadsheet column is parsed into the
# DatetimeIndex.
# NOTE(review): the recorded output reports 699 rows for 2018-01-01..2019-10-30,
# a span of only 668 calendar days that also runs past the date in the file
# name -- check for duplicated or mis-parsed dates, because the lag features
# are built by row position.
us_df = pd.read_excel(
    'us_uninstall_20190831.xlsx',
    index_col=0,
    parse_dates=True,
)
us_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 699 entries, 2018-01-01 to 2019-10-30
Data columns (total 1 columns):
us    699 non-null int64
dtypes: int64(1)
memory usage: 10.9 KB


In [13]:
# Single 'us' column as a Series; this is what gets passed to
# preprocessing_data as the series to model.
us_data = us_df['us']

## --US 7 days

In [16]:
# Lags 7..16: preprocessing_data iterates range(lag_range[0], lag_range[1]),
# so the upper bound is exclusive.
lag_range = [7, 17]
X_us_7, y_us_7 = preprocessing_data(lag_range=lag_range, data=us_data)

In [17]:
# Voting ensemble for the US 7-day feature set; no `weights=` is given, unlike
# the global ensembles. All estimator classes come from the import cell at the
# top of the notebook.
r1 = RidgeCV(cv=tscv)
r2 = LassoCV(cv=tscv)
r3 = GradientBoostingRegressor(random_state=1)

vr_us_7 = VotingRegressor(
    estimators=[('ridge', r1), ('lasso', r2), ('gbr', r3)],
)
vr_us_7 = vr_us_7.fit(X_us_7, y_us_7)

In [18]:
# Persist the fitted US 7-day ensemble.
with open('us_7.pkl', 'wb') as handle:
    pickle.dump(vr_us_7, handle, protocol=pickle.HIGHEST_PROTOCOL)

## --US 5 days

In [22]:
# Lags 5..14: preprocessing_data iterates range(lag_range[0], lag_range[1]),
# so the upper bound is exclusive.
lag_range = [5, 15]
X_us_5, y_us_5 = preprocessing_data(lag_range=lag_range, data=us_data)

In [23]:
# Voting ensemble for the US 5-day feature set: ridge, lasso and gradient
# boosting members, with no `weights=` given.
r1 = RidgeCV(cv=tscv)
r2 = LassoCV(cv=tscv)
r3 = GradientBoostingRegressor(random_state=1)

vr_us_5 = VotingRegressor(
    estimators=[
        ('ridge', r1),
        ('lasso', r2),
        ('gbr', r3),
    ]
)
vr_us_5 = vr_us_5.fit(X_us_5, y_us_5)

In [24]:
# Persist the fitted US 5-day ensemble.
with open('us_5.pkl', 'wb') as handle:
    pickle.dump(vr_us_5, handle, protocol=pickle.HIGHEST_PROTOCOL)