In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge

import os
imgdir = 'output_images'
if not os.path.isdir(imgdir):
    os.mkdir(imgdir)

brands = ['Gummies', 'Sugar Buns', 'Sparkle Pop', 'Candys', 'Garlic Bombs', 'Snackmonds', 'Caffeinos', 'moon drops']
spend_types = ['Trade', 'Search', 'Display', 'TV', 'Radio']
num_brands, num_types = len(brands), len(spend_types)

In [2]:
np.random.seed(1)

In [3]:
import tqdm
try:
    tqdm_disp = tqdm.tqdm_notebook if get_ipython(
    ).__class__.__name__ == 'ZMQInteractiveShell' else tqdm.tqdm
except NameError:
    tqdm_disp = tqdm.tqdm

In [4]:
num_promos = 40000
start_date = pd.to_datetime('2016-01-01')
end_date = pd.to_datetime('2018-06-01')
dates = pd.date_range(start_date, end_date)
num_dates = len(dates)

In [5]:
promo = pd.DataFrame(
    {'Brand':np.random.choice(brands, num_promos),
     'Type':np.random.choice(spend_types, num_promos), 
     'Spend':np.random.rand(num_promos) * 500,
     'date' : np.random.choice(dates, num_promos)})
promo = promo.groupby(['Brand', 'Type', 'date'])['Spend'].sum().to_frame()
promo = promo.unstack(fill_value=0).stack()
promo.shape

(35320, 1)

In [6]:
promo.iloc[np.random.choice(promo.shape[0], (promo.shape[0]*99)//100, replace=False)] = 0

In [7]:
initial_promo = promo.copy()

In [8]:
promo.shape, promo.rolling(40).mean().shape

((35320, 1), (35320, 1))

In [9]:
# for _ in range(10):
#     promo.iloc[10:] = promo.iloc[10:] + promo.iloc[:-10]

In [11]:
promo.shape

(35320, 1)

In [10]:

# promo = pd.concat([promo, initial_promo], axis=1).min(1).to_frame()
promo = promo.rolling(10).max().fillna(0)
promo = promo + np.random.rand(promo.shape[0]) * 20

ValueError: Unable to coerce to Series, length must be 1: given 35320

In [None]:
promo

In [None]:
assert(promo.shape[0] == num_brands * num_types * num_dates)

In [None]:
num_brands * num_types * num_dates

In [None]:
true_baselines = {brand: [np.random.randn() * 100 + 1000] for brand in brands}

In [None]:
spend_rois = {t : [np.random.rand()+.8] for t in spend_types}
brand_rois = {t : [np.random.rand()/3] for t in brands}

In [None]:
brand_spend_combos = pd.DataFrame(
    data=np.random.rand(num_brands, num_types) - .5, index=brands, columns = spend_types)

In [None]:
brand_spend_combos  = brand_spend_combos + \
    pd.DataFrame(brand_rois, index=spend_rois).T + pd.DataFrame(spend_rois, index = brands)

In [None]:
brand_spend_combos

In [None]:
promo2 = promo.unstack(1)

In [None]:
promo2.columns = promo2.columns.droplevel(0)
promo2 = promo2.reset_index(1)

In [None]:
effect = promo2.copy()
effect[effect.columns[1:]] = effect[effect.columns[1:]] * brand_spend_combos
effect = effect.reset_index().set_index(['Brand', 'date'])
effect = effect.sum(1).to_frame(name='spend').unstack(0)
effect.columns = effect.columns.droplevel(0)

In [None]:
effect.head()

In [None]:
salesnoise = pd.DataFrame(data = np.random.rand(num_dates, num_brands) * (-30), columns= brands, index=dates)
salesnoise.head()

In [None]:
seasonal_effects = pd.DataFrame(
    {brand: {'cos':np.random.randn() * 300, 'sin':np.random.randn()*300} for brand in brands})

In [None]:
seasonal_effects

In [None]:
seasonal_effects.T

In [None]:
brand_spend_combos

In [None]:
brand_spend_combos_with_seasonality = brand_spend_combos.join(seasonal_effects.T)

In [None]:
seasonal = salesnoise.index.dayofyear

seasonal_sine, seasonal_cos = np.sin(seasonal*2*np.pi/365), np.cos(seasonal*2*np.pi/365)

In [None]:
seasonal_effects = seasonal_effects[effect.columns]

In [None]:
seasonal_effects.loc[['cos'],:].values * np.expand_dims(seasonal_cos.values, 1)

In [None]:
seasonal_effects.loc[['cos'],:]

In [None]:
seasonal_impact = seasonal_effects.loc[['cos'],:].values * np.expand_dims(seasonal_cos.values, 1) + \
    seasonal_effects.loc[['sin'],:].values * np.expand_dims(seasonal_sine.values, 1)

In [None]:
total_sales = effect + salesnoise + pd.DataFrame(true_baselines).iloc[0,:] + seasonal_impact

In [None]:
# total_sales = pd.DataFrame(
#     data=total_sales.values + np.expand_dims(sine.values, 1) + np.expand_dims(cos.values, 1),
#     index=total_sales.index, columns=total_sales.columns)

In [None]:
total_sales.head()

In [None]:
total_sales = total_sales.stack().to_frame(name='sales')

In [None]:
total_sales.head()

In [None]:
promo_viewed = promo.unstack(1)
promo_viewed.columns = promo_viewed.columns.droplevel(0)
promo_viewed = promo_viewed.reset_index().sort_values('date').set_index(['date', 'Brand'])
promo_viewed.head(10)

In [None]:
total_sales = total_sales - total_sales.min()
#total_sales = total_sales * 1000 / total_sales.max()

In [None]:
sales_viewed = total_sales.clip(0, np.inf)

In [None]:
data_viewed = promo_viewed.join(sales_viewed, on=['date', 'Brand'])

In [None]:
data_viewed.round(2).head(10)

In [None]:
sales_viewed.head(10)

In [None]:
spend = data_viewed.drop('sales', 1)
Yraw = data_viewed[['sales']]
spend.head(10)

In [None]:
seasonal_X = spend.reset_index()['date'].dt.dayofyear

sine_X, cos_X = np.sin(seasonal_X*2*np.pi/365), np.cos(seasonal_X*2*np.pi/365)
#sine_X = sine_X.to_frame('sine')
#cos_X = cos_X.to_frame('cos')

In [None]:
dummies = pd.get_dummies(spend.reset_index()['Brand'])
dummies = dummies.set_index(spend.index)

In [None]:

spend['cos'] = cos_X.values
spend['sin'] = sine_X.values

In [None]:
spend.reset_index().groupby('date')[['cos', 'sin']].max().head(10)

In [None]:
spend.reset_index().set_index('date')['Brand'].to_frame().head()

In [None]:
def color_background(val):
    color = '' if val < 1 else 'lightgreen'
    return 'background-color: ' + color
def color_number(val):
    color = 'red' if val < 1 else ''
    return 'color: %s' % color

In [None]:
dummies.head().style.applymap(color_background).applymap(color_number)

In [None]:
dummies.head()

In [None]:
dummy_spends = {}
for dummy in dummies.columns:
    brand_spend = pd.DataFrame(
        dummies[dummy].to_frame().values * spend.values, 
        index=spend.index, columns=['type:' + c + '_brand:' + dummy for c in spend.columns])
    dummy_spends[dummy] = brand_spend

In [None]:
Xraw = pd.concat((spend, dummies, *list(dummy_spends.values())), 1)

In [None]:
Xraw.head(10).tail(5).round()

In [None]:
xmeans, xstds = Xraw.mean(), Xraw.std()
ymeans, ystds = Yraw.mean(), Yraw.std()

In [None]:
xstds[spend.columns] = xstds[spend.columns] / 10
for col in xstds.index:
    if 'sin_' in col or 'cos_' in col:
        xstds[col] = xstds[col] / 10


In [None]:
xstds[brands] = xstds[brands] / 100

In [None]:
X = (Xraw - xmeans) / xstds
Y = (Yraw - ymeans)/ystds

In [None]:
X.round(2).head()

In [None]:
clf = Ridge(1e-5).fit(X, Y)

In [None]:
import tensorflow as tf
lr = tf.placeholder(tf.float32, ())
x_in = tf.placeholder(tf.float32, (None, X.shape[1]))
y_in = tf.placeholder(tf.float32, (None))
w = tf.Variable(np.random.randn(X.shape[1]), dtype=tf.float32)
b = tf.Variable(0, dtype=tf.float32)
yhat = tf.reduce_sum(x_in * tf.expand_dims(w, 0)) + b
loss = tf.reduce_mean(tf.square(y_in-yhat)) + 1e-15 * (tf.reduce_sum(tf.square(w)))

In [None]:
# optim = tf.train.AdamOptimizer(lr).minimize(loss)

# feed_dict = {
#     x_in:X, y_in:Y
# }

# s = tf.Session()
# s.run(tf.global_variables_initializer())
# for i in range(1000):
#     feed_dict[lr] = 1e0/np.sqrt(i+1)
#     l, _ = s.run([loss, optim], feed_dict)
#     if i % 100 == 0:
#         print(l)

In [None]:
# optim = tf.contrib.opt.ScipyOptimizerInterface(
#         loss, tol=0
#     )
# feed_dict = {
#     x_in:X, y_in:Y
# }

# s = tf.Session()
# s.run(tf.global_variables_initializer())

# with tqdm_disp() as tq:
#     y_hat_trace = []

#     def loss_callback(y_hat, loss):
#         y_hat_trace.append(y_hat)
#         tq.update()
#         tq.set_postfix_str('loss = %.6f' % (loss))

#     optim.minimize(
#         s,
#         feed_dict=feed_dict,
#         fetches=[yhat, loss],
#         loss_callback=loss_callback,
#     )
#     y_hat = y_hat_trace[-1]

In [None]:
#s.run(loss, feed_dict)

In [None]:
#coef_tf = s.run(w)

In [None]:
clf.coef_

In [None]:
coef = clf.coef_ / xstds.values * ystds.values
#coef = (np.expand_dims(coef_tf, 0) / xstds.values) * ystds.values

In [None]:
display = pd.DataFrame(coef, columns=X.columns)

In [None]:
aggregated = spend.columns
bias = dummies.columns
deltas = np.array([col for col in X.columns if col not in aggregated and col not in bias])

In [None]:
results = pd.DataFrame(
    data=np.zeros((num_brands+1, spend.columns.shape[0]+1)), 
    index=['aggregated'] + brands, columns = ['bias'] + list(spend.columns))

In [None]:
results

In [None]:
results.loc[['aggregated'], aggregated]


In [None]:
display[aggregated]

In [None]:
display

In [None]:
display['type:Display_brand:Gummies']

In [None]:
results.loc[['aggregated'], aggregated] = display[aggregated].values
for brand in brands:
    results.loc[brand, 'bias'] = display[brand].values
    results.loc[brand, aggregated] = display.loc[:,[b for b in deltas if brand in b]].values + display[aggregated].values

In [None]:
results

In [None]:
import calendar

In [None]:
def add_month(df):
    df = df.copy()

    df['phase'] = np.angle((
            df['sin'] * 1j + df['cos']).astype(complex))
    df['peak_month'] = ((
            (df['phase'] * 12 / 2 / np.pi)) % 12).round().astype(int)
    df.loc[df['peak_month'] == 0, 'peak_month'] = 12
    df['peak_month'] = df['peak_month'].apply(
        lambda x: calendar.month_abbr[x])
    df['seasonal_impact'] = np.sqrt(
        (np.square(df['sin']) + np.square(
            df['cos'])).astype('float'))
    df['seasonal_impact'] = (df[
        'seasonal_impact'] - df['seasonal_impact'].min()) / df['seasonal_impact'].std() + 0.1
    return df

In [None]:
groundtruth = add_month(brand_spend_combos_with_seasonality)
results2 = add_month(results)

In [None]:
groundtruth[list(aggregated) + ['peak_month', 'seasonal_impact']].round(2)

In [None]:
display[[col for col in display.columns if 'sin' in col and 'Cand' in col]]

In [None]:
results2[list(aggregated) + ['peak_month', 'seasonal_impact']].round(2)

In [None]:
display_aggregated = display.rename({col: 'aggregated:'+col for col in spend.columns}, axis=1)

In [None]:
display_aggregated

In [None]:
X.columns.shape

In [None]:
data_viewed.head()

In [None]:
def view_brand(df, b):
    plt.figure(figsize=(15, 10))
    brand = data_viewed.query('Brand == "' + b + '"')
    x = brand.reset_index()['date']
    for spend in brand.columns:
        vals = brand[spend]
        vals = vals - vals.min()
        vals = vals/vals.max()
        plt.plot(x, vals, label=spend)
    sin = np.sin(x.dt.dayofyear * 2 * np.pi/365)
    cos = np.cos(x.dt.dayofyear * 2 * np.pi/365)
    sin, cos = sin - sin.min(), cos - cos.min()
    sin, cos = sin/sin.max(), cos/cos.max()
    plt.plot(x, sin, label='sine')
    plt.plot(x, cos, label='cos')
    plt.legend()
    plt.show()


In [None]:
view_brand(data_viewed, 'Caffeinos')