# Processing of overall statistics for dataset 

Author: Maksim Krivolapov maksim.krivolapov@roivenue.com

Goals:

How to use it:

In [50]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np

import ipywidgets as widgets
import cufflinks as cf
cf.go_offline()

import plotly.express as px
import plotly.graph_objects as go
pd.options.plotting.backend = "plotly"

## ToDo
1. Read CSV
2. Parse data
3. Get list of platforms, campaigns, channels
4. Choose platforms
5. Calculate statistics for every channel / campaign

In [51]:
DateTimeID = ['periodStartDate']

usecols = [
    "adCampaign", "platformCode","currencyCode", "channelCode", "marketingInvestment",
    "impressions", "clicks", "visits", "conversions", "deliveries",
    "netRevenue", "grossProfit", "adGroup", "periodStartDate","businessUnit"
          ]

dtype = {
    'adCampaign' : str,
    'platformCode' : str,
    'businessUnit':str,
    'currencyCode':str,
    'channelCode' : str,
    'impressions' : int,
    'clicks' : int,
    'visits' : int,
    'conversions' : int,
    'deliveries' : int,
    'netRevenue' : float,
    'grossProfit' : float,
    'adGroup' : str,
    'marketingInvestment': float
}

In [52]:
#file_name = 'MarketingReport_3.csv'
file_name = 'MarketingReport_new.csv'



## Function's prototypes 

In [53]:
stat_name = ['Mean','Std','Med','Sqew','Kurtosis','Var','Iqr','Shapiro-stat','Shapiro-Pr']
regr_name = ['a','b','c','a-err','b-err','c-err']
r2_stat_name = ['R2-mean','R2-std','R2-norm','R2-chi-sqr'] # func [line, log] if line -> c=NaN

In [54]:
def main_statistics(df, precision = 3):
    """ Function input pandas series or numpy array
        input: dataframe, precision -> digits after point
        output: mean, std, median, skewness, kurtosis, variance, interquartile range, Shapiro-Wilk Test [Stat, Pr] """
    from scipy.stats import kurtosis, iqr, shapiro
    result = []
    count,mean, std, *all = df.describe()
    result.append(np.round(mean, precision))
    result.append(np.round(std, precision))
    result.append(np.round(df.median(), precision))
    result.append(np.round(df.skew(axis = 0, skipna = True), precision ))
    result.append(np.round(kurtosis(df, fisher=True), precision ))
    result.append(np.round(df.var(), precision))
    result.append((np.round(iqr(df, axis=0, keepdims=True), precision)).item())
    if len(df) > 2:
        stat, p = shapiro(df)
    else:
        stat = np.nan
        p = np.nan
    result.append(np.round(stat,precision))
    result.append(np.round(p,precision))
    return result

In [55]:
# regression functions definition
def log_f(x, a, b, c):
    return a * (1-np.exp(-x/b)) + c #a * (1 - np.exp((x/b)))

def line_f(x, a, b):
    return a * x + b

def sine_f(x, a, b):
    return a * np.sin(b * x)

In [56]:
# function for genetic algorithm to minimize (sum of squared error)
def sumOfSquaredError(parameterTuple):
    function = regr_func # it's bad staff #TODO
    warnings.filterwarnings("ignore") # do not print warnings by genetic algorithm
    val = function(t_train, *parameterTuple)
    return np.sum((y_train - val) ** 2.0)

In [57]:
# function for search initial value for regression parameters
from scipy.optimize import differential_evolution
import warnings
def generate_Initial_Parameters(t_train, y_train,function):
    # min and max used for bounds
    maxX = max(t_train)
    minX = min(t_train)
    maxY = max(y_train)
    minY = min(y_train)
    maxXY = max(maxX, maxY)

    parameterBounds = []
    parameterBounds.append([-maxXY, maxXY]) # seach bounds for a
    parameterBounds.append([-maxXY, maxXY]) # seach bounds for b
    parameterBounds.append([-maxXY, maxXY]) # seach bounds for c

    # "seed" the numpy random number generator for repeatable results
    result = differential_evolution(sumOfSquaredError, parameterBounds, seed=3)
    return result.x

In [58]:
def regression_calc(df, function):
    """Calculate paramiters of regression function
    input: pandas dataframe [x,y]
    output: list[paramiters], list[std diviation err of paramiters]"""
    from scipy.optimize import curve_fit
    t_train = df[df.columns[0]]  #
    y_train = df[df.columns[1]]
    geneticParameters = generate_Initial_Parameters(t_train, y_train, function)
    popt, pcov = curve_fit(function, t_train, y_train, geneticParameters)
    #popt, pcov = curve_fit(function, t_train, y_train)
    error  = np.sqrt(np.diag(pcov))
    return popt, error

In [59]:
def select_business_unit(df, business_unit):
    """Select specific busines unti in dataframe """
    return df[df['businessUnit'] == business_unit]

In [60]:
def r2_test(df, function, *args):
    """General statistics for residual norm R2
    input: pandas dataframe[x,y], regression function, function arguments, argument's keywords
    output: input dataframe + residual column , residual mean, residual std, residual normality test, Chi-squared test"""
    result_df = df['residual'] = df[df.columns[1]] - df[df.columns[0]].apply(function(*args))
    median = np.round(result_df['residual'].median(), precision)
    return result_df

In [61]:
def select_agg_resample_df(df, index, granularity, use_nan):
    """Function for select and group  data in dataframe
    input: pandas dataframe, name of index, granularity['week','month'] default as in dataframe
    return: aggregated dataframe with extra column with indexes [ROI, CPI, CPC]"""
    group_df = df.groupby(['periodStartDate']).agg({
    'netRevenue': 'sum','marketingInvestment': 'sum', 'visits': 'sum', 'conversions': 'sum',
    'deliveries': 'sum', 'impressions': 'sum', 'clicks': 'sum', 'grossProfit': 'sum' })   

    if granularity == 'week':
        group_agg_df = group_df.resample('W-MON').agg('sum')
    elif granularity == 'month':
        group_agg_df = group_df.resample('M', convention='end').agg('sum')
    elif granularity == 'day':
        group_agg_df = group_df
    else:
        raise ValueError("Incorrect aggregation period, shuld be 'day', 'week' or 'month'")

    group_agg_df['ROI'] = (group_agg_df['netRevenue']/group_agg_df['marketingInvestment'])
    group_agg_df['CPI'] = (group_agg_df['marketingInvestment']/group_agg_df['impressions'])
    group_agg_df['CPC'] = (group_agg_df['marketingInvestment']/group_agg_df['clicks'])
    group_agg_df['POI'] = (group_agg_df['marketingInvestment']/group_agg_df['grossProfit']) # Profit over investment

    if use_nan == False:
        group_agg_df.fillna(0)
    else:
        pass
    return group_agg_df

In [62]:
def stat_index_platform(df, platform_code, granularity, index, precision):
    """Calculate marketing indexis for all platforms
    input: dataframe, granularity, index[ROI, CPI, CPC], precision
    output: dataframe with statistics for all platforms"""
    statistics_ROI = []
    for i in platform_code:
        new_df = df[df['platformCode'] == i]

        selected_df = select_agg_resample_df(new_df, 'periodStartDate', granularity, use_nan = False)      

        stat = main_statistics(selected_df[index], precision)

        statistics_ROI.append(stat)
        
    statistics_df = pd.DataFrame(statistics_ROI, columns = stat_name)
    statistics_df = statistics_df.set_index([pd.Index(platform_code)])
    return statistics_df


## Read Dataset

In [83]:
df = pd.read_csv(file_name,
                 sep="\t",
                 infer_datetime_format=True,
                 usecols=usecols,
                 dtype=dtype,
                 parse_dates=DateTimeID)

df['periodStartDate'] = pd.to_datetime(df['periodStartDate'])

platform_code = df.platformCode.unique().tolist()
platform_code.sort()

business_unit_code = df.businessUnit.unique().tolist()
business_unit_code.sort()

channel_code_code = df.channelCode.unique().tolist()
channel_code_code.sort()

currency_code_code = df.currencyCode.unique().tolist()
currency_code_code.sort()

df.head(5)

Unnamed: 0,businessUnit,platformCode,channelCode,adCampaign,marketingInvestment,impressions,clicks,visits,conversions,deliveries,netRevenue,grossProfit,currencyCode,adGroup,periodStartDate
0,kytaryro,unpaid,direct,,0.0,0,0,0,1,0,0.0,0.0,CZK,(not set),2019-01-01
1,kytaryro,AdWords,googleadsdsother,,0.0,0,0,2,0,0,0.0,0.0,CZK,(not set),2019-01-01
2,kytaryro,unpaid,organic,,0.0,0,0,0,1,1,1997.8,348.94,CZK,(not set),2019-01-01
3,kytaryro,Compari,compari,-,116.77,0,65,0,0,0,0.0,0.0,CZK,,2019-01-01
4,kytaryro,AdWords,googleadsdsother,(not set),0.0,0,0,0,0,0,0.0,0.0,CZK,(not set),2019-01-01


In [84]:
for i in platform_code:
    group_df = df[df['platformCode'] == i]
    adgroups = group_df['adGroup'].unique()
    channels = group_df['channelCode'].unique()
    campaigns = group_df['adCampaign'].unique()
        
    print(f'Platform {i} has \t{len(adgroups)} adgroups and \t{len(channels)} channels and \t{len(campaigns)} campaign')


Platform AdWords has 	12594 adgroups and 	27 channels and 	1547 campaign
Platform Adform has 	3 adgroups and 	1 channels and 	36 campaign
Platform Adminis has 	3 adgroups and 	1 channels and 	16 campaign
Platform Arukereso has 	2 adgroups and 	1 channels and 	110 campaign
Platform Bing has 	1 adgroups and 	1 channels and 	1 campaign
Platform Ceneo has 	3 adgroups and 	1 channels and 	70 campaign
Platform CjAffiliate has 	2 adgroups and 	2 channels and 	82 campaign
Platform Compari has 	3 adgroups and 	1 channels and 	97 campaign
Platform DogNet has 	2 adgroups and 	1 channels and 	44 campaign
Platform DoubleClick has 	3 adgroups and 	2 channels and 	14 campaign
Platform FacebookBusinessAdsManager has 	37 adgroups and 	7 channels and 	284 campaign
Platform HeurekaCz has 	3 adgroups and 	1 channels and 	453 campaign
Platform HeurekaSk has 	83 adgroups and 	1 channels and 	331 campaign
Platform Idealo has 	3 adgroups and 	2 channels and 	18 campaign
Platform Internal has 	2 adgroups and 	

In [65]:
print(platform_code)

['AdWords', 'Adform', 'Adminis', 'Arukereso', 'Bing', 'Ceneo', 'CjAffiliate', 'Compari', 'DogNet', 'DoubleClick', 'FacebookBusinessAdsManager', 'HeurekaCz', 'HeurekaSk', 'Idealo', 'Internal', 'Not tracked', 'OfflineTV', 'Olcsobbat', 'OneSignal', 'Organic', 'Outdoor', 'POS', 'Pricespy', 'Print', 'Promo_materialy', 'Radio', 'RtbHouse', 'Samba', 'Silverpop', 'Sklik', 'Yottly', 'ZboziCz', 'notset', 'unpaid']


In [66]:
print(business_unit_code)

['kytaryat', 'kytarycz', 'kytaryde', 'kytaryfr', 'kytaryhu', 'kytaryit', 'kytarypl', 'kytaryro', 'kytarysk', 'kytaryuk']


In [67]:
print(currency_code_code)

['CZK']


## Settings

In [68]:
independent_var = 'marketingInvestment'
dependent_var   = 'grossProfit'#'netRevenue' # 
business_unit   = 'kytarycz'
window_size     = 3 # window's size of filters

In [69]:
# Data source
data_source = widgets.Dropdown(
    options = ['platform','compaign','channel'],
    value='platform',
    description='Data source:',
    disabled=False,
)
display(data_source)

Dropdown(description='Data source:', options=('platform', 'compaign', 'channel'), value='platform')

In [70]:
# platform code
if 'All' not in business_unit_code: business_unit_code.insert(0, 'All')
business_unit_choose = widgets.Dropdown(
    options = business_unit_code,
    value = 'All',
    description='Business Unit:',
    disabled=False,
)
display(business_unit_choose)

Dropdown(description='Business Unit:', options=('All', 'kytaryat', 'kytarycz', 'kytaryde', 'kytaryfr', 'kytary…

In [71]:
business_unit_choose.value

'All'

In [72]:
granularity = widgets.Dropdown(
    options=['day', 'week', 'month'],
    value='week',
    description='Granularity:',
    disabled=False,
)
display(granularity)

Dropdown(description='Granularity:', index=1, options=('day', 'week', 'month'), value='week')

In [73]:
granularity.value

'week'

In [74]:
reggr_func = widgets.Dropdown(
    options=['log', 'sin', 'line'],
    value='log',
    description='Granularity:',
    disabled=False,
)
display(reggr_func)

Dropdown(description='Granularity:', options=('log', 'sin', 'line'), value='log')

In [75]:
filter_method = widgets.Dropdown(
    options = ['none','median','avrg','ewm'],
    value = 'avrg',
    description='Platform:',
    disabled=False,
)
display(filter_method)

Dropdown(description='Platform:', index=2, options=('none', 'median', 'avrg', 'ewm'), value='avrg')

In [76]:
filter_method.value

'avrg'

In [77]:
window_size = 10

In [78]:
regr_func = log_f

## Aggregate data and calculate descriptive statistics for dataset

In [79]:
#
if business_unit_choose.value == 'All':
    df_selected_unit = df
else:
    df_selected_unit = select_business_unit(df, business_unit_choose.value)
df_selected_unit.head()

Unnamed: 0,businessUnit,platformCode,channelCode,adCampaign,marketingInvestment,impressions,clicks,visits,conversions,deliveries,netRevenue,grossProfit,currencyCode,adGroup,periodStartDate
0,kytaryro,unpaid,direct,,0.0,0,0,0,1,0,0.0,0.0,CZK,(not set),2019-01-01
1,kytaryro,AdWords,googleadsdsother,,0.0,0,0,2,0,0,0.0,0.0,CZK,(not set),2019-01-01
2,kytaryro,unpaid,organic,,0.0,0,0,0,1,1,1997.8,348.94,CZK,(not set),2019-01-01
3,kytaryro,Compari,compari,-,116.77,0,65,0,0,0,0.0,0.0,CZK,,2019-01-01
4,kytaryro,AdWords,googleadsdsother,(not set),0.0,0,0,0,0,0,0.0,0.0,CZK,(not set),2019-01-01


In [80]:
# calculate statistics for selected busines unit(s)
statistics_ROI_df = stat_index_platform(df_selected_unit, platform_code, granularity.value, 'ROI', 3)

In [81]:
statistics_ROI_df

Unnamed: 0,Mean,Std,Med,Sqew,Kurtosis,Var,Iqr,Shapiro-stat,Shapiro-Pr
AdWords,8.822,1.854,8.983,0.324,1.214,3.436,2.096,0.975,0.043
Adform,inf,,0.034,,,,,,1.0
Adminis,inf,,inf,,,,,,1.0
Arukereso,5.144,3.306,4.227,2.888,11.583,10.931,2.911,0.747,0.0
Bing,,,,,,,,,1.0
Ceneo,6.348,4.429,5.292,1.671,2.782,19.616,4.064,0.843,0.0
CjAffiliate,,,inf,,,,,,1.0
Compari,14.829,10.015,12.415,1.465,2.451,100.296,10.826,0.885,0.0
DogNet,11.867,4.981,10.769,1.452,,24.808,,,1.0
DoubleClick,inf,,inf,,,,,,1.0


## Calculate regression's coefficients for chosen platform(s)

In [82]:
statistics_R2 = []
for i in platform_code:
    new_df = df_selected_unit[df_selected_unit['platformCode'] == i]
    selected_df_unfiltered = select_agg_resample_df(new_df, 'periodStartDate', 'day', use_nan = False)
    if filter_method.value   == 'none':
        selected_df_filtered = selected_df_unfiltered
    elif filter_method.value == 'avrg':
        selected_df_filtered = selected_df_unfiltered.rolling(window=window_size).mean()
    elif filter_method.value == 'median':
        pass
    elif filter_method.value == 'ewm':
        selected_df_filtered = selected_df_unfiltered.ewm(alpha=0.3, adjust=False).mean()
    else:
        pass
    selected_df = select_agg_resample_df(selected_df_filtered, 'periodStartDate', granularity.value, use_nan = False)
    if len(np.intersect1d(statistics_ROI_df.loc[i,:][0:2], [np.nan, np.inf, 0.0, 'Nan'])):
        regr = [np.nan, np.nan, np.nan]
        err = [np.nan, np.nan, np.nan]
    else:
        t_train = selected_df[independent_var]
        y_train = selected_df[dependent_var]
        
        regr, err = regression_calc(selected_df[[independent_var,dependent_var]],log_f)

    statistics_R2.append(np.concatenate((regr, err), axis=0))


statistics_R2_df = pd.DataFrame(statistics_R2, columns = regr_name)
statistics_R2_df = statistics_R2_df.set_index([pd.Index(platform_code)])

RuntimeError: Optimal parameters not found: Number of calls to function has reached maxfev = 800.

In [None]:
statistics_R2_df

## Residual vector calculation 

## Statistics of vector calculation

* Correlation R vector with initial dataset
* Statistics of R vector
* Norm - test of R-vector

In [None]:
# platform code
#if '__All__' not in platform_code: platform_code.insert(0, '__All__')
platform = widgets.Dropdown(
    options = platform_code,
    value = platform_code[0],
    description='Platform:',
    disabled=False,
)
display(platform)

In [None]:
platform.value

In [None]:
new_df = df_selected_unit[df_selected_unit['platformCode'] == platform.value]
#new_df['impressions']  = new_df['impressions'].rolling(window=10).mean()
#new_df['impressions'] = new_df['impressions'].rolling(5).mean()
df_selected_unit.head()


In [None]:
selected_df_unfiltered = select_agg_resample_df(new_df, 'periodStartDate', 'day', use_nan = False)
if filter_method.value   == 'none':
    selected_df_filtered = selected_df_unfiltered
elif filter_method.value == 'avrg':
    selected_df_filtered = selected_df_unfiltered.rolling(window=window_size).mean()
elif filter_method.value == 'median':
    pass
else:
    pass

selected_df = select_agg_resample_df(selected_df_filtered, 'periodStartDate', granularity.value, use_nan = False)
sorted_df = selected_df.sort_values(by = independent_var)
selected_df.head()

In [None]:
filter_method.value

In [None]:
selected_df

In [None]:
fig1 = selected_df.plot()
fig1.update_layout(title_text="Data distributions on "+ platform.value+' plaform and '+business_unit_choose.value+' business unit', title_font=dict(size=16))
fig1.update_layout(uniformtext_minsize=16, uniformtext_mode='hide')
fig1.show()

In [None]:
fig6 = selected_df.plot(selected_df, x = selected_df.index, y = ['POI'])
fig6.update_layout(title_text="Data distributions on "+ platform.value+' plaform and '+business_unit_choose.value+' business unit', title_font=dict(size=20))
fig6.update_layout(uniformtext_minsize=16, uniformtext_mode='hide')
fig6.show()

In [None]:
R = sorted_df[dependent_var]-regr_func(sorted_df[independent_var], *(statistics_R2_df.loc[platform.value, ['a','b', 'c']]))
upper = regr_func(sorted_df[independent_var],  *(statistics_R2_df.loc[platform.value, ['a','b','c']])) + 2*R.describe()[2]
lower = regr_func(sorted_df[independent_var],  *(statistics_R2_df.loc[platform.value, ['a','b','c']])) - 2*R.describe()[2]

In [None]:
import kaleido
import orca

In [None]:
import plotly.graph_objects as go
fig3 = go.Figure()
# Add traces
fig3.add_trace(
    go.Scatter(x=sorted_df[independent_var] ,
               y=sorted_df[dependent_var],
               mode='markers',
               name='Polynomial regression (projection)'))

fig3.add_trace(
    go.Scatter(x=sorted_df[independent_var] ,
               y=regr_func(sorted_df[independent_var],  *(statistics_R2_df.loc[platform.value, ['a','b','c']])),
               mode='lines',
               name='Polynomial regression (projection)'))
fig3.add_trace(
    go.Scatter(
        x=sorted_df[independent_var] , # x, then x reversed
        y=upper, # upper, then lower reversed
        #fill='toself',
       # fillcolor='rgba(0,100,80,0.2)',
       # line=dict(color='rgba(255,255,255,0)'),
       # hoverinfo="skip",
        name='Upper margin one sigma',
        showlegend=True
    ))
fig3.add_trace(
    go.Scatter(
        x=sorted_df[independent_var] , # x, then x reversed
        y=lower, # upper, then lower reversed
        #fill='toself',
       # fillcolor='rgba(0,100,80,0.2)',
       # line=dict(color='rgba(255,255,255,0)'),
       # hoverinfo="skip",
        name='Lower margin one sigma',
        showlegend=True
    ))

fig3.update_layout(
    title_text=
dependent_var+' over '+independent_var+' on '+ platform.value+' plaform and '+business_unit_choose.value+' business unit',
    title_font=dict(size=16))

fig3.write_image("images/fig3.jpeg")

fig3.show()

In [None]:
log_f.__name__

In [None]:
statistics_R2_df.loc[ platform.value , ['a','b','c']]

In [None]:
R = selected_df[dependent_var]-regr_func(selected_df[independent_var],*(statistics_R2_df.loc[platform.value, ['a','b','c']]))

In [None]:
fig4 = go.Figure()
# Add traces
fig4.add_trace(
    go.Scatter(x=sorted_df[independent_var] ,
               y=R,
               mode='markers',
               name='Polynomial regression (projection)'))

fig4.update_layout(
    title_text=
    f"Residual vector",
    title_font=dict(size=20))
fig4.show()

In [None]:
R.hist(bins= 5)

In [None]:
pd.Series(main_statistics(R,3), index =stat_name)

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Initialize figure with subplots
fig = make_subplots(
    rows=4, cols=1, subplot_titles=("Plot 1", "Plot 2", "Plot 3", "Plot 4")
)

# Add traces
fig.add_trace(go.Scatter(x=[1, 2, 3], y=[4, 5, 6]), row=1, col=1)
fig.add_trace(go.Scatter(x=[1, 2, 3], y=[3.5, 4.5, 5.5]), row=1, col=1)

fig.add_trace(go.Scatter(x=[20, 30, 40], y=[50, 60, 70]), row=2, col=1)
fig.add_trace(go.Scatter(x=[300, 400, 500], y=[600, 700, 800]), row=3, col=1)
fig.add_trace(go.Scatter(x=[4000, 5000, 6000], y=[7000, 8000, 9000]), row=4, col=1)

# Update xaxis properties
fig.update_xaxes(title_text="xaxis 1 title", row=1, col=1)
fig.update_xaxes(title_text="xaxis 2 title", range=[10, 50], row=2, col=1)
fig.update_xaxes(title_text="xaxis 3 title", showgrid=False, row=3, col=1)
fig.update_xaxes(title_text="xaxis 4 title", type="log", row=4, col=1)

# Update yaxis properties
fig.update_yaxes(title_text="yaxis 1 title", row=1, col=1)
fig.update_yaxes(title_text="yaxis 2 title", range=[40, 80], row=2, col=1)
fig.update_yaxes(title_text="yaxis 3 title", showgrid=False, row=3, col=1)
fig.update_yaxes(title_text="yaxis 4 title", row=4, col=1)

# Update title and height
fig.update_layout(title_text="Customizing Subplot Axes", height=1000, width=600)


fig.show()

In [None]:
def optimization_func():
    """Input: df with statistics and regrettion coefficients, regrettion function, objective func, value of objective
    Output:  optimal points in df """
    return result

In [None]:
df.info()

In [None]:
df.to_sparse().info()

In [None]:
df.info(memory_usage='deep')