In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import datetime
import matplotlib.pyplot as plt

In [2]:
# Load the raw export in a single pipeline:
#   1. skip the 6 leading lines of the file and drop rows with missing values
#      (the raw data ends with a run of all-NaN rows),
#   2. parse the YYYYMMDD date column into datetimes,
#   3. keep only the date window specified for Q3 (both end dates included),
#   4. order the rows by date.
df = (
    pd.read_csv('analytics_20191001_20211001.csv', skiprows=6)
    .dropna()
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y%m%d'))
    .loc[lambda frame: frame['Date'].between('2019-10-01', '2020-10-01')]
    .sort_values(by='Date')
)
df.head()

Unnamed: 0,Default Channel Grouping,Date,Users,New Users,Sessions,Bounce Rate,Pages / Session,Avg. Session Duration,Ecommerce Conversion Rate,Transactions,Revenue
2220,(Other),2019-10-01,69,42,70,32.86%,4.64,0:03:09,0.00%,0.0,$0.00
2405,Social,2019-10-01,62,54,66,51.52%,2.77,0:01:12,0.00%,0.0,$0.00
2087,Affiliates,2019-10-01,76,59,82,69.51%,1.91,0:01:52,0.00%,0.0,$0.00
978,Direct,2019-10-01,374,318,404,37.87%,4.17,0:03:13,0.00%,0.0,$0.00
3606,Display,2019-10-01,24,22,25,80.00%,1.28,0:00:03,0.00%,0.0,$0.00


Data wrangling/formatting:

In [3]:
# Convert channel to dummy variables. drop_first=True removes the first
# category so it acts as the baseline; dtype=int keeps the indicators as 0/1
# integers on every pandas version (newer releases default to booleans).
df = pd.get_dummies(
    df,
    columns=['Default Channel Grouping'],
    drop_first=True,
    prefix='channel',
    dtype=int,
)

# The numeric columns arrive as formatted strings ('1,234', '32.86%', '$1,234.50').
# Strip the formatting characters with vectorised string methods and cast.
# regex=False makes every pattern a literal, which matters for '$'.
# astype(str) keeps the cleanup working if a column was parsed as numeric
# because none of its values contained a separator.

# user/session counts -> integers
for col in ['Users', 'New Users', 'Sessions']:
    df[col] = pd.to_numeric(
        df[col].astype(str).str.replace(',', '', regex=False)
    ).astype(int)

# percentages -> fractions in [0, 1]
for col in ['Bounce Rate', 'Ecommerce Conversion Rate']:
    df[col] = df[col].astype(str).str.replace('%', '', regex=False).astype(float) / 100

# currency -> float
df['Revenue'] = (
    df['Revenue']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)



# function to convert time string to number of seconds
def to_seconds(time_str):
    """Convert an ``H:MM:SS`` duration string to a number of seconds.

    The string is treated as an elapsed duration, not a time of day, so the
    hours field may be 24 or larger (parsing with ``strptime`` and ``%H``
    would reject such values).

    Parameters
    ----------
    time_str : str
        Hours, minutes and seconds separated by colons, e.g. ``'0:03:09'``.

    Returns
    -------
    float
        Length of the duration in seconds.

    Raises
    ------
    ValueError
        If the string does not consist of three colon-separated integers, if
        hours is negative, or if minutes or seconds fall outside 0-59.
    """
    fields = time_str.split(':')
    if len(fields) != 3:
        raise ValueError("time data {!r} does not match format 'H:MM:SS'".format(time_str))

    # int() raises ValueError on empty or non-numeric fields
    hours, minutes, seconds = (int(field) for field in fields)
    if hours < 0 or not 0 <= minutes <= 59 or not 0 <= seconds <= 59:
        raise ValueError("time data {!r} does not match format 'H:MM:SS'".format(time_str))

    # return a float, matching what timedelta.total_seconds() would give
    return float(hours * 3600 + minutes * 60 + seconds)

# Convert session durations to seconds. Entries recorded as '<00:00:01' are
# rewritten to '00:00:01' first so that they can be parsed
# (regex=False: the pattern is a literal string).
df['Avg. Session Duration'] = (
    df['Avg. Session Duration']
    .str.replace('<00:00:01', '00:00:01', regex=False)
    .apply(to_seconds)
)

# get day of week
df['Day of Week'] = df['Date'].dt.day_name()
# create weekend dummy variable (1 for Saturday/Sunday, 0 otherwise)
df['Weekend'] = df['Day of Week'].isin(['Saturday', 'Sunday']).astype(int)
# get month of year via the vectorised datetime accessor
df['Month'] = df['Date'].dt.month
# Create month dummy variables. drop_first=True removes the first month
# category so it acts as the baseline; dtype=int keeps the indicators as 0/1
# integers on every pandas version (newer releases default to booleans).
df = pd.get_dummies(df, columns=['Month'], drop_first=True, prefix='month', dtype=int)

df

Unnamed: 0,Date,Users,New Users,Sessions,Bounce Rate,Pages / Session,Avg. Session Duration,Ecommerce Conversion Rate,Transactions,Revenue,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
2220,2019-10-01,69,42,70,0.3286,4.64,189.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2405,2019-10-01,62,54,66,0.5152,2.77,72.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2087,2019-10-01,76,59,82,0.6951,1.91,112.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
978,2019-10-01,374,318,404,0.3787,4.17,193.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
3606,2019-10-01,24,22,25,0.8000,1.28,3.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1102,2020-10-01,330,179,365,0.3233,8.05,285.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
833,2020-10-01,456,399,490,0.3163,5.66,204.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2319,2020-10-01,66,59,70,0.6000,2.81,55.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
982,2020-10-01,373,264,396,0.3611,4.43,195.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0


Running regressions:

In [5]:
# get all channel columns
# note that all coefficients on the channel variables will be with respect to (Other)
channels = [col for col in df.columns if col.startswith('channel')]
# get all month columns (coefficients are relative to the month removed by drop_first)
months = [col for col in df.columns if col.startswith('month')]
# variables for which to control
controls = ['New Users', 'Weekend'] + months
# response variables on which to run regressions
resp_vars = ['Bounce Rate', 'Avg. Session Duration', 'Ecommerce Conversion Rate', 'Transactions']

# Predictors. Cast everything to float so the design matrix has one numeric
# dtype: a mix of integer and boolean dummy columns would otherwise become an
# object array, which statsmodels refuses to fit.
X = df[channels + controls].astype(float)
# add the intercept column
X = sm.add_constant(X)

# run one OLS regression per response variable on the same predictors
for var in resp_vars:
    print(var)
    y = df[var]
    est = sm.OLS(y, X).fit()
    print(est.summary())
    # blank lines to separate consecutive summaries
    print('\n'*6)
    

Bounce Rate
                            OLS Regression Results                            
Dep. Variable:            Bounce Rate   R-squared:                       0.703
Model:                            OLS   Adj. R-squared:                  0.701
Method:                 Least Squares   F-statistic:                     343.9
Date:                Mon, 08 Nov 2021   Prob (F-statistic):               0.00
Time:                        12:23:49   Log-Likelihood:                 2719.2
No. Observations:                2928   AIC:                            -5396.
Df Residuals:                    2907   BIC:                            -5271.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const           

Display users and revenue for each channel:

In [11]:
# Print the total users and total revenue for each channel indicator column.
# Only channels that have an indicator column are listed; the baseline
# category removed by drop_first has none, so it does not appear here.
for col in channels:
    # select this channel's rows once and reuse them for both totals
    channel_rows = df.loc[df[col] == 1, ['Users', 'Revenue']]
    print(col)
    print(channel_rows['Users'].sum())
    print(channel_rows['Revenue'].sum())

channel_Affiliates
18862
0.0
channel_Direct
134904
20752.010000000002
channel_Display
21064
42.0
channel_Organic Search
364989
35755.86
channel_Paid Search
15481
2246.8
channel_Referral
107694
225.25
channel_Social
27944
354.5
