# Instructions for reproducing the results of Step 1

## 1. Import the data and define the initial confounders

In [None]:
import os

import numpy as np
import pandas as pd

from datatools import *

In [None]:
# Initially considered confounders: for pre-periods 1-3 the six engagement
# counts followed by the two stream counts; pre-period 3 additionally carries
# the 'cum'-prefixed variants of the engagement counts.
def _build_pre_confounders():
    """Return the initial confounder column names in their fixed order."""
    engagement = ['saves', 'follows', 'playlists', 'tickets', 'merch', 'shares']
    streams = ['streams_active_streams', 'streams_programmed_streams']
    columns = []
    for period in (1, 2, 3):
        metrics = list(engagement)
        if period == 3:
            # Only the third pre-period lists the 'cum*' columns, placed
            # between the engagement columns and the stream columns.
            metrics += ['cum' + metric for metric in engagement]
        metrics += streams
        columns += [metric + '_pre_period_' + str(period) for metric in metrics]
    return columns


pre_confounders = _build_pre_confounders()

In [None]:
# Read the main dataset from disk into `all_data`.
# NOTE(review): the file has a .pickle extension, so load_data presumably
# unpickles it -- only point this at a trusted file; confirm in datatools.
main_data_path = '../Data/CausalFandom_main_data.pickle'
all_data = load_data(main_data_path)

## 2. Run the SMD test to determine the optimal number of clusters

In [None]:
# Cluster-count search: for every outcome and every candidate number of
# clusters, record the difference between the mean of `le_after` and the mean
# of `le_before` as returned by smd_calculation_confounders.
pre_list = []
post_list = []
result_list = []
clus_size_list = []

# Pre-period outcome columns; the last entry groups the two stream columns.
pre_outcome = [
    'tickets_pre_period_1',
    'merch_pre_period_1',
    'shares_pre_period_1',
    [
        'streams_active_streams_pre_period_1',
        'streams_programmed_streams_pre_period_1'
    ]
]

# Post-period outcome columns, aligned position by position with pre_outcome.
post_outcome = [
    'tickets_following_four_weeks',
    'merch_following_four_weeks',
    'shares_following_four_weeks',
    [
        'streams_active_streams_following_four_weeks',
        'streams_programmed_streams_following_four_weeks'
    ]
]

# Candidate numbers of clusters to evaluate.
candidate_clus_sizes = [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 90, 120, 150]

# Gap pair handed to get_case_control_label; the search uses (-1, 2) throughout.
(control_gap, case_gap) = (-1, 2)

# The confounder set is the same for every outcome and cluster count, so it is
# built once instead of once per inner iteration.
confounders = variable_list('art') + variable_list('usr') + pre_confounders

# Never ask for more rows than exist: sampling without replacement raises when
# n exceeds the number of rows.
sample_size = min(5000, len(all_data))

for pre_period_name, post_period_name in zip(pre_outcome, post_outcome):
    # A fresh subsample is drawn per outcome. Rows with missing values are
    # dropped after sampling, so fewer than `sample_size` rows may remain.
    # NOTE(review): the sample is unseeded, so reruns can differ -- consider
    # passing random_state if the search must be reproducible.
    sampled_data = all_data.sample(n=sample_size, replace=False).dropna()

    # The labels take only the outcome names, the sample and the gaps as
    # input, none of which change with the cluster count: compute them once.
    case_control_label = get_case_control_label(
        pre_period_name, post_period_name, sampled_data, control_gap, case_gap
    )

    # Iterate through all candidate cluster counts.
    for clus_size in candidate_clus_sizes:
        pre_list.append(pre_period_name)
        post_list.append(post_period_name)
        clus_size_list.append(clus_size)
        clus_label = kmeans_confoudners(sampled_data, clus_size, confounders)
        smd_before, smd_after, le_before, le_after = smd_calculation_confounders(
            sampled_data, clus_label, confounders, case_control_label
        )
        diff = np.mean(le_after) - np.mean(le_before)
        result_list.append(diff)

# Store the results for later visualisation.
smd = pd.DataFrame(
    {
        'pre period var': pre_list,
        'post period var': post_list,
        'clus size': clus_size_list,
        'SMD diff: ': result_list,
    }
)

The number of clusters chosen after running the cell above is:

In [None]:
# Chosen number of clusters, one [first, second] pair per outcome. The two
# entries of a pair are consumed in step with the two gap settings
# [(0, 1), (-1, 2)] used when the statistics are calculated.
clus_size_list = [
    [15, 10],  # tickets
    [10, 10],  # merch
    [15, 10],  # shares
    [20, 10],  # streams
]

## 3. Calculate the statistics ('./plots' directory is needed to store plots)

In [None]:
# Produce the statistical results: for every outcome and gap setting, plot the
# threshold check and the SMD comparison, record the SMD difference, and
# collect the table returned by cal_stats in `result_list`.
pre_list = []
post_list = []
thres_list = []
smd_list = []
result_list = []

# Pre-period outcome columns; the last entry groups the two stream columns.
pre_outcome = [
    'tickets_pre_period_1',
    'merch_pre_period_1',
    'shares_pre_period_1',
    [
        'streams_active_streams_pre_period_1',
        'streams_programmed_streams_pre_period_1'
    ]
]

# Post-period outcome columns, aligned position by position with pre_outcome.
post_outcome = [
    'tickets_following_four_weeks',
    'merch_following_four_weeks',
    'shares_following_four_weeks',
    [
        'streams_active_streams_following_four_weeks',
        'streams_programmed_streams_following_four_weeks'
    ]
]

# (control_gap, case_gap) settings evaluated for every outcome; the same for
# all outcomes, so defined once outside the loops.
gap_list = [(0, 1), (-1, 2)]

# The confounder set does not depend on the outcome or the gap setting.
confounders = variable_list('art') + variable_list('usr') + pre_confounders

# The section heading states that a './plots' directory is needed to store
# the plots; create it up front so the plotting helpers do not fail on a
# fresh checkout.
os.makedirs('./plots', exist_ok=True)

for pre_period_name, post_period_name, cur_size_list in zip(
    pre_outcome, post_outcome, clus_size_list
):
    cleaned_data = clean_data(pre_period_name, post_period_name, all_data).dropna()

    # cur_size_list holds one cluster count per gap setting, in gap_list order.
    for (control_gap, case_gap), cur_size in zip(gap_list, cur_size_list):
        # Collect the identifiers of this result.
        thres_list.append((control_gap, case_gap))
        pre_list.append(pre_period_name)
        post_list.append(post_period_name)

        # Get the plot for threshold and ecdf.
        check_threshold(
            pre_period_name, post_period_name, cleaned_data,
            control_gap, case_gap, ifplot=True
        )

        # Produce the SMD result.
        case_control_label = get_case_control_label(
            pre_period_name, post_period_name, cleaned_data, control_gap, case_gap
        )
        clus_label = kmeans_confoudners(cleaned_data, cur_size, confounders)
        smd_before, smd_after, le_before, le_after = smd_calculation_confounders(
            cleaned_data, clus_label, confounders, case_control_label
        )
        diff = np.mean(le_after) - np.mean(le_before)
        smd_list.append(diff)

        # Get the SMD plot.
        smd_plot_calmean(
            le_before, le_after, pre_period_name, post_period_name,
            pd.DataFrame(), control_gap, case_gap, name_suffix='originaldata'
        )

        # Produce the final result.
        stat_results = cal_stats(
            cleaned_data, variable_list('treat_act'), clus_label, case_control_label
        )
        result_list.append(stat_results)

# Summary of the SMD difference per outcome and gap setting.
result = pd.DataFrame(
    {
        'pre period var': pre_list,
        'post period var': post_list,
        'SMD diff: ': smd_list,
        'threshold': thres_list,
    }
)

## 4. Save the results locally

In [None]:
# Save every table in `result_list` to its own sheet of one Excel workbook.
preoutcome = [
    'tickets_pre1',
    'merch_pre1',
    'shares_pre1',
    'streams_pre1',
]
# NOTE: not used when building the sheet names below (the 'f4k_' part is
# hard-coded); kept only in case other cells reference it.
postoutcome = [
    'f4k',
    'merch_following_four_weeks',
    'shares_following_four_weeks',
    'streams_following_four_weeks'
]
gap_list = [(0, 1), (-1, 2)]

# One sheet name per (outcome, gap setting), outcome-major -- the same order
# in which the statistics loop appends to `result_list`.
name_list = [
    pre_period_name + 'f4k_' + str(control_gap) + '_' + str(case_gap)
    for pre_period_name in preoutcome
    for (control_gap, case_gap) in gap_list
]

# Fail before touching the file if there are more results than sheet names,
# rather than part-way through writing the workbook.
if len(result_list) > len(name_list):
    raise ValueError(
        'result_list has %d entries but only %d sheet names are defined'
        % (len(result_list), len(name_list))
    )

# Save the result to the local directory (./stat_result), creating it if it
# does not exist yet.
os.makedirs('./stat_result', exist_ok=True)
with pd.ExcelWriter('./stat_result/originalresult.xlsx') as writer:
    for name, item in zip(name_list, result_list):
        item.to_excel(writer, sheet_name=name)
