# Case study: Opportunity cost

## Setup

In [78]:
import pandas as pd
import altair as alt

# Turn off Altair's max-rows check so DataFrames can be passed to charts regardless of size.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Import data

In [79]:
# Location of the raw opportunity-cost data (one row per student).
DATA_URL = "https://raw.githubusercontent.com/kirenz/datasets/master/opportunity_cost.csv"

# Load the CSV and preview the first rows.
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,group,decision
0,control,buy video
1,control,buy video
2,control,buy video
3,control,buy video
4,control,buy video


In [80]:
# Show column names, non-null counts and dtypes to check the data for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   group     150 non-null    object
 1   decision  150 non-null    object
dtypes: object(2)
memory usage: 2.5+ KB


In [81]:
# Count the students in each experimental group.
df['group'].value_counts()

control      75
treatment    75
Name: group, dtype: int64

## Observed data

### Crosstable

In [82]:
# Contingency table of group vs. decision, including row and column totals ("All").
pd.crosstab(index=df["group"], columns=df["decision"], margins=True)

decision,buy video,not buy video,All
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
control,56,19,75
treatment,41,34,75
All,97,53,150


In [83]:
# Same table as proportions within each group (rows sum to 1), rounded to two decimals.
pd.crosstab(
    index=df["group"],
    columns=df["decision"],
    normalize="index",
    margins=True,
).round(2)

decision,buy video,not buy video
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,0.75,0.25
treatment,0.55,0.45
All,0.65,0.35


### Calculate difference

In [84]:
# Split the observations by experimental group.
df_control = df.loc[df["group"] == "control"]
df_treatment = df.loc[df["group"] == "treatment"]

# Group sizes.
control_total = df_control.shape[0]
treatment_total = df_treatment.shape[0]

# Number of students in each group who decided against the purchase.
control_not_buy = int((df_control["decision"] == "not buy video").sum())
treatment_not_buy = int((df_treatment["decision"] == "not buy video").sum())

# Proportion not buying per group, rounded to three decimals.
control_p = round(control_not_buy / control_total, 3)
treatment_p = round(treatment_not_buy / treatment_total, 3)

# Observed test statistic: treatment minus control.
p_diff_ob = round(treatment_p - control_p, 3)

# Keep the statistic in a one-row frame so it can be plotted later.
df_p_diff_ob = pd.DataFrame({"p_diff": [p_diff_ob]})
df_p_diff_ob

Unnamed: 0,p_diff
0,0.2


The proportion of students who chose not to buy the video was 20 percentage points higher in the treatment group than the control group.

## Randomization

### Crosstable

Example with only one randomization:

In [85]:
# Shuffle the group labels with a fixed seed (random_state=0) so the result is reproducible.
# reset_index(drop=True) discards the shuffled index; without it the column assignment
# would re-align on the original index and silently undo the shuffle.
df['group_random'] = df['group'].sample(frac=1, random_state=0).reset_index(drop=True)

Since the random assignment of group labels in this simulation is independent of the purchase decisions, any difference in the proportions of students who did not buy the video is due to chance.

In [86]:
# Contingency table of the shuffled group labels vs. decision, with totals ("All").
pd.crosstab(index=df["group_random"], columns=df["decision"], margins=True)

decision,buy video,not buy video,All
group_random,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
control,51,24,75
treatment,46,29,75
All,97,53,150


### Calculate difference

#### One randomization

In [87]:
# Split the observations by their shuffled (randomly re-assigned) group label.
df_control = df[df["group_random"].eq("control")]
df_treatment = df[df["group_random"].eq("treatment")]

# Group sizes under the shuffled labels.
control_total = df_control.shape[0]
treatment_total = df_treatment.shape[0]

# Students per shuffled group who decided against the purchase.
control_not_buy = int(df_control["decision"].eq("not buy video").sum())
treatment_not_buy = int(df_treatment["decision"].eq("not buy video").sum())

# Proportion not buying per shuffled group, rounded to three decimals.
control_p = round(control_not_buy / control_total, 3)
treatment_p = round(treatment_not_buy / treatment_total, 3)

# Difference (treatment minus control) for this single shuffle.
p_diff = round(treatment_p - control_p, 3)

p_diff

0.067

#### Multiple randomizations

In [88]:
# Number of label shuffles used to build the null distribution.
N_SHUFFLES = 1000


def shuffled_not_buy_diff(groups, decisions, seed):
    """Return the treatment-minus-control "not buy video" rate after one label shuffle.

    Parameters
    ----------
    groups : pandas.Series
        Group labels ("control" / "treatment"), one per student.
    decisions : pandas.Series
        Purchase decisions, paired with ``groups`` by position.
    seed : int
        Passed as ``random_state`` to ``Series.sample`` so the shuffle is reproducible.

    Returns
    -------
    float
        Difference in proportions. Each proportion and the difference are rounded
        to three decimals, matching the observed-difference calculation.

    Raises
    ------
    ZeroDivisionError
        If either group label does not occur in ``groups``.
    """
    # Permute the labels; resetting the index pairs them with decisions by position.
    shuffled = groups.sample(frac=1, random_state=seed).reset_index(drop=True)
    not_buy = (decisions == "not buy video").reset_index(drop=True)

    in_control = shuffled == "control"
    in_treatment = shuffled == "treatment"

    # int() keeps plain Python arithmetic so round() behaves like the built-in.
    control_rate = round(int(not_buy[in_control].sum()) / int(in_control.sum()), 3)
    treatment_rate = round(int(not_buy[in_treatment].sum()) / int(in_treatment.sum()), 3)

    return round(treatment_rate - control_rate, 3)


# Build the null distribution: one difference per shuffle, seeded 0..N_SHUFFLES-1.
# The helper works on copies of the columns, so `df` and the variables of the
# observed-data calculation are left untouched and earlier cells stay reproducible.
random_difference = [
    shuffled_not_buy_diff(df['group'], df['decision'], seed)
    for seed in range(N_SHUFFLES)
]

In [89]:
# Collect the simulated differences in a single-column DataFrame for summarising and plotting.
df_random = pd.DataFrame(random_difference, columns=['p_diff'])


In [90]:
# Summary statistics of the simulated differences, transposed so they fit on one row.
df_random.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
p_diff,1000.0,-0.003355,0.081123,-0.253,-0.067,-0.013,0.04,0.28


In [91]:
# Dot plot of the simulated differences: the window transform assigns an `id` to each
# dot within its p_diff value, and that id is used as the (hidden) vertical position.
chart1 = (
    alt.Chart(df_random)
    .mark_circle(size=100)
    .transform_window(id='rank()', groupby=['p_diff'])
    .encode(
        x=alt.X('p_diff:O', title='Differences across 1000 shuffles'),
        y=alt.Y('id:O', axis=None, sort='descending'),
    )
    .properties(height=300, width=400)
)

# The observed difference, drawn in orange with the same encoding so it lines up.
chart2 = (
    alt.Chart(df_p_diff_ob)
    .mark_circle(size=100)
    .transform_window(id='rank()', groupby=['p_diff'])
    .encode(
        x=alt.X('p_diff:O'),
        y=alt.Y('id:O', axis=None, sort='descending'),
        color=alt.value('orange'),
    )
)

# Layer the observed point on top of the simulated distribution.
chart1 + chart2


In [92]:
# Histogram of the simulated differences. The axis title is derived from
# len(df_random) so it always states the number of shuffles actually simulated.
alt.Chart(df_random).mark_bar().encode(
    alt.X('p_diff:Q', bin=True, title=f'Differences across {len(df_random)} shuffles'),
    alt.Y('count()')
).properties(height=300, width=400)

## Result

In [93]:
# Number of simulated differences at least as large as the observed one.
count_diff = (df_random['p_diff'] >= p_diff_ob).sum()

# One-sided p-value: share of shuffles at least as extreme as the observation.
p_value = count_diff / df_random.shape[0]
p_value

0.01

- Under the null hypothesis (no treatment effect), we would observe a difference of at least 20 percentage points about 1% of the time.

That is really rare! We therefore conclude that the data provide strong evidence of a treatment effect: reminding students before a purchase that they could instead spend the money later on something else lowers the chance that they will continue with the purchase. Notice that we are able to make a causal statement for this study since the study is an experiment, although we do not know why the reminder induces a lower purchase rate.

## p-value and statistical significance

- H0: Null hypothesis. Reminding students that they can save money for later purchases will not have any impact on students’ spending decisions.

- HA: Alternative hypothesis. Reminding students that they can save money for later purchases will reduce the chance they will continue with a purchase.

In [94]:
# Re-display the p-value computed in the Result section.
p_value

0.01

In [95]:
def significance(p_value, alpha=0.05):
    """Translate a p-value into a hypothesis-test decision.

    Parameters
    ----------
    p_value : float
        p-value of the test.
    alpha : float, optional
        Significance level; the null hypothesis is rejected when
        ``p_value <= alpha``. Defaults to 0.05.

    Returns
    -------
    str
        "Reject Null Hypothesis" when ``p_value <= alpha``, otherwise
        "Fail to Reject Null Hypothesis".
    """
    if p_value <= alpha:
        return "Reject Null Hypothesis"
    # A large p-value is a lack of evidence against H0, not evidence for it,
    # so the null hypothesis is never "accepted".
    return "Fail to Reject Null Hypothesis"

In [96]:
# Apply the decision rule to the p-value from the randomization test.
significance(p_value)

'Reject Null Hypothesis'