In [49]:
import pandas as pd

In [50]:
df = pd.read_csv('data.csv')

In [51]:
df

Unnamed: 0,treatment,video,song_rank,video_length,day_posted,time_posted,time_data_collected,views,likes,comments,favorites,shared,end_of_experiment_date_time,final_views,final_likes,final_comments,final_favorites,final_shared
0,C,Video #17,,0:06:00,2/27/2024,4:00:00 PM,5:00:00 PM,597,13,0,0,0,3/4/2024 20:00,610,14,0,0,0
1,C,Video #18,,0:07:00,2/27/2024,5:00:00 PM,6:00:00 PM,364,6,0,0,0,3/4/2024 20:00,585,11,0,0,0
2,T,Video #13,8.0,0:05:00,2/27/2024,7:00:00 PM,8:00:00 PM,2,0,0,0,0,3/4/2024 20:00,8,0,0,0,0
3,T,Video #6,3.0,0:06:00,2/27/2024,8:00:00 PM,9:00:00 PM,53,3,0,0,0,3/4/2024 20:00,551,25,0,0,0
4,T,Video #10,2.0,0:16:00,2/27/2024,9:00:00 PM,10:00:00 PM,366,25,0,0,0,3/4/2024 20:00,575,40,2,0,0
5,T,Video #30,4.0,0:15:00,2/28/2024,8:00:00 AM,9:00:00 AM,259,4,0,0,0,3/4/2024 20:00,602,16,0,0,0
6,C,Video #27,,0:05:00,2/28/2024,9:00:00 AM,10:00:00 AM,0,0,0,0,0,3/4/2024 20:00,7,0,0,0,0
7,C,Video #5,,0:05:00,2/28/2024,10:00:00 AM,11:00:00 AM,0,0,0,0,0,3/4/2024 20:00,3,0,0,0,0
8,T,Video #9,7.0,0:06:00,2/28/2024,11:00:00 AM,12:00:00 PM,0,0,0,0,0,3/4/2024 20:00,6,0,0,0,0
9,C,Video #23,,0:09:00,2/28/2024,12:00:00 PM,1:00:00 PM,0,0,0,0,0,3/4/2024 20:00,5,0,0,0,0


In [52]:
df.isnull().sum()

treatment                       0
video                           0
song_rank                      30
video_length                    0
day_posted                      0
time_posted                     0
time_data_collected             0
views                           0
likes                           0
comments                        0
favorites                       0
shared                          0
end_of_experiment_date_time     0
final_views                     0
final_likes                     0
final_comments                  0
final_favorites                 0
final_shared                    0
dtype: int64

In [53]:
df.dtypes

treatment                       object
video                           object
song_rank                      float64
video_length                    object
day_posted                      object
time_posted                     object
time_data_collected             object
views                            int64
likes                            int64
comments                         int64
favorites                        int64
shared                           int64
end_of_experiment_date_time     object
final_views                      int64
final_likes                      int64
final_comments                   int64
final_favorites                  int64
final_shared                     int64
dtype: object

In [54]:
# df['day_posted'] = df['day_posted'].astype('category')
df['day_posted'] = pd.to_datetime(df['day_posted'])
df['time_posted'] = pd.to_datetime(df['time_posted']).dt.hour
df['time_data_collected'] = pd.to_datetime(df['time_data_collected'])
df['end_of_experiment_date_time'] = pd.to_datetime(df['end_of_experiment_date_time'])

# Check the data types after conversion
print(df.dtypes)


treatment                              object
video                                  object
song_rank                             float64
video_length                           object
day_posted                     datetime64[ns]
time_posted                             int64
time_data_collected            datetime64[ns]
views                                   int64
likes                                   int64
comments                                int64
favorites                               int64
shared                                  int64
end_of_experiment_date_time    datetime64[ns]
final_views                             int64
final_likes                             int64
final_comments                          int64
final_favorites                         int64
final_shared                            int64
dtype: object


In [55]:
# ATE hat of the treatment on views after an hour of posting
atehat = df.loc[df['treatment'] == 'T', 'views'].mean() - df.loc[df['treatment'] == 'C', 'views'].mean()

print(f"The estimate of ATE is {atehat:.4f}")

The estimate of ATE is -9.1333


In [56]:
# Does treatment cause more views? Is there any statistical sifnificance?
from scipy.stats import ttest_ind

tstat, pvalue = ttest_ind(df.loc[df['treatment'] == 'T', 'views'],
                          df.loc[df['treatment'] == 'C', 'views'],
                          alternative='two-sided', equal_var=False)

print(f"t-score (t): {tstat:.4f}")
print(f"P-value (p): {pvalue:.4e}")
print(f"Should we reject H0? {pvalue < 0.05}")

t-score (t): -0.3359
P-value (p): 7.3841e-01
Should we reject H0? False


In [57]:
# Cohen's d
cohens_d = atehat / df["views"].std()
print(f"Cohen's d is {cohens_d:.4f}.")

Cohen's d is -0.0874.


In [58]:
# Power of the experiment
from pingouin import power_ttest2n

power = power_ttest2n(nx = len(df[df["treatment"] == 'T']), ny = len(df[df["treatment"] == 'C']),
                      d = cohens_d, alpha = 0.05, alternative = 'two-sided')

print(f"The power is {power:.4f}.")

The power is 0.0628.


In [63]:
from pyfixest.estimation import feols
from pyfixest.utils import get_data
from pyfixest.summarize import etable

# Regression of treatment on views fixing for day and time posted
reg_views = feols('views ~ treatment | day_posted + time_posted', data = df).vcov('hetero')

etable([reg_views])

                            est1
--------------  ----------------
depvar                     views
--------------------------------
treatment[T.T]  -38.034 (21.019)
--------------------------------
time_posted                    x
day_posted                     x
--------------------------------
R2                         0.674
S.E. type                 hetero
Observations                  60
--------------------------------
Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001
Format of coefficient cell:
Coefficient (Std. Error)


Please use 'pyfixest.etable' instead. You may refer the updated documentation at: https://s3alfisc.github.io/pyfixest/quickstart.html
  etable([reg_views])


In [None]:
# balance check
# do reg to check for diffrences in treatment snd control for any covariate

In [64]:
# proportions_ztest function to check whether the randomization proportion was intended

from statsmodels.stats.proportion import proportions_ztest

n_treated = df.treatment.sum()
n = df.shape[0]
_, p_val = proportions_ztest(n_treated, n, value = .8)
print(f"The p-value for the proportions test is {p_val:.2} so we fail to reject the null hypothesis of proper randomization")

UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('int32'), dtype('<U60')) -> None