In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, sys
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import spearmanr
import matplotlib.dates as mdates
from matplotlib.colors import LinearSegmentedColormap
from datetime import datetime, timedelta
import seaborn as sns


# Add ../utils/ to the module search path so the two imports below resolve.
sys.path.append('../utils/')

# NOTE(review): star-imports hide where names come from. `run_analysis`, which
# compute_stats calls later in this notebook, is not defined in any visible
# cell, so it presumably comes from one of these modules — confirm and
# consider importing it explicitly.
from utils import *
from analysis_utils import *


"""
Descriptive analyses to run: 
- correlation
- t-test
- R^2
"""

'\nDescriptive analyses to run: \n- correlation\n- t-test\n- R^2\n'

In [8]:
# --- Run configuration -------------------------------------------------------
# NOTE(review): the outputs saved in this notebook (weekly `ds` values in the
# frame preview, and an R^2 table without any ksu_* rows) look like they were
# produced with a `units` value other than 'months' — restart and re-run to
# confirm the outputs match this setting.
units = 'months'
prefix = '../data/prepared/merged/'

# Analysis toggles; neither is referenced by the cells visible in this section.
do_primary = True
do_controls = False

# Merged series for the selected unit, read from <prefix>merged_<units>.csv.
normalized = pd.read_csv('{}merged_{}.csv'.format(prefix, units))

# Passed positionally to run_analysis inside compute_stats; left empty here.
non_add_L_set = []


In [9]:
# Show the merged frame loaded above; the rendered preview is truncated by pandas.
normalized

Unnamed: 0,ds,tgc,wth,cowspiracy,fok,okja,yawye,sustainability,animal_welfare,climate,...,Month_Dummy3,Month_Dummy4,Month_Dummy5,Month_Dummy6,Month_Dummy7,Month_Dummy8,Month_Dummy9,Month_Dummy10,Month_Dummy11,Month_Dummy12
0,2004-01-03,0.00,0.00,0.00,0.00,0.0,0.00,51.20,8.64,237.5,...,0,0,0,0,0,0,0,0,0,0
1,2004-01-10,0.00,0.00,0.00,0.00,0.0,0.00,58.88,18.24,262.5,...,0,0,0,0,0,0,0,0,0,0
2,2004-01-17,0.00,0.00,0.00,0.00,0.0,0.00,69.12,14.40,262.5,...,0,0,0,0,0,0,0,0,0,0
3,2004-01-24,0.00,0.00,0.00,0.00,0.0,0.00,64.00,20.64,237.5,...,0,0,0,0,0,0,0,0,0,0
4,2004-01-31,0.00,0.00,0.00,0.00,0.0,0.00,67.84,15.36,250.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1060,2024-04-27,1.47,0.84,0.00,2.43,0.0,1.35,96.00,8.88,287.5,...,0,1,0,0,0,0,0,0,0,0
1061,2024-05-04,0.98,0.84,0.00,2.16,0.0,1.08,87.04,7.68,300.0,...,0,0,1,0,0,0,0,0,0,0
1062,2024-05-11,0.98,0.00,0.24,2.16,0.0,0.81,79.36,8.88,300.0,...,0,0,1,0,0,0,0,0,0,0
1063,2024-05-18,1.47,0.00,0.00,2.97,0.0,0.54,71.68,6.72,237.5,...,0,0,1,0,0,0,0,0,0,0


In [10]:
# Disabled scratch check: rows where both `ds` and `NeuhoferLusk` are non-missing.
#normalized[['ds', 'NeuhoferLusk']].dropna()

In [11]:
# Columns used as the 'C' entries in the "docs" specifications, in the order
# they are handed to run_analysis.
_DOC_COLUMNS = ('cowspiracy', 'okja', 'wth', 'tgc', 'fok', 'yawye')


def _run_spec(outcome, with_docs, include_month, difference):
    """Run one run_analysis specification and return its fitted-result object.

    Reads the module-level ``normalized`` frame and ``non_add_L_set`` list.

    Parameters
    ----------
    outcome : str
        Name placed in the 'Y' slot of the analysis dict.
    with_docs : bool
        When True, the 'C' slot holds one ``(column, [0])`` pair per entry of
        ``_DOC_COLUMNS``; otherwise it is an empty list.
        (The ``[0]`` is presumably a lag list — confirm against run_analysis.)
    include_month : bool
        Forwarded to run_analysis as ``include_month``.
    difference : bool
        Forwarded to run_analysis as ``difference``.

    Returns
    -------
    The fourth element of run_analysis's return value.
    """
    # Build a fresh list on every call so no two specifications share one object.
    controls = [(column, [0]) for column in _DOC_COLUMNS] if with_docs else []
    analysis_dct = {'Y': (outcome, []), 'C': controls}
    trt_lag = 0

    _, _, _, res = run_analysis(analysis_dct, normalized, trt_lag,
                                non_add_L_set,
                                PS=False,
                                PS_logistic=False,
                                fit_method='GLSAR',
                                add_L=False,
                                difference=difference,
                                include_time=False,
                                include_month=include_month,
                                normalize=True,
                                verbose=False)
    return res


def compute_stats(outcome, difference=True):
    """Return ``rsquared_adj`` for three specifications of one outcome.

    The three run_analysis calls differ only in their controls:

    1. month only   -- ``include_month=True``, no 'C' entries
    2. month + docs -- ``include_month=True``, 'C' = the six docs columns
    3. docs only    -- ``include_month=False``, 'C' = the six docs columns

    Parameters
    ----------
    outcome : str
        Outcome column name, used as the 'Y' entry.
    difference : bool, default True
        Forwarded unchanged to every run_analysis call.

    Returns
    -------
    tuple
        ``(month_only, month_plus_docs, docs_only)`` values of the
        ``rsquared_adj`` attribute of each fitted result.
    """
    res_month_only = _run_spec(outcome, with_docs=False, include_month=True,
                               difference=difference)
    res_month_docs = _run_spec(outcome, with_docs=True, include_month=True,
                               difference=difference)
    res_docs_only = _run_spec(outcome, with_docs=True, include_month=False,
                              difference=difference)

    return (res_month_only.rsquared_adj,
            res_month_docs.rsquared_adj,
            res_docs_only.rsquared_adj)

In [12]:
# Adjusted R^2 per outcome. The three printed values follow compute_stats's
# return order: month only, month + docs, docs only.

# Outcomes used for every time unit, in print order.
outcomes = ['plant_based_plus_plant_based', 'vegan', 'vegetarian',
            'StewartMilk', 'StewartPBMilk', 'Zhao']

# The ksu_* outcomes are only included when running on monthly data.
if units == 'months':
    outcomes += ['ksu_chicken', 'ksu_beef', 'ksu_pork']

outcomes += ['plant_based_informative', 'vegan_informative', 'vegetarian_informative',
             'plant_based_behavior', 'vegan_behavior', 'vegetarian_behavior']

for outcome in outcomes:
    r21, r22, r23 = compute_stats(outcome, difference=True)
    print(outcome,
          '{r1:0.2f}, {r2:0.2f}, {r3:0.2f}'.format(r1=r21, r2=r22, r3=r23))

plant_based_plus_plant_based 0.01, 0.43, 0.42
vegan 0.01, 0.04, 0.04
vegetarian 0.10, 0.12, 0.03
StewartMilk 0.19, 0.18, -0.01
StewartPBMilk 0.26, 0.25, 0.01
Zhao 0.07, 0.06, -0.01
plant_based_informative 0.01, 0.39, 0.38
vegan_informative 0.00, 0.02, 0.02
vegetarian_informative 0.05, 0.06, 0.02
plant_based_behavior 0.03, 0.50, 0.49
vegan_behavior 0.06, 0.11, 0.06
vegetarian_behavior 0.13, 0.14, 0.03
