In [0]:
import boto3
import datetime as dt
import json
import numpy as np
import pandas as pd

import snowflake.connector
# Notebook-wide pandas display settings: allow up to 1000 rows and 1000 columns
# in rendered frames, and format floats with thousands separators and two decimals.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.options.display.float_format = '{:,.2f}'.format

In [0]:
from abc import ABCMeta, abstractmethod

class Credentials(metaclass=ABCMeta):
    """Abstract source of connection secrets.

    SnowflakeConnector calls ``get_keys()`` on whatever ``Credentials`` object it
    receives, so the method is declared here as part of the contract instead of
    being left implicit.
    """

    @abstractmethod
    def get_keys(self):
        """Return the raw secret payload.

        SnowflakeConnector reads the result with ``.get('SecretString', "{}")``,
        so implementations should return a mapping-like object.
        """
        raise NotImplementedError
    
    
class SSMPSCredentials(Credentials):
    """Credentials read from AWS Secrets Manager through boto3."""

    def __init__(self, secretid: str, region_name: str = 'us-east-1'):
        """Remember which secret to fetch and from which region.

        Args:
            secretid: value passed as ``SecretId`` to ``get_secret_value``.
            region_name: AWS region of the Secrets Manager client; defaults to
                the previously hard-coded ``'us-east-1'``.
        """
        self._secretid = secretid
        self._region_name = region_name
        # Kept for compatibility; nothing in the visible code reads this attribute.
        self._secrets = {}

    def get_keys(self):
        """Fetch the secret and return the full ``get_secret_value`` response.

        A new boto3 client is created on every call; the response is returned
        unmodified (callers decode ``SecretString`` themselves).
        """
        secrets_client = boto3.client(
            service_name='secretsmanager',
            region_name=self._region_name,
        )
        return secrets_client.get_secret_value(SecretId=self._secretid)
    
    
class BaseConnector(metaclass=ABCMeta):
    """Abstract base for connector classes; subclasses must provide ``connect``."""

    @abstractmethod
    def connect(self):
        """Open a connection. The base implementation always raises."""
        raise NotImplementedError()
    

class SnowflakeConnector(BaseConnector):
    """Opens Snowflake connections using secrets supplied by a Credentials object."""

    def __init__(self, credentials: Credentials):
        """Fetch the secret payload once and decode its JSON ``SecretString``.

        When the payload has no ``SecretString`` entry the decoded secrets are an
        empty dict.
        """
        payload = credentials.get_keys()
        secret_json = payload.get('SecretString', "{}")
        self._secrets = json.loads(secret_json)

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        """Return a new connection from ``snowflake.connector.connect``.

        User, password, account and warehouse come from the decoded secrets
        (a missing key raises ``KeyError``); database and schema come from the
        arguments.
        """
        secrets = self._secrets
        return snowflake.connector.connect(
            user=secrets['login_name'],
            password=secrets['login_password'],
            account=secrets['account'],
            warehouse=secrets['warehouse'],
            database=dbname,
            schema=schema,
        )
    
## Credentials
# Secret id handed to SSMPSCredentials, which passes it as SecretId to Secrets Manager.
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection 
# Running this cell fetches the secret and opens a connection to database "MAX_DEV",
# schema "WORKSPACE". `ctx` is the module-level connection that run_query uses.
conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
ctx=conn.connect("MAX_DEV","WORKSPACE")

def run_query(query):
    """Execute ``query`` on the shared connection ``ctx`` and return the rows.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        A DataFrame holding every fetched row, with column names taken from the
        cursor description and lower-cased.
    """
    cursor = ctx.cursor()
    try:
        cursor.execute(query)
        column_names = [desc[0].lower() for desc in cursor.description]
        return pd.DataFrame(cursor.fetchall(), columns=column_names)
    finally:
        # Release the cursor even when execute/fetch raises; previously it was never closed.
        cursor.close()

In [0]:
# NOTE(review): `read_from_s3` is not defined or imported in any visible cell, so this
# cell raises NameError on a fresh kernel — add its definition/import before this point.
# `data` feeds the causal analysis below; `data_ca_title` is not referenced again in
# the visible cells.
data = read_from_s3('Cannibalization Analysis.csv')
data_ca_title = read_from_s3('cannibalization_analysis_detailed_view.csv')

# Causal

In [0]:
!pip install cdt
!pip install torch

In [0]:
import cdt
import networkx as nx
import matplotlib.pyplot as plt

In [0]:
# Replace missing medal_number values with 3, in place on `data`.
# NOTE(review): 3 presumably encodes the tier below Silver (2) — confirm the encoding.
data['medal_number'] = data['medal_number'].fillna(3)

In [0]:
# Restrict the working frame to the four columns used by the causal analysis.
analysis_columns = ['ahvr', 'medal_Platinum_count', 'medal_same_count', 'medal_number']
df = data[analysis_columns]

In [0]:
# Per-tier mean of medal_same_count, broadcast back onto every row.
# transform('mean') replaces the earlier groupby + merge: re-running the merge version
# added suffixed `_x`/`_y` copies of the average column and then failed with KeyError,
# whereas this cell simply overwrites the same two columns each time it runs.
# reset_index(drop=True) keeps the fresh 0..n-1 index the merge used to produce.
df = df.assign(
    medal_same_count_avg=df.groupby('medal_number')['medal_same_count'].transform('mean')
).reset_index(drop=True)
# 1 when the row's count is strictly above its tier average, else 0.
df['medal_same_count_bigger_than_average'] = (
    df['medal_same_count'] > df['medal_same_count_avg']
).astype(int)

In [0]:
# Per-tier mean of medal_Platinum_count, broadcast back onto every row.
# transform('mean') replaces the earlier groupby + merge: re-running the merge version
# added suffixed `_x`/`_y` copies of the average column and then failed with KeyError,
# whereas this cell simply overwrites the same two columns each time it runs.
# reset_index(drop=True) keeps the fresh 0..n-1 index the merge used to produce.
df = df.assign(
    medal_Platinum_count_avg=df.groupby('medal_number')['medal_Platinum_count'].transform('mean')
).reset_index(drop=True)
# 1 when the row's count is strictly above its tier average, else 0.
# This indicator is the treatment variable in the models below.
df['medal_Platinum_count_bigger_than_average'] = (
    df['medal_Platinum_count'] > df['medal_Platinum_count_avg']
).astype(int)

In [0]:
# ### Setting different threshold
# df['medal_Platinum_count_avg'] = 0.98
# df['medal_Platinum_count_bigger_than_average'] = 0
# df.loc[df['medal_Platinum_count'] > df['medal_Platinum_count_avg'], 'medal_Platinum_count_bigger_than_average'] = 1

In [0]:
# Show the last rows of the working frame, including the derived average and indicator columns.
df.tail()

In [0]:
# Get skeleton graph
# Glasso is fitted on every column currently in `df`, which includes the derived
# *_avg and *_bigger_than_average columns, not only the four original inputs.
# initialize graph lasso
glasso = cdt.independence.graph.Glasso()
# apply graph lasso to data
skeleton = glasso.predict(df)
# visualize network
fig = plt.figure(figsize=(15,10))
nx.draw_networkx(skeleton, font_size=18, font_color='r')

In [0]:
# # Use causal discovery to get causal models
# # PC algorithm
# model_pc = cdt.causality.graph.PC()
# graph_pc = model_pc.predict(df, skeleton)

# # visualize network
# fig=plt.figure(figsize=(15,10))
# nx.draw_networkx(graph_pc, font_size=18, font_color='r')

In [0]:
!pip install econml
!pip install dowhy

In [0]:
import econml
import dowhy
from dowhy import CausalModel

In [0]:
# List the column names available for the causal graph defined in the next cell.
df.columns

In [0]:
# Hand-specified causal DAG: the tier code drives the outcome and both raw counts,
# each raw count drives its above-average indicator, and both indicators drive the outcome.
G = nx.DiGraph()
G.add_nodes_from(['ahvr', 'medal_Platinum_count', 'medal_same_count', 'medal_number'])

causal_edges = [
    ("medal_number", "ahvr"),
    ("medal_number", "medal_same_count"),
    ("medal_number", "medal_Platinum_count"),
    ("medal_same_count", "medal_same_count_bigger_than_average"),
    ("medal_Platinum_count", "medal_Platinum_count_bigger_than_average"),
    ("medal_same_count_bigger_than_average", "ahvr"),
    ("medal_Platinum_count_bigger_than_average", "ahvr"),
]
# The two indicator nodes are not listed above; add_edges_from creates them.
G.add_edges_from(causal_edges)
# G.add_edges_from([
#                   ("medal_number", "ahvr"), 
# #                   ("medal_number", "medal_same_count"), 
# #                   ("medal_number", "medal_Platinum_count"), 
# #                   ("medal_same_count", "medal_same_count_bigger_than_average"), 
# #                   ("medal_Platinum_count", "medal_Platinum_count_bigger_than_average"), 
#                   ("medal_same_count_bigger_than_average", "ahvr"), 
#                   ("medal_Platinum_count_bigger_than_average", "ahvr")
#                 ])

In [0]:
# Serialise the DAG to a single GML string (the generated lines are concatenated
# with no separator); this string is what the CausalModel cells pass as `graph`.
graph = ''.join(nx.generate_gml(G))

In [0]:
# Causal model over all tiers, using the hand-specified graph.
model = CausalModel(
    data=df,
    treatment="medal_Platinum_count_bigger_than_average",
    outcome="ahvr",
    graph=graph,
)

In [0]:
# Render the causal graph attached to the current `model`.
model.view_model() 

In [0]:
# Generate estimand
# NOTE(review): proceed_when_unidentifiable=True presumably lets identification continue
# without prompting when unobserved confounding cannot be ruled out — confirm in dowhy docs.
identified_estimand= model.identify_effect(proceed_when_unidentifiable=True)
print(identified_estimand)

In [0]:
import warnings
# No category or module filter is given, so this suppresses every warning for the
# rest of the kernel session, not only those raised by the estimators below.
warnings.filterwarnings('ignore')

In [0]:
# Compute causal effect using metalearner
identified_estimand_experiment = model.identify_effect(proceed_when_unidentifiable=True)

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import TweedieRegressor

# T-learner with a random forest as the outcome model. The forest is seeded so the
# estimate is the same on every run; previously it changed with each execution.
metalearner_estimate = model.estimate_effect(
    identified_estimand_experiment,
    method_name="backdoor.econml.metalearners.TLearner",
    confidence_intervals=False,
    method_params={
        "init_params": {'models': RandomForestRegressor(random_state=42)},
        "fit_params": {},
    },
)
print(metalearner_estimate)

In [0]:
# Print histogram of causal effects
# Labels make the figure readable on its own; the trailing `;` hides the
# (counts, bins, patches) tuple that plt.title would otherwise echo.
plt.hist(metalearner_estimate.cate_estimates)
plt.xlabel('Estimated effect on ahvr')
plt.ylabel('Count')
plt.title('T-learner CATE estimates');

## By Tier

### Platinum

In [0]:
# Causal model restricted to the rows with medal_number == 0 (Platinum section).
model = CausalModel(
    data=df.loc[df['medal_number'] == 0],
    treatment="medal_Platinum_count_bigger_than_average",
    outcome="ahvr",
    graph=graph,
)

In [0]:
# Re-identify the estimand on the current `model`, then fit a T-learner whose
# outcome model is a TweedieRegressor (the other tier cells use a random forest).
identified_estimand_experiment = model.identify_effect(proceed_when_unidentifiable=True)

tlearner_params = {
    "init_params": {'models': TweedieRegressor()},
    "fit_params": {},
}
metalearner_estimate = model.estimate_effect(
    identified_estimand_experiment,
    method_name="backdoor.econml.metalearners.TLearner",
    confidence_intervals=False,
    method_params=tlearner_params,
)
print(metalearner_estimate)

In [0]:
# Print histogram of causal effects
# Labels make the figure readable on its own; the trailing `;` hides the
# (counts, bins, patches) tuple that plt.title would otherwise echo.
plt.hist(metalearner_estimate.cate_estimates)
plt.xlabel('Estimated effect on ahvr')
plt.ylabel('Count')
plt.title('T-learner CATE estimates (Platinum)');

### Gold

In [0]:
# Rows with medal_number == 1 (Gold section). .copy() makes df_gold an independent
# frame, so the column assignment below does not write to a slice of `df`
# (which raised SettingWithCopyWarning).
df_gold = df[df['medal_number'] == 1].copy()
# df_gold['medal_same_count_avg'] = 3 # 2.78
# Recompute the treatment indicator from the (possibly overridden) average column:
# 1 when the count is strictly above the average, else 0.
df_gold['medal_Platinum_count_bigger_than_average'] = (
    df_gold['medal_Platinum_count'] > df_gold['medal_Platinum_count_avg']
).astype(int)

In [0]:
# Show the first rows of the Gold subset to check the recomputed indicator.
df_gold.head()

In [0]:
# Causal model fitted on the Gold subset built above.
model = CausalModel(
    data=df_gold,
    treatment="medal_Platinum_count_bigger_than_average",
    outcome="ahvr",
    graph=graph,
)

In [0]:
# Re-identify the estimand on the current `model`, then fit a T-learner with a
# random forest as the outcome model. The forest is seeded so the estimate is the
# same on every run; previously it changed with each execution.
identified_estimand_experiment = model.identify_effect(proceed_when_unidentifiable=True)

metalearner_estimate = model.estimate_effect(
    identified_estimand_experiment,
    method_name="backdoor.econml.metalearners.TLearner",
    confidence_intervals=False,
    method_params={
        "init_params": {'models': RandomForestRegressor(random_state=42)},
        "fit_params": {},
    },
)
print(metalearner_estimate)

### Silver

In [0]:
# Causal model restricted to the rows with medal_number == 2 (Silver section).
model = CausalModel(
    data=df.loc[df['medal_number'] == 2],
    treatment="medal_Platinum_count_bigger_than_average",
    outcome="ahvr",
    graph=graph,
)

In [0]:
# Show the first rows of the medal_number == 2 subset used by the Silver model.
df[df['medal_number'] == 2].head()

In [0]:
# Re-identify the estimand on the current `model`, then fit a T-learner with a
# random forest as the outcome model. The forest is seeded so the estimate is the
# same on every run; previously it changed with each execution.
identified_estimand_experiment = model.identify_effect(proceed_when_unidentifiable=True)

metalearner_estimate = model.estimate_effect(
    identified_estimand_experiment,
    method_name="backdoor.econml.metalearners.TLearner",
    confidence_intervals=False,
    method_params={
        "init_params": {'models': RandomForestRegressor(random_state=42)},
        "fit_params": {},
    },
)
print(metalearner_estimate)

### Bronze

In [0]:
# With graph
# NOTE(review): this cell sits under the "Bronze" heading, but `>= 2` also keeps the
# medal_number == 2 rows that the Silver section models on their own — confirm whether
# the filter is meant to select only the remaining tier.
model=CausalModel(
        data = df[df['medal_number'] >= 2],
        treatment="medal_Platinum_count_bigger_than_average",
        outcome="ahvr",
        graph = graph
        )

In [0]:
# Re-identify the estimand on the current `model`, then fit a T-learner with a
# random forest as the outcome model. The forest is seeded so the estimate is the
# same on every run; previously it changed with each execution.
identified_estimand_experiment = model.identify_effect(proceed_when_unidentifiable=True)

metalearner_estimate = model.estimate_effect(
    identified_estimand_experiment,
    method_name="backdoor.econml.metalearners.TLearner",
    confidence_intervals=False,
    method_params={
        "init_params": {'models': RandomForestRegressor(random_state=42)},
        "fit_params": {},
    },
)
print(metalearner_estimate)

In [0]:
# !pip install causalinference

In [0]:
# Imported under an alias: a bare `CausalModel` here would replace dowhy's CausalModel
# in the kernel namespace and break any earlier model cell that is re-run afterwards.
from causalinference import CausalModel as CICausalModel
from causalinference.utils import random_data
#Y is the outcome, D is treatment status, and X is the independent variable
# The treatment indicator is passed only as D. It was previously also listed among
# the covariates, which duplicates the treatment column in the OLS design matrix.
covariate_columns = [
    'medal_number',
    'medal_same_count_bigger_than_average',
    'medal_Platinum_count',
    'medal_same_count',
]
# Plain NumPy arrays are passed so nothing depends on pandas index alignment.
causal = CICausalModel(
    df['ahvr'].to_numpy(),
    df['medal_Platinum_count_bigger_than_average'].to_numpy(),
    df[covariate_columns].to_numpy(),
)

In [0]:
# Print the summary statistics computed by the causalinference model.
print(causal.summary_stats)

In [0]:
# Estimate treatment effects by OLS; est_via_ols stores its results on the model,
# which are then printed from `causal.estimates`.
causal.est_via_ols()
print(causal.estimates)

In [0]:
# ATE, ATC, and ATT stand for Average Treatment Effect, Average Treatment Effect on the Controls, and Average Treatment Effect on the Treated, respectively.
# Using this information, we can assess whether the treatment has an effect compared to the control.