In [1]:
#there's lots of options for models including agnostic but we'll start with NN

#https://shap.readthedocs.io/en/latest/


#SHAPE MAKES THE COEFFICIENT CORRELATION evident
#causal when feature analyzing is strongly independent from others (i.e. why we matched + hope get unobserved)


"""
Look into double ML (CausalML)
- will fail to capture indirect casual effects if control for downstream features caused by feature of interest
"""

"""
Unfortunately, we often don’t know the true causal graph so it can be hard to know when another feature is redundant with our feature of interest because of observed confounding vs. non-confounding redundancy. If it is because of confounding then we should control for that feature using a method like double ML, whereas if it is a downstream consequence then we should drop the feature from our model if we want full causal effects rather than only direct effects. Controlling for a feature we shouldn’t tends to hide or split up causal effects, while failing to control for a feature we should have controlled for tends to infer causal effects that do not exist. This generally makes controlling for a feature the safer option when you are uncertain.

"""
#controlling for feature is safer; confounding -> control, downstream conseqeunce -> drop
#this is actually good for us to know. Our base data is all downstream which could be a problem (meaning we should also try dropping that info from dataset)
#this means we should run it both with and without base data

###DoubleMLIRM

'\nUnfortunately, we often don’t know the true causal graph so it can be hard to know when another feature is redundant with our feature of interest because of observed confounding vs. non-confounding redundancy. If it is because of confounding then we should control for that feature using a method like double ML, whereas if it is a downstream consequence then we should drop the feature from our model if we want full causal effects rather than only direct effects. Controlling for a feature we shouldn’t tends to hide or split up causal effects, while failing to control for a feature we should have controlled for tends to infer causal effects that do not exist. This generally makes controlling for a feature the safer option when you are uncertain.\n\n'

# Old Data Loading (TODO - move into one file they can all use)

In [2]:
old_path = "../../Dataset Expansion/raw_data"
import os 
import json 

REDDITOR_FILE = os.path.join(old_path, "redditor_dict.json")
SUBMISSION_FILE = os.path.join(old_path, "submission_dict.json")
COMMENT_FILE = os.path.join(old_path, "comment_dict.json")

def save_get_json(file_name):
    if (os.path.exists(file_name)):
        with open(file_name, 'r') as f:
            return json.load(f)
    return dict()

#TODO - comment back in - it's just easier this way

redditor_dict = save_get_json(REDDITOR_FILE)
submission_dict = save_get_json(SUBMISSION_FILE)
comment_dict = save_get_json(COMMENT_FILE)
assert(all([(len(i) !=0 for i in [redditor_dict, submission_dict, comment_dict])]))

In [3]:
from datetime import datetime

# import CONTROL_DATE, FIRST_CONSIDER_DATE from "../Dataset Expansion/feature_creation_corrected.ipynb"
# from ..DatasetExpansion/feature_creation_corrected import CONTROL_DATE, FIRST_CONSIDER_DATE
CONTROL_DATE = datetime(2023,11,22)
FIRST_CONSIDER_DATE = datetime(2023, 11-3,22)

# CONTROL_DATE = datetime(2023,11,22)

treatment_group = ['MoosieGoose', 'JollyK9', 'Southern_Ad3032', 'bduwowy272habbw', 'Late_Introduction203', 'kapster68', 'TheApertureMonkey', 'talemoon22', 'sebagolindenwald', 'spicyranchplzz', 'TheFloorMayBeLava_02', 'rxtten_flesh', 'greenblooded395', 'greenblooded395', 'DrakenJosh98', 'WhichUsernameIsBest', 'FStahp2', 'Pongpianskul', 'Kanashimi515', 'eviuwu', 'Kattheloner_22', 'Reeze2911', 'Sac20000', 'RanpoWasTaken', 'jlynny1811', 'Playful-Fail4778', 'GarageOk8109', 'katandcats', 'holyredemption', 'jifpeanutbutter420', 'Timely_Inflation1000', 'Erica_Peanut']

# control_group = ['utroi', 'Evening-Management-7', 'kendragibs', 'throwaway8194122', 'Intelligent-Risk9885', 'jojaques', 'control_burn', 'MetroL7', 'abcdmetalhead', 'AdPsychological9510', 'lola4274', 'iuanaj', 'Wide-Bridge-6461', 'Wide-Bridge-6461', '14Boogie', 'ElthonJzohn_997', 'Am3l1a____', 'Oz_a_day', 'OkWrangler1500', 'prettyp0thead', 'miamihausjunkie', 'ElthonJzohn_997', 'Awkward_Reindeer_788', 'SpendLate5545', 'ace2212', 'Maggiethedogie', 'CommunicationDue6753', 'whale_omelette94', 'IndependentInternet3', 'Yeetenanny', 'EARTHISLIFENOMARS', 'ElizabethWolf1214']
control_group = ['utroi', 'xtheboard', 'redman334', 'Evening-Management-7', 'Althe456', 'Super-Breadfruit2894', 'kendragibs', 'wutangis4thedaisy', 'chaoscoateddisaster', 'throwaway8194122', 'PolymerLilac', 'Mysterious_Writer655', 'Intelligent-Risk9885', 'Dull_Check8184', 'Hoommann', 'jojaques', 'jorjacw', 'phravalmom', 'control_burn', 'PeteHealy', 'No-Kaleidoscope9305', 'MetroL7', 'stonks-n-concrete', 'lipglossgirl18', 'abcdmetalhead', 'ConversationUseful', 'ImThatOneStudent', 'AdPsychological9510', 'GibbyTime24', 'cremehoneysky', 'lola4274', 'amoore1501', 'DuckLoveless', 'iuanaj', 'Lisalis9', 'Nanimon22', 'Wide-Bridge-6461', 'achlys19', 'weimaranercollie', 'Wide-Bridge-6461', 'achlys19', 'weimaranercollie', '14Boogie', 'WanderingWindow', 'wigwamtree', 'ElthonJzohn_997', 'CulturalMarsupial89', 'Cosplay_bird168', 'Am3l1a____', 'Ok-Pianist-6844', 'Fun_Sir5658', 'Oz_a_day', 'OMG2Reddit', 'jt2424', 'OkWrangler1500', 'Illustrious-Onion-42', 'Commades', 'prettyp0thead', 'niversescribbles', 'Willofthesouth', 'miamihausjunkie', '8109NZ814', 'sikalpi', 'ElthonJzohn_997', 'Awkward_Reindeer_788', 'CulturalMarsupial89', 'Awkward_Reindeer_788', 'Remote_Membership241', 'ElthonJzohn_997', 'SpendLate5545', 'vampirocitadino', 'Uncle-Thor', 'ace2212', 'SuperNintdoChalmers', 'Timpelgrim', 'Maggiethedogie', 'iuanaj', 'Lisalis9', 'CommunicationDue6753', 'MoekindoSama', 'Temporary_Stay1666', 'whale_omelette94', 'gaypornalt2174', 'anonheartthrob', 'IndependentInternet3', 'Evening-Management-7', 'Mean_Ideal_7504', 'Yeetenanny', 'Electric_Spongebob', 'TangledGoatsucker', 'EARTHISLIFENOMARS', 'Anthony__Alexander', 'RogueRedditerr', 'ElizabethWolf1214', 'weimaranercollie', 'achlys19']
# control_group =  ['Theghostofsabotage', 'ThrowRAcottoncandyy', 'sikubis', 'catiegirl74', 'No_Break_4303', 'MyWhatBigEyesIHave', 'JusticeBeevr', 'ElGranTocho', 'Apuksl', 'Freddybear480', 'Charlie2905', 'FuzzySign278', 'bitteroldbird', 'earlylife_crisis_', 'Notaprumber', 'phuckme2', 'Keiichan_', 'TheSeraphman', 'YearOfTheMoose', 'Alishabrooks29', 'Similar-Lab64', '-doves-nest-', 'outlier37', 'Sudden-Manner-7027', 'SIRDumbDumb', 'Maintenanceman368', '_Dreamy-Rose_', 'aesmith1291', 'partial_birth', 'Tight_Ad_4459', 'NothofagusMacrocarpa', 'Stella2662']
"""
pca + propensity: ['Adam_718_702', 'Majestic-Fig-524', 'sun_madness', 'jolielu', 'ShamingShoegaze', 'KekMio', 'HollowPomegranate', 'morameat', 'sleepyvibes', 'amdetermined', 'MyWhatBigEyesIHave', 'yandererecon', 'loveForParanormal', 'ieat_tortas', 'GeneralSab', 'Fair_Bowler_4913', 'Key-Philosophy-2877', 'vigilantfox85', 'EisWarren', 'Unclelexx999', 'Dm_me9596', 'fruitsaladqueen', 'No-Application-4971', 'GRRAVEYARDD', 'sixsix6', 'FuzzySign278', 'a1ayy', 'Smooth-Noise-5836', 'ResponsibleEnd2058', 'JusticeBeevr', 'cutebutcrazy91', 'hollyelms']
propensity: ['Theghostofsabotage', 'ThrowRAcottoncandyy', 'sikubis', 'catiegirl74', 'No_Break_4303', 'MyWhatBigEyesIHave', 'JusticeBeevr', 'ElGranTocho', 'Apuksl', 'Freddybear480', 'Charlie2905', 'FuzzySign278', 'bitteroldbird', 'earlylife_crisis_', 'Notaprumber', 'phuckme2', 'Keiichan_', 'TheSeraphman', 'YearOfTheMoose', 'Alishabrooks29', 'Similar-Lab64', '-doves-nest-', 'outlier37', 'Sudden-Manner-7027', 'SIRDumbDumb', 'Maintenanceman368', '_Dreamy-Rose_', 'aesmith1291', 'partial_birth', 'Tight_Ad_4459', 'NothofagusMacrocarpa', 'Stella2662']
KD: ['utroi', 'Evening-Management-7', 'kendragibs', 'throwaway8194122', 'Intelligent-Risk9885', 'jojaques', 'control_burn', 'MetroL7', 'abcdmetalhead', 'AdPsychological9510', 'lola4274', 'iuanaj', 'Wide-Bridge-6461', 'Wide-Bridge-6461', '14Boogie', 'ElthonJzohn_997', 'Am3l1a____', 'Oz_a_day', 'OkWrangler1500', 'prettyp0thead', 'miamihausjunkie', 'ElthonJzohn_997', 'Awkward_Reindeer_788', 'SpendLate5545', 'ace2212', 'Maggiethedogie', 'CommunicationDue6753', 'whale_omelette94', 'IndependentInternet3', 'Yeetenanny', 'EARTHISLIFENOMARS', 'ElizabethWolf1214']
"""

assert(all([c not in treatment_group for c in control_group]))

In [5]:
!pip install empath

Collecting empath
  Using cached empath-0.89-py3-none-any.whl
Installing collected packages: empath
Successfully installed empath-0.89


You should consider upgrading via the 'C:\Python310\python.exe -m pip install --upgrade pip' command.


In [6]:
#define input text model (as will be analyzed) here
#define empath

from empath import Empath

global lexicon

lexicon = Empath()

#negative
lexicon.create_category("angry", ["angry"], model="nytimes")
lexicon.create_category("lonely", ["lonely"], model="nytimes")
lexicon.create_category("sad", ["sad"], model="nytimes")
lexicon.create_category("worried", ["worried"], model="nytimes")
lexicon.create_category("frustrated", ["frustrated"], model="nytimes")
lexicon.create_category("anxious", ["anxious"], model="nytimes")
lexicon.create_category("nervous", ["nervous"], model="nytimes")
lexicon.create_category("disappointed", ["disappointed"], model="nytimes")
lexicon.create_category("gloomy", ["gloomy"], model="nytimes")
lexicon.create_category("miserable", ["miserable"], model="nytimes")
lexicon.create_category("lonely", ["lonely"], model="nytimes")

#positive
lexicon.create_category("happy", ["happy"], model="nytimes")
lexicon.create_category("loved", ["loved"], model="nytimes")
lexicon.create_category("joyful", ["joyful"], model="nytimes")
lexicon.create_category("content", ["content"], model="nytimes")
lexicon.create_category("pleased", ["pleased"], model="nytimes")
lexicon.create_category("grateful", ["grateful"], model="nytimes")
lexicon.create_category("relieved", ["relieved"], model="nytimes")
lexicon.create_category("optimistic", ["optimistic"], model="nytimes")
lexicon.create_category("satisfied", ["satisfied"], model="nytimes")
lexicon.create_category("excited", ["excited"], model="nytimes")
lexicon.create_category("loved", ["loved"], model="nytimes")


def get_data_from_txt(txt:str):
    global lexicon

    if (not len(txt)):
        return {k: 0 for k in["angry","sad","worried",
       "frustrated","anxious",
        "nervous","disappointed",
      "gloomy","miserable","longly",
       "happy", "loved", "joyful",
        "content", "pleased",
      "grateful", "relieved",
      "optimistic", "satisfied",
        "excited"]}

    unnormalized = lexicon.analyze(txt, categories=["angry","sad","worried",
       "frustrated","anxious",
        "nervous","disappointed",
      "gloomy","miserable","longly",
       "happy", "loved", "joyful",
        "content", "pleased",
      "grateful", "relieved",
      "optimistic", "satisfied",
        "excited"],normalize = False)
  
    normalized = lexicon.analyze(txt, categories=["angry","sad","worried",
       "frustrated","anxious",
        "nervous","disappointed",
      "gloomy","miserable","longly",
       "happy", "loved", "joyful",
        "content", "pleased",
      "grateful", "relieved",
      "optimistic", "satisfied",
        "excited"],normalize = True)
    

    #we want to make these average to 0 
    summed = 0
    for v in normalized.values(): summed += v 
    to_add = -summed/len(normalized)

    for k in normalized.keys():
        normalized[k] += to_add
    
    return normalized

global txt_keys
txt_keys = ["angry","sad","worried",
       "frustrated","anxious",
        "nervous","disappointed",
      "gloomy","miserable","longly",
       "happy", "loved", "joyful",
        "content", "pleased",
      "grateful", "relieved",
      "optimistic", "satisfied",
        "excited"]

global submission_keys_order
submission_keys_order = [
        # "distinguished" ,
        "is_original_content" ,
        "over_18" ,
        "score" ,
        "title",
        "upvote_ratio" ]

global comment_key_order
comment_key_order =  [
      "is_edited",
      "num_replies", 
      "score",
      "score_is_hidden",
      "total_awards",
      "num_ups",
      "num_downs",
      "body",
      "is_submitter", 
      "stickied"
]

["angry", "furious", "frustrated", "frightened", "disgusted", "outraged", "upset", "irritated", "resentful", "annoyed", "embarrassed", "unhappy", "exasperated", "incensed", "indignant", "fearful", "distressed", "scared", "terrified", "enraged", "confused", "afraid", "anxious", "irate", "insulted", "agitated", "distraught", "nervous", "bitter", "bewildered", "impatient", "offended", "shocked", "disgusted", "humiliated", "ashamed", "apprehensive", "puzzled", "incredulous", "perplexed", "tired", "worried", "disgruntled", "mystified", "dispirited", "despondent", "sad", "aghast", "alarmed", "exasperated", "mad", "terrified", "dismayed", "aggrieved", "irritated", "uneasy", "apologetic", "pained", "angered", "uncomfortable", "annoyed", "frightened", "bewildered", "horrified", "dissatisfied", "sorry", "appalled", "disappointed", "insecure", "defiant", "jealous", "relieved", "shocked", "bullied", "abusive", "intimidated", "disturbed", "suspicious", "complaining", "embittered", "elated", "angrie

In [75]:
import numpy as np
import time
# import binary_search

class Redditor:
    """this is basically a class that created a data array and column label array so we can use that for KD Tree and/or Propensity Score"""

    comment_keys = []
    sub_data_keys = []

    def __init__(self, redditor_dict, comment_dict, submission_dict, redditor_name, start_date, control_date):
        """im lazy rn so first one passed in must have at least some comments"""
        global txt_keys
        global submission_keys_order
        global comment_key_order

        self.all_data = None
        self.all_data_keys = None
        #yield stats related to this
        redditor =  redditor_dict[redditor_name] 
        base_redditor_data = redditor["data"]
        if (not base_redditor_data["has_subreddit"]):
            #set values to -1
            base_redditor_data["over_18"] = -1#redditor.subreddit["over_18"]
            base_redditor_data["num_subscribers"] = -1#redditor.subreddit["subscribers"]

            #this is the only one added that rlly shouldn't be
            base_redditor_data["public_description"] = ""

        
        description_data = get_data_from_txt(base_redditor_data["public_description"])

        _keys = list(base_redditor_data.keys())
        _keys.sort()

        base_data = list()
        base_data_keys = list()
        for k in _keys:
            v = base_redditor_data[k]
            if (isinstance(v, type(""))): continue
            base_data.append(v)
            base_data_keys.append(k)

        base_data += [description_data[i] for i in txt_keys] #= [v for k,v in base_redditor_data.items() if not isinstance(v, type(""))] +
        base_data_keys += [f'public_description_{i}' for i in txt_keys]

        #get sequential data for comments
        sequential_redditor_data = redditor["comments"]

        
        #sequential data -> 2 sorts
        k_segs = 2
        segments = [start_date + (1+i)*(control_date-start_date)/k_segs for i in range(k_segs)]
        cnts = [0 for _ in range(k_segs)]
        suicide_cnts = [0 for _ in range(k_segs)]
        segment_data = [np.zeros((53,),dtype='float') for _ in range(k_segs)]
        comment_keys = [] #None, just setting each time even tho inefficient
        sub_data_keys = []
        current_seg_idx = 0 

        try:
            sequential_redditor_data.sort()
        except:
            print("sequential data is ", sequential_redditor_data) #okay they have the exact same time which is why there are problems bc its not letting me sort bc dict not comparable
            return #TODO - I actually need to correct this one - this doesn't enforce types we wanted to 
        
        #we need to make sure we have data covering for all this time NOTE must add this
        is_valid = sequential_redditor_data[-1][0] < start_date if len(sequential_redditor_data) else True

        for comment_date, comment_data in sequential_redditor_data:

            #can do binary search for efficiency in future - TODO
            if (comment_date > control_date):
                break 
            if (comment_date < start_date): continue 

            sub_id = comment_data["submission_id"] 
            comment_key = sub_id + "--------" + comment_data["comment_id"]
            try:
                comment_data_dict = comment_dict[comment_key]["data"]
            except:
                return

            #getting comment data 
            comment_body_data = get_data_from_txt(comment_data_dict["body"])
            comment_data = [comment_data_dict[k] for k in comment_key_order]

            #make comment keys if they don't alr exist
            if (not len(Redditor.comment_keys)): Redditor.comment_keys = [comment_key_order[i] for i in range(len(comment_key_order)) if not isinstance(comment_data[i], type(""))] + [f'comment_body_{t}' for t in txt_keys]
            
            #concat numeric data and txt data
            comment_data = [v for v in comment_data if not isinstance(v, type(""))] + [comment_body_data[i] for i in txt_keys]

            #getting submission data 
            try:
                sub_data_dict = submission_dict[sub_id]
            except:
                return
            sub_data = [sub_data_dict[k] for k in submission_keys_order]

            #TODO - make lookup to avoid repeated computation
            sub_title_data = get_data_from_txt(sub_data_dict["title"])

            if (not len(Redditor.sub_data_keys)): Redditor.sub_data_keys = [submission_keys_order[i] for i in range(len(submission_keys_order)) if not isinstance(sub_data[i], type(""))] + [f'submission_title_{t}' for t in txt_keys]
            sub_data = [v for v in sub_data if not isinstance(v, type(""))] + [sub_title_data[i] for i in txt_keys]  
            #we have a valid comment date 


            #aggregate
            all_time_info = np.array(comment_data + sub_data, dtype='float')

            while (segments[current_seg_idx] < comment_date):
                current_seg_idx += 1
            
            cnts[current_seg_idx] += 1

            segment_data[current_seg_idx] += all_time_info

            if (any(kw in sub_data_dict["title"] for kw in ["suicide", "depress"]) or any(kw in comment_data_dict["body"] for kw in ["suicide", "depress"])):
                suicide_cnts[current_seg_idx] += 1


        #make my segment keys 
        segment_keys = ["comment_"+ i for i in Redditor.comment_keys] + ["submission_" + i for i in Redditor.sub_data_keys ]

        all_data = base_data*1
        all_data_keys = base_data_keys * 1

        #check lengths match up
        assert len(all_data) == len(all_data_keys), "(0) all_data is of len " + str(len(all_data)) + " all data keys is of len " + str(len(all_data_keys))
        assert len(segment_keys) == len(segment_data[0].tolist()), "segment keys is len " + str(len(segment_keys)) + " while segment_data is len " +  str(len(segment_data[0].tolist()))
        
        
        #add segment data in 
        segment_data = [segment_data[i]/max(1,cnts[i]) for i in range(k_segs)]
        for i in range(k_segs):
            all_data += segment_data[i].tolist()
            all_data_keys += [f'seg_{i}_{key_name}' for key_name in segment_keys]

        for i in range(1): 
            all_data.append(is_valid)
            all_data_keys.append("is_valid")
        for i in range(1): 
            all_data += cnts
            all_data_keys += [f'num_comments_in_seg_{i}' for i in range(len(cnts))]
        for i in range(1):
            all_data += suicide_cnts
            all_data_keys += [f'num_suicide_mentions_{i}' for i in range(len(suicide_cnts))]

        assert len(all_data) == len(all_data_keys), "all_data is of len " + str(len(all_data)) + " all data keys is of len " + str(len(all_data_keys))
        self.all_data_keys = all_data_keys
        self.all_data = np.array(all_data, dtype='float')
            #add it to the correct one 


        #okay time to get sequential data
        


    def get_data(self):
        return self.all_data
r = Redditor(redditor_dict, comment_dict, submission_dict, treatment_group[0], int(time.mktime(FIRST_CONSIDER_DATE.timetuple())), int(time.mktime(CONTROL_DATE.timetuple())) )

In [76]:
print(r.all_data.shape)
print(len(r.all_data_keys))

(145,)
145


In [77]:
control = [Redditor(redditor_dict, comment_dict, submission_dict, c, int(time.mktime(FIRST_CONSIDER_DATE.timetuple())), int(time.mktime(CONTROL_DATE.timetuple())) ) for c in control_group]
treat = [Redditor(redditor_dict, comment_dict, submission_dict, t, int(time.mktime(FIRST_CONSIDER_DATE.timetuple())), int(time.mktime(CONTROL_DATE.timetuple())) )for t in treatment_group]

In [78]:
epsilon = 0.000000001
control_data = np.array([c.all_data for c in control])
treatment_data = np.array([t.all_data for t in treat])

c_std = np.std(control_data, axis =0)
t_std = np.std(treatment_data, axis=0)

treat_mean = np.mean(treatment_data, axis=0) 
control_mean= np.mean(control_data, axis=0)

control_data = (control_data - treat_mean)/(epsilon + t_std)
treatment_data = (treatment_data - treat_mean)/(epsilon + t_std)


In [68]:
from random import randint

In [79]:
#for now, we're doing a NN for perplexity score, will turn into actual later
treatments = np.array([0 for i in range(len(control))] + [1 for i in range(len(treat))])
# ys = np.array([randint(1,20) for i in range(len(control) + len(treat))])
ys = treatments
Xs = np.concatenate((control_data, treatment_data))
print(Xs.shape)
print(ys.shape)
assert(Xs.shape[0] == ys.shape[0])
assert(treatments.shape[0] == ys.shape[0])
assert False, "ys shold not be treatment"

(128, 145)
(128,)


In [16]:
!pip install econml

Collecting econml
  Downloading econml-0.14.1-cp310-cp310-win_amd64.whl (929 kB)
     -------------------------------------- 929.6/929.6 KB 2.9 MB/s eta 0:00:00
Collecting sparse
  Downloading sparse-0.14.0-py2.py3-none-any.whl (80 kB)
     ---------------------------------------- 81.0/81.0 KB ? eta 0:00:00
Collecting lightgbm
  Using cached lightgbm-4.1.0-py3-none-win_amd64.whl (1.3 MB)
Collecting shap<0.42.0,>=0.38.1
  Downloading shap-0.41.0-cp310-cp310-win_amd64.whl (435 kB)
     -------------------------------------- 435.6/435.6 KB 3.0 MB/s eta 0:00:00
Collecting statsmodels>=0.10
  Using cached statsmodels-0.14.1-cp310-cp310-win_amd64.whl (9.8 MB)
Collecting cloudpickle
  Using cached cloudpickle-3.0.0-py3-none-any.whl (20 kB)
Collecting patsy>=0.5.4
  Using cached patsy-0.5.4-py2.py3-none-any.whl (233 kB)
Installing collected packages: patsy, cloudpickle, sparse, lightgbm, statsmodels, shap, econml
Successfully installed cloudpickle-3.0.0 econml-0.14.1 lightgbm-4.1.0 patsy-0.5.4

You should consider upgrading via the 'C:\Python310\python.exe -m pip install --upgrade pip' command.


# Data One (Double ML)

In [19]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-2.0.2-py3-none-win_amd64.whl (99.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.0.2


You should consider upgrading via the 'C:\Python310\python.exe -m pip install --upgrade pip' command.


In [80]:
# https://shap.readthedocs.io/en/latest/example_notebooks/overviews/Be%20careful%20when%20interpreting%20predictive%20models%20in%20search%20of%20causal%C2%A0insights.html

import matplotlib.pyplot as plt
from econml.dml import LinearDML
from sklearn.base import BaseEstimator, clone
import xgboost
from sklearn.linear_model import LinearRegression

# Run Double ML, controlling for all the other features
def double_ml(y, causal_feature, control_features):
    """Use doubleML from econML to estimate the slope of the causal effect of a feature."""

    est = LinearDML(model_y=LinearRegression())
    est.fit(y, causal_feature, W=control_features)
    return est.effect_inference()


effect = double_ml(ys, treatments, Xs)

In [81]:
effect.summary_frame()

Unnamed: 0_level_0,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.047,0.034,30.783,0.0,0.98,1.114


In [51]:
!pip install DoubleML

Collecting DoubleML
  Using cached DoubleML-0.7.0-py3-none-any.whl (234 kB)
Installing collected packages: DoubleML
Successfully installed DoubleML-0.7.0


You should consider upgrading via the 'C:\Python310\python.exe -m pip install --upgrade pip' command.


In [82]:
import numpy as np
import doubleml as dml
from doubleml.datasets import make_did_SZ2020
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
np.random.seed(42)
ml_g = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=5)
ml_m = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=5)
data = make_did_SZ2020(n_obs=500, return_type='DataFrame')
print(data)
obj_dml_data = dml.DoubleMLData(data, 'y', 'd')
print(obj_dml_data)
dml_did_obj = dml.DoubleMLDID(obj_dml_data, ml_g, ml_m)
dml_did_obj.fit().summary

           Z1        Z2        Z3        Z4           y    d
0    0.280397 -0.161563  0.374582  0.868365  234.141188  1.0
1   -0.436909 -0.307469 -0.416819  0.256420  191.432100  0.0
2   -0.617402  0.556987  0.250681 -0.061405  203.014252  0.0
3    0.000140 -1.630138 -0.468617 -1.706889  159.419151  1.0
4   -0.961297  0.364768  1.119084 -0.846581  192.933615  0.0
..        ...       ...       ...       ...         ...  ...
495 -0.938382 -1.926729  0.725102 -0.464327  160.438105  1.0
496  0.881096  0.201463  0.827761  0.609844  257.372425  0.0
497 -0.736528  0.462325 -0.572178  0.723444  200.440252  1.0
498  1.249469  0.243495  0.490437  1.159711  270.203470  0.0
499 -0.259027 -0.895659  0.002256 -1.184481  173.738908  1.0

[500 rows x 6 columns]

------------------ Data summary      ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['Z1', 'Z2', 'Z3', 'Z4']
Instrument variable(s): None
No. Observations: 500

------------------ DataFrame info    ------------

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
d,-3.116501,2.029539,-1.535571,0.124644,-7.094325,0.861322


In [103]:
_keys = list(r.all_data_keys)
import collections
print([item for item, count in collections.Counter(_keys).items() if count > 1])

import pandas as pd
df = pd.DataFrame(Xs)
df.columns = _keys
df["is_treatment"] = treatments 
df["the_outcome"] = [randint(0,1) if i else 1 for i in treatments] 
new_data = dml.DoubleMLData(df, 'the_outcome', 'is_treatment')
df.head()
dml_did_obj = dml.DoubleMLDID(new_data, LinearRegression(),RandomForestClassifier())


#https://docs.doubleml.org/stable/examples/py_double_ml_did.html

# https://econml.azurewebsites.net/_autosummary/econml.dml.NonParamDML.html#econml.dml.NonParamDML

# to try
""" 
DoubleMLIRM
DoubleMLIIVM 
DoubleMLDID
"""

print(dml_did_obj.fit())

[]

------------------ Data summary      ------------------
Outcome variable: the_outcome
Treatment variable(s): ['is_treatment']
Covariates: ['comment_karma', 'has_subreddit', 'is_employee', 'is_gold', 'is_mod', 'is_suspended', 'link_karma', 'num_moderated', 'num_multireddits', 'num_subscribers', 'num_trophies', 'over_18', 'time_creation', 'verified_email', 'public_description_angry', 'public_description_sad', 'public_description_worried', 'public_description_frustrated', 'public_description_anxious', 'public_description_nervous', 'public_description_disappointed', 'public_description_gloomy', 'public_description_miserable', 'public_description_longly', 'public_description_happy', 'public_description_loved', 'public_description_joyful', 'public_description_content', 'public_description_pleased', 'public_description_grateful', 'public_description_relieved', 'public_description_optimistic', 'public_description_satisfied', 'public_description_excited', 'seg_0_comment_is_edited', 'seg_0_c

Propensity predictions from learner RandomForestClassifier() for ml_m are close to zero or one (eps=1e-12).


In [15]:
import causalml
from causalml.inference.meta import BaseSRegressor, BaseTRegressor, BaseXRegressor, BaseRRegressor
from causalml.dataset.regression import synthetic_data

# Load synthetic data
y, X, treatment, tau, b, e = synthetic_data(mode=1, n=10000, p=25, sigma=0.5)
w_multi = np.array(['treatment_A' if x==1 else 'control' for x in treatment]) # customize treatment/control names

slearner = BaseSRegressor(LGBMRegressor(), control_name='control')
slearner.estimate_ate(X, w_multi, y)
slearner_tau = slearner.fit_predict(X, w_multi, y)

model_tau_feature = RandomForestRegressor()  # specify model for model_tau_feature

slearner.get_importance(X=X, tau=slearner_tau, model_tau_feature=model_tau_feature,
                        normalize=True, method='auto', features=feature_names)

# Using the feature_importances_ method in the base learner (LGBMRegressor() in this example)
slearner.plot_importance(X=X, tau=slearner_tau, normalize=True, method='auto')

# Using eli5's PermutationImportance
slearner.plot_importance(X=X, tau=slearner_tau, normalize=True, method='permutation')

# Using SHAP
shap_slearner = slearner.get_shap_values(X=X, tau=slearner_tau)

# Plot shap values without specifying shap_dict
slearner.plot_shap_values(X=X, tau=slearner_tau)

# Plot shap values WITH specifying shap_dict
slearner.plot_shap_values(X=X, shap_dict=shap_slearner)

# interaction_idx set to 'auto' (searches for feature with greatest approximate interaction)
slearner.plot_shap_dependence(treatment_group='treatment_A',
                              feature_idx=1,
                              X=X,
                              tau=slearner_tau,
                              interaction_idx='auto')

ModuleNotFoundError: No module named 'causalml'