In [1]:
import os
import sys
import math
import logging
from pathlib import Path

import numpy as np
import scipy as sp
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Enable IPython's autoreload extension; mode 2 picks up edits to imported
# modules live, so project code can change without restarting the kernel.
%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output and request the
# 'retina' figure format from the inline backend.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Configure seaborn in a single call. `sns.set` applies its own defaults for
# every argument that is left out (context="notebook"), so a `set_context`
# call issued *before* it is silently overridden. Passing context and style
# explicitly makes the "poster" scaling actually take effect, together with
# the whitegrid style and the 16x9 default figure size.
sns.set(context="poster", style="whitegrid", rc={'figure.figsize': (16, 9.)})

import pandas as pd
# Let pandas display up to 120 rows and 120 columns before truncating output.
pd.options.display.max_rows = 120
pd.options.display.max_columns = 120

# Route log records of level INFO and above to standard output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [2]:
# NOTE(review): the wildcard import hides which justcause names later cells
# rely on; replace it with explicit imports once the used names are known.
from justcause import *
# Names used explicitly in the cells below.
from justcause.data.frames import CausalFrame
from justcause.data.sets import load_ihdp

**PLEASE** save this file right now using the following naming convention: `NUMBER_FOR_SORTING-YOUR_INITIALS-SHORT_DESCRIPTION`, e.g. `1.0-fw-initial-data-exploration`. Use the number to order the file within the directory according to its usage.

In [3]:
# Load the IHDP data bunch and wrap its data in a CausalFrame, telling the
# frame which columns are covariates and which column holds the treatment.
bunch = load_ihdp()
df = CausalFrame(
    bunch.data,
    treatment='t',
    covariates=bunch.covariate_names,
)

One can use a `CausalFrame` instead of working with the data bunch directly. A `CausalFrame` keeps the information about which columns are covariates, along with their names.

In [4]:
# List the covariate column names recorded by the CausalFrame.
df.names.covariates

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24']

In [5]:
# The `names` accessor is still available after an operation such as column
# selection. Note that it still reports all 25 covariate names, not only the
# two selected columns.
df[['0', '1']].names.covariates

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24']

The `np` accessor gives easy access to `X`, `t`, and `y` for calling learners.

In [6]:
# Covariates and treatment pulled from the frame via the `np` accessor. In the
# recorded output X is displayed as a NumPy array and t as a pandas Series.
df.np.X, df.np.t

(array([[ 1.39739503,  0.99634625, -1.10562395, ...,  0.        ,
          0.        ,  1.        ],
        [ 1.39739503,  0.99634625, -1.10562395, ...,  0.        ,
          0.        ,  1.        ],
        [ 1.39739503,  0.99634625, -1.10562395, ...,  0.        ,
          0.        ,  1.        ],
        ...,
        [ 0.38143657, -0.20294594, -0.73326097, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.38143657, -0.20294594, -0.73326097, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.38143657, -0.20294594, -0.73326097, ...,  0.        ,
          0.        ,  0.        ]]), 0         1.0
 1         1.0
 2         1.0
 3         1.0
 4         1.0
          ... 
 746995    0.0
 746996    0.0
 746997    0.0
 746998    0.0
 746999    0.0
 Name: t, Length: 747000, dtype: float64)

In [7]:
# After selecting columns '0' and '1', X contains only those two columns.
df[['0', '1']].np.X

array([[ 1.39739503,  0.99634625],
       [ 1.39739503,  0.99634625],
       [ 1.39739503,  0.99634625],
       ...,
       [ 0.38143657, -0.20294594],
       [ 0.38143657, -0.20294594],
       [ 0.38143657, -0.20294594]])

In [8]:
# Selecting only the covariate columns '0' and '1' leaves out the treatment
# column 't', so the `.np.t` accessor raises an IndexError. The error is the
# point of this cell, so it is caught and printed: the limitation stays
# visible without aborting a "Restart Kernel -> Run All".
try:
    df[['0', '1']].np.t
except IndexError as err:
    print(f"{type(err).__name__}: {err}")

IndexError: No treatment variable in CausalFrame