In [1]:
# Setup note kept inside a string literal so that it is NOT executed.
# The snippet it contains prepends extra library directories to sys.path;
# the paths shown are machine-specific examples and must be adapted before
# the snippet is copied into a real code cell.
Notes = '''

## Add the locations of your python Libraries if you have multiple locations:

import sys
new_paths = [
    "/Users/jar/Library/Python/3.8/bin",
    "/Users/jar/Library/Python/3.8/lib/python/site-packages", 
]
for p in new_paths:
    if p not in sys.path: 
        sys.path = [p]+sys.path  

'''

In [16]:
#############################################################################
#############################################################################

In [17]:
### INSTALL THE CGEM MODULE:
# pip install --upgrade cgem
# pip show cgem

In [4]:
from cgem import *

In [18]:
#############################################################################
#############################################################################

In [19]:
# Free-text statement of what this cell sets out to do. It is stored in a
# string only as in-notebook documentation; nothing in this cell reads it.
Task = '''

Create a simple causal simulation to generate a dataset
that can be used to conduct a computational proof of CGEM.

'''
def gen_artificial_data_v1(size=10000, seed=None):
    """
    Generate an artificial dataset representing a causal system.

    The target is built multiplicatively:

        TGT_Z = (20 + 1.0*REG_A + 1.5*REG_B + 2.0*REG_C) * effect(CAT_D) * noise

    where REG_A, REG_B and REG_C are normal draws with (mean, std) of
    (10, 3), (12, 4) and (15, 5), effect(CAT_D) maps the ten categories
    "A".."J" onto evenly spaced multipliers from 0.5 to 1.4, and noise is
    uniform on [0.90, 1.10).

    Parameters:
    size (int): Number of data points to generate.
    seed (int or None): If given, all random draws come from a private
        ``np.random.RandomState(seed)`` so the result is reproducible.
        If None (default), NumPy's global random state is used, so an
        earlier ``np.random.seed(...)`` call still governs the output.

    Returns:
    pandas.DataFrame: A DataFrame with the generated data, with columns
        TGT_Z, REG_A, REG_B, REG_C and CAT_D.

    Side effects:
    Rebinds the module-level names ``cats`` (category labels) and ``effs``
    (their multipliers). Kept for backward compatibility with code that
    reads them after calling this function.
    """
    global cats, effs

    # The np.random module and a RandomState instance expose the same
    # normal/uniform/randint functions, so one code path serves both cases.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Generating random values for the variables
    reg_var_a = rng.normal(10, 3, size)
    reg_var_b = rng.normal(12, 4, size)
    reg_var_c = rng.normal(15, 5, size)

    # Calculating the effect based on the variables
    effect_x = 20.0 + (1.0 * reg_var_a) + (1.5 * reg_var_b) + (2.0 * reg_var_c)

    # Defining categories and their corresponding effects
    cats = list("ABCDEFGHIJ")
    effs = np.around(np.linspace(0.5, 1.4, len(cats)), 2)
    cat2effect = {cat: round(eff, 4) for cat, eff in zip(cats, effs)}

    # Generating categorical variable and its effect.
    # One vectorised draw of category indices replaces a per-row Python loop;
    # the same indices then select both the label and its multiplier, which
    # also removes the dependency on a bare `choice` name being in scope.
    cat_labels = np.array(cats)
    cat_multipliers = np.array([cat2effect[cat] for cat in cats])
    cat_index = rng.randint(0, len(cats), size)
    cat_var_d = cat_labels[cat_index]
    cat_effect_d = cat_multipliers[cat_index]

    # Adding a noise effect
    noise_effect = rng.uniform(0.90, 1.10, size)

    # Calculating the target variable
    target_var_z = ((effect_x) * cat_effect_d) * noise_effect

    # Constructing the dataframe
    df = pd.DataFrame({
        'TGT_Z': target_var_z,
        'REG_A': reg_var_a,
        'REG_B': reg_var_b,
        'REG_C': reg_var_c,
        'CAT_D': cat_var_d
    })

    return df

#------------------------------------------------

# Two independent draws from the same simulated system: DF1 is loaded into
# the model for fitting, DF2 is kept aside and only used for scoring later.
DF1, DF2 = (gen_artificial_data_v1(size=10000) for _ in range(2))

#------------------------------------------------

In [20]:
# Preview the first ten generated rows (target, three regressors, category).
DF1.head(10) 

Unnamed: 0,TGT_Z,REG_A,REG_B,REG_C,CAT_D
0,56.195786,6.248147,12.373554,16.175011,D
1,49.194092,16.538513,15.854863,11.959708,B
2,116.991986,12.022187,12.710596,19.066333,I
3,67.989816,9.243552,14.777219,18.81022,C
4,54.96116,10.454428,20.680133,18.433032,B
5,44.392594,9.988682,7.36304,13.539356,B
6,75.044417,10.32256,17.955078,8.957982,G
7,65.176173,8.169347,11.80788,21.397151,C
8,52.91879,10.267926,9.282756,13.6846,D
9,81.184679,15.742983,13.93274,15.751626,E


In [21]:
#############################################################################
#############################################################################

In [22]:
### MASTER EFFECTS FORMULA:
# The target is expressed as the product of two named effect terms; each
# term is configured under the same name in `tparams` below.
Formula = "TGT_Z = CAT_D_EFF * LIN_REG_EFF"

### TERMS PARAMETERS:
# One entry per formula term: the model expression (given as a string),
# the input columns it uses, and 'ival'.
# NOTE(review): 'ival' looks like an initial value for the term - confirm
# against the cgem documentation.
tparams = dict(
    CAT_D_EFF=dict(
        model="CatRegModel()",
        xvars=['CAT_D'],
        ival=10,
    ),
    LIN_REG_EFF=dict(
        model="OLS()",
        xvars=['REG_A', 'REG_B', 'REG_C'],
        ival=10,
    ),
)

In [23]:
# Assemble the CGEM model: attach the DF1 sample, then register the master
# formula and the per-term settings whose keys match the formula's terms.
model = CGEM() 
model.load_df(DF1)  
model.define_form(Formula) 
model.define_terms(tparams)  

In [24]:
# NOTE(review): the first argument (25) is presumably the number of learning
# epochs - the verbose log counts "Learning Epoch" - confirm against the
# cgem documentation.
# The trailing semicolon keeps the notebook from echoing the return value.
model.fit(25,verbose=True); 


##################################################
Learning Epoch: 1
--------------------------------------------------
RMSE 1: 36.54713079260933
RMSE 2: 31.28781419363407
DELTA: -5.259316598975261
RSQ 1: -1.0303183556525064
RSQ 2: -0.4880174142276028
DELTA: 0.5423009414249036
--------------------------------------------------

##################################################
Learning Epoch: 2
--------------------------------------------------
RMSE 1: 31.28781419363407
RMSE 2: 26.998834140188247
DELTA: -4.2889800534458224
RSQ 1: -0.4880174142276028
RSQ 2: -0.10801997030543387
DELTA: 0.37999744392216894
--------------------------------------------------

##################################################
Learning Epoch: 3
--------------------------------------------------
RMSE 1: 26.998834140188247
RMSE 2: 23.491602642353833
DELTA: -3.5072314978344146
RSQ 1: -0.10801997030543387
RSQ 2: 0.1611528014541571
DELTA: 0.269172771759591
--------------------------------------------------

###

In [25]:
#############################################################################
#############################################################################

In [26]:
# Score the fitted model on DF2, a second, independently generated sample
# that was never loaded into the model.
# NOTE(review): despite the 'CrosVal' label printed below, this is a single
# hold-out evaluation, not k-fold cross-validation.
preds = model.predict(DF2) 
actuals = DF2['TGT_Z'].values
# NOTE(review): argument order (actuals, preds) is taken as written here -
# confirm it matches calc_r2's expected signature.
r2 = model.calc_r2(actuals,preds)  
print('CrosVal R-Squared:',round(r2,5)) 

CrosVal R-Squared: 0.96857


In [27]:
#############################################################################
#############################################################################
#############################################################################
#############################################################################