In [1]:
# Optional setup snippet stored as an inert string: assigning it to `Notes`
# does not execute the code inside. To use it, copy the contents into a code
# cell and replace the example paths with your own library locations.
Notes = '''

## Add the locations of your python Libraries if you have multiple locations:

import sys
new_paths = [
    "/Users/jar/Library/Python/3.8/bin",
    "/Users/jar/Library/Python/3.8/lib/python/site-packages", 
]
for p in new_paths:
    if p not in sys.path: 
        sys.path = [p]+sys.path  

'''

In [2]:
#############################################################################
#############################################################################

In [3]:
### INSTALL THE CGEM MODULE:
# pip install --upgrade cgem
# pip show cgem

In [4]:
pip show cgem

Name: cgem
Version: 0.0.9
Summary: CGEM: Collaborative Generalized Effects Modeling
Home-page: https://github.com/jrolf/cgem
Author: James A. Rolfsen
Author-email: james.rolfsen@think.dev
License: UNKNOWN
Location: /Users/jar/opt/anaconda3/lib/python3.8/site-packages
Requires: sympy, pygam, statsmodels, xgboost, numpy, scipy, pandas-ta, pandas, scikit-learn
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [5]:
from cgem import *

In [6]:
#############################################################################
#############################################################################

In [7]:
Task = '''

Create a simple causal simulation to generate a dataset
that can be used to conduct a computational proof of CGEM.

'''
def gen_artificial_data_v1(size=10000, seed=None):
    """
    Generate an artificial dataset representing a causal system.

    The target is built multiplicatively from three independent pieces:

        TGT_Z = (20 + 1.0*REG_A + 1.5*REG_B + 2.0*REG_C) * effect(CAT_D) * noise

    where REG_A ~ N(10, 3), REG_B ~ N(12, 4), REG_C ~ N(15, 5), CAT_D is drawn
    uniformly from the ten categories "A".."J" whose effects are evenly spaced
    from 0.5 to 1.4, and noise is uniform on [0.90, 1.10).

    Parameters:
    size (int): Number of data points to generate.
    seed (int or None): If given, all random draws come from a private
        generator seeded with this value, so the same seed always yields the
        same DataFrame. If None (the default), numpy's global random state is
        used, so results differ from call to call unless the caller seeded it.

    Returns:
    pandas.DataFrame: A DataFrame with columns TGT_Z, REG_A, REG_B, REG_C
    and CAT_D, one row per generated data point.
    """
    # Kept for backward compatibility: these module-level names are
    # (re)assigned on every call and may be read by other cells.
    global cats, effs

    # A private RandomState keeps seeded runs isolated from global state;
    # the numpy.random module itself exposes the same sampling functions.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Generating random values for the variables
    reg_var_a = rng.normal(10, 3, size)
    reg_var_b = rng.normal(12, 4, size)
    reg_var_c = rng.normal(15, 5, size)

    # Calculating the effect based on the variables
    effect_x = 20.0 + (1.0 * reg_var_a) + (1.5 * reg_var_b) + (2.0 * reg_var_c)

    # Defining categories and their corresponding effects
    cats = list("ABCDEFGHIJ")
    effs = np.around(np.linspace(0.5, 1.4, len(cats)), 2)
    cat_effects = np.array([round(eff, 4) for eff in effs])

    # Draw one category index per row, then look up both the label and its
    # effect with a single vectorised indexing step (no per-row Python loop,
    # and no reliance on a `choice` name leaking in from a star import).
    cat_idx = rng.randint(0, len(cats), size)
    cat_var_d = np.array(cats)[cat_idx]
    cat_effect_d = cat_effects[cat_idx]

    # Adding a noise effect
    noise_effect = rng.uniform(0.90, 1.10, size)

    # Calculating the target variable
    target_var_z = ((effect_x) * cat_effect_d) * noise_effect

    # Constructing the dataframe
    df = pd.DataFrame({
        'TGT_Z': target_var_z,
        'REG_A': reg_var_a,
        'REG_B': reg_var_b,
        'REG_C': reg_var_c,
        'CAT_D': cat_var_d
    })

    return df

#------------------------------------------------

# Two independent draws from the same generator: DF1 is loaded into the model
# for fitting below, DF2 is held back and used only for prediction/evaluation.
DF1 = gen_artificial_data_v1(size=10000)
DF2 = gen_artificial_data_v1(size=10000) 

#------------------------------------------------

In [8]:
DF1.head(10) 

Unnamed: 0,TGT_Z,REG_A,REG_B,REG_C,CAT_D
0,92.796096,8.4078,12.285544,13.221552,H
1,98.605217,9.433186,17.764508,16.382178,G
2,99.193593,11.733575,10.806605,13.514444,J
3,86.090798,12.56964,10.743854,15.061856,G
4,58.846771,6.91946,13.300037,13.996446,D
5,40.924103,9.616978,11.478593,14.345552,A
6,110.287898,10.480244,11.59133,16.764344,I
7,37.065587,14.065405,4.197482,10.703233,B
8,72.684934,15.216271,6.273357,13.400651,G
9,44.660527,12.377715,8.406873,17.507483,B


In [9]:
#############################################################################
#############################################################################

In [10]:
### MASTER EFFECTS FORMULA: 
# The target column is expressed as the product of two named effect terms;
# this mirrors how the simulation builds TGT_Z (linear part * category effect).
Formula = "TGT_Z = CAT_D_EFF * LIN_REG_EFF"

### TERMS PARAMETERS:
# One entry per term named on the right-hand side of the formula.
#   'model': constructor expression given as a string
#            (presumably evaluated by CGEM -- TODO confirm against cgem docs)
#   'xvars': DataFrame columns fed to that term's model
#   'ival' : presumably the term's initial value before fitting -- TODO confirm
tparams = {
    "CAT_D_EFF": {
        'model': "CatRegModel()", 
        'xvars': ['CAT_D'],
        'ival' : 10,
    },
    "LIN_REG_EFF": {
        'model': "OLS()", 
        'xvars': ['REG_A','REG_B','REG_C'],
        'ival' : 10,
    } 
}   

In [11]:
### CREATE CGEM MODEL AND LOAD THE PARAMETERS:
# Setup order: training data first, then the formula string, then the
# per-term settings whose keys match the term names used in the formula.
model = CGEM() 
model.load_df(DF1)  
model.define_form(Formula) 
model.define_terms(tparams)  

In [12]:
### FIT THE CGEM MODEL WITH N LEARNING EPOCHS:
model.fit(25,verbose=True); 


##################################################
Learning Epoch: 1
--------------------------------------------------
RMSE 1: 36.835527438818026
RMSE 2: 31.518551982705677
DELTA: -5.316975456112349
RSQ 1: -1.0425792965333236
RSQ 2: -0.49546982380792204
DELTA: 0.5471094727254016
--------------------------------------------------

##################################################
Learning Epoch: 2
--------------------------------------------------
RMSE 1: 31.518551982705677
RMSE 2: 27.183171887463807
DELTA: -4.33538009524187
RSQ 1: -0.49546982380792204
RSQ 2: -0.11236010045902822
DELTA: 0.3831097233488938
--------------------------------------------------

##################################################
Learning Epoch: 3
--------------------------------------------------
RMSE 1: 27.183171887463807
RMSE 2: 23.638744348874727
DELTA: -3.5444275385890798
RSQ 1: -0.11236010045902822
RSQ 2: 0.15881029737036934
DELTA: 0.27117039782939756
--------------------------------------------------

In [13]:
#############################################################################
#############################################################################

In [14]:
### TAKE THE FITTED MODEL AND APPLY IT TO THE TEST SET 
### TO EVALUATE MODEL PERFORMANCE ON OUT-OF-SAMPLE DATA: 
# DF2 is a separate draw from the generator that was never loaded into the
# model, so this is a single hold-out score (not k-fold cross-validation,
# despite the "CrosVal" label in the printed text).
preds = model.predict(DF2) 
actuals = DF2['TGT_Z'].values
r2 = model.calc_r2(actuals,preds)  
# NOTE: the number in the trailing comment is pasted from one earlier run; the
# data is random, so a fresh run will print a slightly different value.
print('CrosVal R-Squared:',round(r2,5))  # CrosVal R-Squared: 0.96904 

CrosVal R-Squared: 0.96904


In [15]:
#############################################################################
#############################################################################
#############################################################################
#############################################################################

In [16]:
OLS?

In [19]:
import numpy as np
import pandas as pd

def get_n_dims(arr):
    """
    Returns the number of dimensions of an array-like object.

    numpy arrays and pandas DataFrames/Series report their own ``ndim``.
    Objects without that attribute (plain scalars, nested lists or tuples)
    are measured the way numpy would convert them, instead of raising
    AttributeError.

    :param arr: numpy array, pandas DataFrame/Series, scalar, or nested sequence
    :return: integer representing the number of dimensions
    """
    # np.ndim uses the object's own `ndim` when present and otherwise falls
    # back to converting the input with np.asarray.
    return np.ndim(arr)


In [21]:
get_n_dims(DF1)

2

In [24]:
# Three levels of nesting -> shape (1, 1, 4), so three dimensions are reported.
a = np.array([[[1,2,3,4]]])  
get_n_dims(a)

3

In [25]:
str(type(DF1))

"<class 'pandas.core.frame.DataFrame'>"

In [26]:
# Collapse the nested array to 1-D. `a` is already an ndarray here, so the
# np.array() wrapper only makes a copy before flattening.
b = np.array(a).flatten()

In [27]:
b

array([1, 2, 3, 4])

In [29]:
# `b` is already 1-D, so flattening it again yields the same values unchanged.
c = np.array(b).flatten()
c

array([1, 2, 3, 4])

In [30]:
import numpy as np

def compute_averages(x, y):
    """
    Compute the mean of the observations in ``y`` for each category in ``x``.

    Categories may be any dtype that ``np.unique`` can sort (non-negative or
    negative integers, strings, ...); they are not required to be small
    non-negative integers.

    :param x: array-like of category IDs, one per observation
    :param y: array-like of numeric observation values, same length as ``x``
    :return: dict mapping each distinct category in ``x`` to the mean of its
             observations; empty dict for empty input
    :raises ValueError: if ``x`` and ``y`` do not have the same number of elements
    """
    x = np.asarray(x).ravel()
    y = np.asarray(y, dtype=float).ravel()
    if x.shape != y.shape:
        raise ValueError(
            "x and y must have the same length, got %d and %d" % (x.size, y.size)
        )

    # `codes` re-labels every element of x with the position of its category
    # in `categories` (0..k-1). Binning on these dense codes instead of the raw
    # IDs is what makes string / negative / very large IDs work.
    categories, codes, counts = np.unique(x, return_inverse=True, return_counts=True)

    # Sum the observations for each category
    sums = np.bincount(codes, weights=y, minlength=len(categories))

    # Every category returned by np.unique occurs at least once, so counts > 0
    averages = sums / counts

    return dict(zip(categories, averages))

# Example usage
# Each category appears three times: 1 -> (1, 4, 7), 2 -> (2, 5, 8), 3 -> (3, 6, 9),
# so the expected means are 4.0, 5.0 and 6.0.
x = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])  # Category IDs
y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])  # Observation values

average_values = compute_averages(x, y) 
print(average_values)


{1: 4.0, 2: 5.0, 3: 6.0}


In [31]:
import numpy as np

def map_averages_to_new_ids(averages_dict, a, default=0):
    """
    Look up the stored average for every category ID in ``a``.

    :param averages_dict: dict mapping category ID -> average value
    :param a: array-like of category IDs to translate (any shape, may be empty)
    :param default: value used for IDs that are missing from ``averages_dict``
                    (defaults to 0)
    :return: numpy array with the same shape as ``a`` holding the average for
             each ID
    """
    ids = np.asarray(a)

    # `inverse` gives, for every element of `a`, the position of its ID in
    # `unique_ids`, so each distinct ID is looked up in the dict only once.
    unique_ids, inverse = np.unique(ids, return_inverse=True)

    # One average per distinct ID, falling back to `default` for unseen IDs
    averages_array = np.array([averages_dict.get(id_, default) for id_ in unique_ids])

    # The shape of `inverse` differs between numpy versions for N-d input;
    # reshaping makes the result always match the shape of `a`.
    return averages_array[inverse.reshape(ids.shape)]

# Example usage
# Example usage
# Each ID in `a` is replaced by its entry in `averages_dict`, in order,
# so the expected result is [6. 4. 5. 6. 4.].
averages_dict = {'cat1': 4.0, 'cat2': 5.0, 'cat3': 6.0}  # Replace with your calculated averages
a = np.array(['cat3', 'cat1', 'cat2', 'cat3', 'cat1'])    # New category IDs array

b = map_averages_to_new_ids(averages_dict, a)
print(b)


[6. 4. 5. 6. 4.]
