In [1]:
import numpy  as np
import pandas as pd 

In [2]:
from cgem import * 

In [3]:
#############################################################################
#############################################################################

In [120]:
notes = '''

class CGEM:
    
    def __init__(self):
        self.df1 = None
        self.YVar = None
        self.TermList = None
        self.TrueForm = None
        self.tparams = None
        self.target_ival = None
        self.epoch_logs = [] 

    def load_df(self, df): # #df_name='df'):
        self.df1 = df.copy()
        self.train_len = len(df)

    def define_form(self, formula="TGT_Z = CAT_D_EFF * LIN_REG_EFF"):
        self.YVar     = self.get_vars(formula, side='left')[0]
        self.TrueForm = self.reform(formula, self.YVar)
        self.TermList = self.get_vars(formula, side='right')
        
        # Initializing the Maximum Learning Rate:
        self.TermLR = {} 
        for term in self.TermList: 
            self.TermLR[term] = 0.1   
        # Special Provision:
        #self.TermLR['const'] = 0.3  
        
    def define_terms(self, terms_params):
        self.tparams = dict(terms_params)
        self.target_ival = eval(f'self.df1["{self.YVar}"].mean()')

    def reform(self, eq_str1="y=m*x+b", solve_for='x'):
        eq_str1 = eq_str1.replace(' ', '').replace('==', '=').replace('~', '=')
        left, right = tuple(eq_str1.split('='))
        sleft, sright = sympify(left, evaluate=False), sympify(right, evaluate=False)
        atoms = list(sleft.atoms()) + list(sright.atoms())
        for atom in atoms:
            try:
                exec(f"{atom}=Symbol('{atom}')")
            except:
                pass
        eq1 = eval(f"Eq({left}, {right})")
        
        #---------------------------------------------------------
        self.left, self.right = left, right
        self.sleft, self.sright = sleft, sright
        self.eq1 = eq1 
        #---------------------------------------------------------
        
        eq2 = eval(f"Eq({solve_for}, solve(eq1, {solve_for})[0])")
        eq_str2 = str(eq2)[3:-1].replace(' ', '').replace(',', ' = ')
        
        #---------------------------------------------------------
        self.last_left, self.last_right = left, right
        self.last_sleft, self.last_sright = sleft, sright
        self.last_eq1 = eq1 
        self.last_eq2 = eq2
        self.last_eq_str2 = eq_str2
        #---------------------------------------------------------
        
        return eq_str2
    
    def get_vars(self, eq_str1="y=m*x+b", side='both'):
        eq_str1 = eq_str1.replace(' ', '').replace('==', '=').replace('~', '=')
        left, right = tuple(eq_str1.split('='))
        sleft, sright = sympify(left, evaluate=False), sympify(right, evaluate=False)

        if side == 'both':    atoms = list(sleft.atoms()) + list(sright.atoms())
        elif side == 'right': atoms = list(sright.atoms())
        elif side == 'left':  atoms = list(sleft.atoms())

        # Filter out non-symbol atoms and sort them
        found_vars = sorted(str(atom) for atom in atoms if atom.is_Symbol)
        return found_vars 

    def eq2np(self, eq_str):
        eq_conv = [['log(', 'np.log('], ['exp(', 'np.exp(']]
        for a, b in eq_conv:
            eq_str = eq_str.replace(a, b)
        return eq_str

    def np2eq(self, eq_str):
        eq_conv = [['log(', 'np.log('], ['exp(', 'np.exp(']]
        for a, b in eq_conv:
            eq_str = eq_str.replace(b, a)
        return eq_str

    def evaluation_string(self, eq_str1="y=m*x+b", solve_for='x', dfname='df1', tvars=[]):
        eq_str2 = self.reform(eq_str1, solve_for)
        numpy_form = eq_str2.split('=')[1].strip()
        numpy_form = self.eq2np(numpy_form)
        for tvar in tvars:
            numpy_form = numpy_form.replace(tvar, f"{dfname}['{tvar}']")
        return numpy_form

    def evaluation(self, eq_str1="y=m*x+b", solve_for='x', dfname='df1', tvars=[]):
        es = self.evaluation_string(eq_str1, solve_for, dfname, tvars)
        self.es = es
        return eval(es) 

    def fit(self, n_epochs=50,verbose=False):
        # Creates the initial version of the Transient Effects DataFrame: 
        self.initialize_tdf() # << self.TDF is created.

        # Preserve the values of the Target Variable for later evaluation
        TrueVals = self.TDF[self.YVar].values 

        # Initial Evaluation of Predictions
        #preds = self.evaluation(self.TrueForm, self.YVar, 'TDF', tvars=self.TermList)
        #actuals = TrueVals
        #R2 = max(round(r2_score(actuals, preds), 5), 0.00001)

        for epoch_num in range(1,n_epochs+1):
            if verbose==True and epoch_num % 1 == 0:  # Adjust this condition for controlling the print frequency
                print(f"\n{'#' * 50}\nLearning Epoch: {epoch_num}")

            # Initial Evaluation
            yhat1 = self.evaluation(self.TrueForm, self.YVar, 'self.TDF', tvars=self.TermList + [self.YVar])
            rmse1 = self.calc_rmse(TrueVals, yhat1)
            rsq1  = self.calc_r2(TrueVals, yhat1)

            model_log = {}
            NewEffects = {}
            for term in self.TermList: 
                #self.term = str(term) 
                
                if 'lr' in self.tparams[term]: 
                    self.TermLR[term] = self.tparams[term]['lr'] 
                
                self.term_tdf1 = self.TDF[[self.YVar] + self.TermList].copy()
                self.term_tdf2 = self.term_tdf1.copy()

                # Old Effects
                old_effects = self.term_tdf1[term].values
                
                # Implied Effects
                implied_effects = self.evaluation(self.TrueForm, term, 'self.term_tdf1', tvars=self.TermList + [self.YVar])
                
                # Fit a new model
                y = implied_effects
                xvars = [xvar for xvar in self.tparams[term]['xvars'] if xvar!=''] 
                if len(xvars)>0: X = self.df1[xvars].values 
                else: X = np.ones(self.train_len) 
                model = eval(self.tparams[term]['model'])
                model.fit(X, y) 

                # Predict new effects
                new_effects = model.predict(X) 
                self.new_effects = new_effects      ## DEBUG
                self.term_tdf2[term] = new_effects

                # Evaluate performance after learning new effects
                yhat2 = self.evaluation(self.TrueForm, self.YVar, 'self.term_tdf2', tvars=self.TermList + [self.YVar])
                rmse2 = self.calc_rmse(TrueVals, yhat2) 
                rsq2 = self.calc_r2(TrueVals, yhat2)

                # Update effects
                LRate = self.TermLR[term]
                deltas = new_effects - old_effects
                learned_effects = old_effects + (LRate * deltas)
                NewEffects[term] = learned_effects

                model_log[term] = {
                    'm_str':self.tparams[term]['model'], 
                    'xvars':self.tparams[term]['xvars'],
                    'model':model, 
                    'LRate':LRate, 
                    'rmse1':rmse1,
                    'rmse2':rmse2,
                    'rsq1' :rsq1 ,
                    'rsq2' :rsq2 ,
                }

            # Update TDF with new effects
            for term in self.TermList:
                self.TDF[term] = NewEffects[term]

            # Final evaluation for this iteration
            yhat2 = self.evaluation(self.TrueForm, self.YVar, 'self.TDF', tvars=self.TermList + [self.YVar])
            rmse2 = self.calc_rmse(TrueVals, yhat2) 
            rsq2  = self.calc_r2(TrueVals, yhat2)

            elog = {
                'epoch' : epoch_num,
                'models': model_log,  
            }
            self.epoch_logs.append(elog) 

            if verbose==True and epoch_num % 1 == 0:  # Adjust this condition for controlling the print frequency
                print(f"{'-' * 50}\nRMSE 1: {rmse1}\nRMSE 2: {rmse2}\nDELTA: {rmse2 - rmse1}")
                print(f"RSQ 1: {rsq1}\nRSQ 2: {rsq2}\nDELTA: {rsq2 - rsq1}\n{'-' * 50}")

        print('CGEM model fitting complete. ('+str(epoch_num)+' epochs)')  
    
    def predict(self, X):
        """
        Predict using the CGEM model.

        Parameters:
        X (pandas.DataFrame): Input features for making predictions.

        Returns:
        numpy.ndarray: Predicted values.
        """
        # Create a DataFrame for storing the predictions:
        self.PDF = X.copy() 
        self.last_log = self.epoch_logs[-1]
        self.pred_len = len(X) 
        
        NewEffects = {}
        # Apply the learned effects to the prediction DataFrame
        for term in self.TermList:
            if term == self.YVar: continue  # or term == 'const': continue            
            # Load the last available effects model for the given term: 
            self.last_model = deepcopy(self.last_log['models'][term]['model'])
            
            if "ScalarTerm" in str(self.last_model):
                self.PDF[term] = self.last_model.scalar 
                continue 

            # Predict new effects
            #----------------------------------------------------------
            xvars = [xvar for xvar in self.tparams[term]['xvars'] if xvar!=''] 
            if len(xvars)>0: self.X2 = X[xvars].values 
            else: self.X2 = np.array([np.ones(self.pred_len)]) 
            
            #self.X2 = X[self.tparams[term]['xvars']].values
            #----------------------------------------------------------
            pred_effects = self.last_model.predict(self.X2)
            self.pred_effects = pred_effects
            self.PDF[term] = pred_effects
            
        yhat2 = self.evaluation(
            self.TrueForm,
            self.YVar,
            'self.PDF',
            tvars=self.TermList+[self.YVar]
        )
        return yhat2
    
    
    def initialize_tdf(self):
        """
        Initialize the Transient DataFrame (TDF) that holds all the currently learned effect values.
        """
        self.RDF = pd.DataFrame()
        self.RDF[self.YVar] = self.df1[self.YVar].values

        for term in self.TermList:
            if term==self.YVar: continue
            
            form2 = str(self.TrueForm)
            
            #for term2 in self.TermList:
            #    if term2 == term or term2 == self.YVar: continue
            #    null_val = self.tparams[term2]['ival']
            #    form2 = form2.replace(term2, str(null_val))

            #--------------------------------
            self.term = term 
            self.form2 = form2
            #--------------------------------
            
            self.form3a = self.reform(form2, term)
            self.form3 = self.eq2np(self.form3a).replace(term, 'term_vals')
            
            #--------------------------------
            self.last_term = term 
            self.last_form2 = form2
            #--------------------------------

            yvar_vals = f"self.df1['{self.YVar}'].values"
            self.form3 = self.form3.replace(self.YVar, yvar_vals) 
            expr = self.form3.split(' = ')[1]

            #exec(self.form3) 
            self.expr = expr
            
            term_vals = eval(expr) 
            term_vals = list(term_vals)
            for _ in range(5): shuffle(term_vals)
            self.RDF[term] = term_vals

            # Special Provision
            if term == 'const': self.RDF[term] = 1.0
            else: self.RDF[term] = self.tparams[term]['ival']

        if 'const' in self.TermList: self.RDF['const'] = self.RDF['const'].mean()

        self.TDF = self.RDF.copy() 
        #print('Done Initializing Effects.') 


    def calc_r2(self,actual, predictions):
        """
        Calculate the R-squared value between actual and predicted values.

        Parameters:
        actual (numpy array): The actual values.
        predictions (numpy array): The predicted values.

        Returns:
        float: The R-squared value.
        """
        # Calculate the mean of actual values
        mean_actual = np.mean(actual)

        # Calculate the total sum of squares (SST)
        sst = np.sum(np.square(np.subtract(actual, mean_actual)))

        # Calculate the residual sum of squares (SSR)
        ssr = np.sum(np.square(np.subtract(actual, predictions)))

        # Calculate R-squared
        r_squared = 1 - (ssr / sst)
        return r_squared

    def calc_rmse(self,actual, predictions):
        """
        Calculate the Root Mean Square Error (RMSE) between actual and predicted values.

        Parameters:
        actual (numpy array): The actual values.
        predictions (numpy array): The predicted values.

        Returns:
        float: The RMSE value.
        """
        # Calculate the square of differences
        differences = np.subtract(actual, predictions)
        squared_differences = np.square(differences)

        # Calculate the mean of squared differences
        mean_squared_differences = np.mean(squared_differences)

        # Calculate the square root of the mean squared differences (RMSE)
        rmse = np.sqrt(mean_squared_differences)
        return rmse



'''

In [123]:

def norm_flat(X):
    """
    Return ``X`` as a flat (1-D) NumPy array.

    Parameters:
    X (array-like, pandas.DataFrame or pandas.Series): Values to flatten.

    Returns:
    numpy.ndarray: A new 1-D array with the elements of ``X`` in row-major order.
    """
    # Test the type itself rather than searching its repr for the words
    # 'DataFrame' / 'Series': the string search also matched unrelated
    # classes that merely carry such a name and may not expose ``.values``.
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.values
    return np.array(X).flatten()


#-----------------------------------------------------------


class DirectVar:
    """Pass-through term model: the predicted effect is the input itself."""

    def __init__(self):
        """Nothing to configure; the model holds no state."""

    def fit(self,X=[],y=[]):
        """No-op: nothing is learned and both arguments are ignored."""
        return None

    def predict(self,X=[]):
        """Return ``X`` flattened to a 1-D NumPy array via ``norm_flat``."""
        flat_values = norm_flat(X)
        return flat_values


class ScalarTerm:
    """Constant term model: a single learned number applied to every row."""

    def __init__(self):
        """No state is created until ``fit`` is called."""

    def fit(self,X=[],y=[]):
        """
        Store the mean of ``y`` as ``self.scalar``.

        Parameters:
        X (array-like): Ignored; accepted only so the signature matches the other term models.
        y (array-like): Target values whose mean becomes the scalar.
        """
        targets = np.array(y)
        self.scalar = targets.mean()

    def predict(self,X=[]):
        """
        Repeat the stored scalar once per row of ``X``.

        Parameters:
        X (array-like): Only its length is used.

        Returns:
        numpy.ndarray: Array of length ``len(X)`` filled with ``self.scalar``.
        """
        n_rows = len(X)
        return self.scalar * np.ones(n_rows)
    
    
class CatRegModel:
    """Regression on encoded categorical features (encoder + linear model)."""

    def __init__(self):
        """
        Create the two components used by the model: an ``LBZ`` encoder for
        the categorical input and an ``OLS`` regressor fitted on its output.
        """
        # NOTE(review): LBZ and OLS are not defined in this notebook;
        # presumably they come from `from cgem import *` - confirm.
        self.encoder = LBZ()
        self.model = OLS()

    def fit(self, X, y):
        """
        Fit the encoder on ``X`` and the regressor on the encoded features.

        Parameters:
        X (array-like): Feature variable.
        y (array-like): Target variable.
        """
        self.model.fit(self.encoder.fit_transform(X), y)

    def predict(self, X):
        """
        Encode ``X`` with the already-fitted encoder and predict from it.

        Parameters:
        X (array-like): Feature variable.

        Returns:
        numpy.ndarray: Predicted values.
        """
        return self.model.predict(self.encoder.transform(X))


class CatEffectTerm:
    """
    Categorical effect model: the effect of a category is the mean of the
    target values observed for that category during ``fit``.
    """

    def __init__(self):
        pass

    def fit(self,X,y):
        """
        Learn one average target value per category.

        Parameters:
        X (array-like): Category label per training record (any shape; it is flattened).
        y (array-like): Target effect per training record.
        """
        # Flatten to a 1-D array of labels. The previous body used ``x``
        # without ever defining it, so every call raised NameError.
        x = norm_flat(X)
        self.cat_vals = self.calc_cat_means(x,y)

    def predict(self,X):
        """
        Map every record's category to the average learned in ``fit``.

        Parameters:
        X (array-like): Category label per record (any shape; it is flattened).

        Returns:
        numpy.ndarray: One value per record; categories not seen in ``fit`` map to 0.
        """
        X2 = norm_flat(X)
        # map_cat_vals needs the learned mapping as its first argument.
        preds = self.map_cat_vals(self.cat_vals, X2)
        return preds

    def calc_cat_means(self,x,y):
        """
        Compute the mean of ``y`` for each distinct value in ``x``.

        Parameters:
        x (array-like): Category labels (strings or numbers).
        y (array-like): Numeric observations, same length as ``x``.

        Returns:
        dict: Category label -> mean of its observations.
        """
        x = np.asarray(x).ravel()
        y = np.asarray(y, dtype=float).ravel()
        # ``inverse`` holds, for every record, the position of its label in
        # ``categories``. Counting on these positions (instead of on the raw
        # labels) makes np.bincount work for string labels as well as ints.
        categories, inverse, counts = np.unique(x, return_inverse=True, return_counts=True)
        sums = np.bincount(inverse.ravel(), weights=y, minlength=len(categories))
        averages = sums / counts
        return dict(zip(categories, averages))

    def map_cat_vals(self,cats2vals,new_cats):
        """
        Look up a value for every label in ``new_cats``.

        Parameters:
        cats2vals (dict): Category label -> value.
        new_cats (array-like): Labels to translate.

        Returns:
        numpy.ndarray: Value per label; labels missing from ``cats2vals`` get 0.
        """
        new_cats = np.asarray(new_cats).ravel()
        # One dictionary lookup per distinct label, then fan the results back
        # out to the full array through the inverse index.
        unique_ids, inverse = np.unique(new_cats, return_inverse=True)
        averages_array = np.array([cats2vals.get(id_, 0) for id_ in unique_ids], dtype=float)
        return averages_array[inverse.ravel()]



In [138]:
class CGEM:
    
    def __init__(self):
        self.df1 = None
        self.YVar = None
        self.TermList = None
        self.TrueForm = None
        self.tparams = None
        self.target_ival = None
        self.epoch_logs = [] 

    def load_df(self, df): # #df_name='df'):
        self.df1 = df.copy()
        self.train_len = len(df)

    def define_form(self, formula="TGT_Z = CAT_D_EFF * LIN_REG_EFF"):
        """
        Register the master effects formula.

        Sets ``self.YVar`` (first variable found on the left-hand side),
        ``self.TrueForm`` (the formula solved for that variable),
        ``self.TermList`` (sorted variables of the right-hand side) and
        ``self.TermLR`` (learning rate per term).

        Parameters:
        formula (str): Equation of the form "<target> = <expression of terms>".
        """
        lhs_vars = self.get_vars(formula, side='left')
        self.YVar = lhs_vars[0]
        self.TrueForm = self.reform(formula, self.YVar)
        self.TermList = self.get_vars(formula, side='right')

        # Every term starts with the same maximum learning rate.
        self.TermLR = {term: 0.1 for term in self.TermList}
        
    def define_terms(self, terms_params):
        self.tparams = dict(terms_params)
        self.target_ival = eval(f'self.df1["{self.YVar}"].mean()')

    def reform(self, eq_str1="y=m*x+b", solve_for='x'):
        """
        Rearrange an equation so that ``solve_for`` stands alone on the left.

        Parameters:
        eq_str1 (str): Equation text; '=', '==' and '~' are all accepted as
            the separator between the two sides.
        solve_for (str): Name of the variable to isolate.

        Returns:
        str: "<solve_for> = <expression>", with no spaces inside the expression.

        Side effects: stores the intermediate objects on the instance
        (``left``/``right``/``sleft``/``sright``/``eq1`` and the ``last_*``
        attributes) for inspection.
        """
        eq_str1 = eq_str1.replace(' ', '').replace('==', '=').replace('~', '=')
        left, right = tuple(eq_str1.split('='))
        sleft, sright = sympify(left, evaluate=False), sympify(right, evaluate=False)
        atoms = list(sleft.atoms()) + list(sright.atoms())

        # Build the symbol table as an explicit dict and hand it to eval().
        # The previous exec()-into-locals approach relied on exec() and eval()
        # sharing the function's implicit locals mapping, which is no longer
        # the case from Python 3.13 on (each call gets its own snapshot).
        # Atoms whose text is not an identifier (numbers) are skipped, exactly
        # as the old try/except skipped them.
        namespace = {
            name: Symbol(name)
            for name in map(str, atoms)
            if name.isidentifier()
        }

        # NOTE(review): eval() runs the formula text as Python code - only
        # pass formulas from a trusted source.
        eq1 = eval(f"Eq({left}, {right})", globals(), namespace)

        #---------------------------------------------------------
        self.left, self.right = left, right
        self.sleft, self.sright = sleft, sright
        self.eq1 = eq1
        #---------------------------------------------------------

        namespace['eq1'] = eq1
        eq2 = eval(f"Eq({solve_for}, solve(eq1, {solve_for})[0])", globals(), namespace)
        # str(eq2) looks like "Eq(x, <rhs>)": drop the wrapper and turn only
        # the FIRST comma (the lhs/rhs separator) into ' = ', so commas inside
        # the right-hand side (multi-argument functions) are left intact.
        eq_str2 = str(eq2)[3:-1].replace(' ', '').replace(',', ' = ', 1)

        #---------------------------------------------------------
        self.last_left, self.last_right = left, right
        self.last_sleft, self.last_sright = sleft, sright
        self.last_eq1 = eq1
        self.last_eq2 = eq2
        self.last_eq_str2 = eq_str2
        #---------------------------------------------------------

        return eq_str2
    
    def get_vars(self, eq_str1="y=m*x+b", side='both'):
        """
        List the variable names that appear in an equation.

        Parameters:
        eq_str1 (str): Equation text; '=', '==' and '~' are all accepted as
            the separator between the two sides.
        side (str): 'left', 'right' or 'both' - which side(s) to inspect.

        Returns:
        list of str: Sorted, de-duplicated symbol names.

        Raises:
        ValueError: If ``side`` is not one of the three accepted values.
        """
        eq_str1 = eq_str1.replace(' ', '').replace('==', '=').replace('~', '=')
        left, right = tuple(eq_str1.split('='))
        sleft, sright = sympify(left, evaluate=False), sympify(right, evaluate=False)

        left_atoms, right_atoms = set(sleft.atoms()), set(sright.atoms())
        atoms_by_side = {
            'left': left_atoms,
            'right': right_atoms,
            # Set union: a symbol present on both sides is reported once.
            'both': left_atoms | right_atoms,
        }
        if side not in atoms_by_side:
            # Previously an unknown value fell through every branch and
            # surfaced as an UnboundLocalError on ``atoms``.
            raise ValueError(f"side must be 'left', 'right' or 'both', got {side!r}")

        # Filter out non-symbol atoms (numbers) and sort the names
        found_vars = sorted(str(atom) for atom in atoms_by_side[side] if atom.is_Symbol)
        return found_vars

    def eq2np(self, eq_str):
        """
        Prefix ``log(`` and ``exp(`` calls in an expression string with ``np.``.

        The conversion is idempotent: calls that already carry the ``np.``
        prefix are left as they are instead of becoming ``np.np.log(``.

        Parameters:
        eq_str (str): Expression text.

        Returns:
        str: Expression text using ``np.log(`` / ``np.exp(``.
        """
        eq_conv = [['log(', 'np.log('], ['exp(', 'np.exp(']]
        for a, b in eq_conv:
            # Strip an existing prefix first so it is never applied twice.
            eq_str = eq_str.replace(b, a).replace(a, b)
        return eq_str

    def np2eq(self, eq_str):
        """
        Strip the ``np.`` prefix from ``np.log(`` and ``np.exp(`` calls.

        Parameters:
        eq_str (str): Expression text written with NumPy function names.

        Returns:
        str: The same expression using plain ``log(`` / ``exp(``.
        """
        without_log_prefix = eq_str.replace('np.log(', 'log(')
        return without_log_prefix.replace('np.exp(', 'exp(')

    def evaluation_string(self, eq_str1="y=m*x+b", solve_for='x', dfname='df1', tvars=[]):
        eq_str2 = self.reform(eq_str1, solve_for)
        numpy_form = eq_str2.split('=')[1].strip()
        numpy_form = self.eq2np(numpy_form)
        for tvar in tvars:
            numpy_form = numpy_form.replace(tvar, f"{dfname}['{tvar}']")
        return numpy_form

    def evaluation(self, eq_str1="y=m*x+b", solve_for='x', dfname='df1', tvars=[]):
        es = self.evaluation_string(eq_str1, solve_for, dfname, tvars)
        self.es = es
        return eval(es) 

    def fit(self, n_epochs=50,verbose=False):
        # Creates the initial version of the Transient Effects DataFrame: 
        self.initialize_tdf() # << self.TDF is created.

        # Preserve the values of the Target Variable for later evaluation
        TrueVals = self.TDF[self.YVar].values 

        # Initial Evaluation of Predictions
        #preds = self.evaluation(self.TrueForm, self.YVar, 'TDF', tvars=self.TermList)
        #actuals = TrueVals
        #R2 = max(round(r2_score(actuals, preds), 5), 0.00001)

        for epoch_num in range(1,n_epochs+1):
            if verbose==True and epoch_num % 1 == 0:  # Adjust this condition for controlling the print frequency
                print(f"\n{'#' * 50}\nLearning Epoch: {epoch_num}")

            # Initial Evaluation
            yhat1 = self.evaluation(self.TrueForm, self.YVar, 'self.TDF', tvars=self.TermList + [self.YVar])
            rmse1 = self.calc_rmse(TrueVals, yhat1)
            rsq1  = self.calc_r2(TrueVals, yhat1)

            model_log = {}
            NewEffects = {}
            for term in self.TermList: 
                #self.term = str(term) 
                
                if 'lr' in self.tparams[term]: 
                    self.TermLR[term] = self.tparams[term]['lr'] 
                
                self.term_tdf1 = self.TDF[[self.YVar] + self.TermList].copy()
                self.term_tdf2 = self.term_tdf1.copy()

                # Old Effects
                old_effects = self.term_tdf1[term].values
                
                # Implied Effects
                implied_effects = self.evaluation(self.TrueForm, term, 'self.term_tdf1', tvars=self.TermList + [self.YVar])
                
                # Fit a new model
                y = implied_effects
                xvars = [xvar for xvar in self.tparams[term]['xvars'] if xvar!=''] 
                if len(xvars)>0: X = self.df1[xvars].values 
                else: X = np.ones(self.train_len) 
                model = eval(self.tparams[term]['model'])
                model.fit(X, y) 

                # Predict new effects
                new_effects = model.predict(X) 
                self.new_effects = new_effects      ## DEBUG
                self.term_tdf2[term] = new_effects

                # Evaluate performance after learning new effects
                yhat2 = self.evaluation(self.TrueForm, self.YVar, 'self.term_tdf2', tvars=self.TermList + [self.YVar])
                rmse2 = self.calc_rmse(TrueVals, yhat2) 
                rsq2 = self.calc_r2(TrueVals, yhat2)

                # Update effects
                LRate = self.TermLR[term]
                deltas = new_effects - old_effects
                learned_effects = old_effects + (LRate * deltas)
                NewEffects[term] = learned_effects

                model_log[term] = {
                    'm_str':self.tparams[term]['model'], 
                    'xvars':self.tparams[term]['xvars'],
                    'model':model, 
                    'LRate':LRate, 
                    'rmse1':rmse1,
                    'rmse2':rmse2,
                    'rsq1' :rsq1 ,
                    'rsq2' :rsq2 ,
                }

            # Update TDF with new effects
            for term in self.TermList:
                self.TDF[term] = NewEffects[term]

            # Final evaluation for this iteration
            yhat2 = self.evaluation(self.TrueForm, self.YVar, 'self.TDF', tvars=self.TermList + [self.YVar])
            rmse2 = self.calc_rmse(TrueVals, yhat2) 
            rsq2  = self.calc_r2(TrueVals, yhat2)

            elog = {
                'epoch' : epoch_num,
                'models': model_log,  
            }
            self.epoch_logs.append(elog) 

            if verbose==True and epoch_num % 1 == 0:  # Adjust this condition for controlling the print frequency
                print(f"{'-' * 50}\nRMSE 1: {rmse1}\nRMSE 2: {rmse2}\nDELTA: {rmse2 - rmse1}")
                print(f"RSQ 1: {rsq1}\nRSQ 2: {rsq2}\nDELTA: {rsq2 - rsq1}\n{'-' * 50}")

        print('CGEM model fitting complete. ('+str(epoch_num)+' epochs)')  
    
    def predict(self, X):
        """
        Predict using the CGEM model.

        Parameters:
        X (pandas.DataFrame): Input features for making predictions.

        Returns:
        numpy.ndarray: Predicted values.
        """
        # Create a DataFrame for storing the predictions:
        self.PDF = X.copy() 
        self.last_log = self.epoch_logs[-1]
        self.pred_len = len(X) 
        
        NewEffects = {}
        # Apply the learned effects to the prediction DataFrame
        for term in self.TermList:
            if term == self.YVar: continue  # or term == 'const': continue            
            # Load the last available effects model for the given term: 
            self.last_model = deepcopy(self.last_log['models'][term]['model'])
            
            if "ScalarTerm" in str(self.last_model):
                self.PDF[term] = self.last_model.scalar 
                continue 

            # Predict new effects
            #----------------------------------------------------------
            xvars = [xvar for xvar in self.tparams[term]['xvars'] if xvar!=''] 
            if len(xvars)>0: self.X2 = X[xvars].values 
            else: self.X2 = np.array([np.ones(self.pred_len)]) 
            
            #self.X2 = X[self.tparams[term]['xvars']].values
            #----------------------------------------------------------
            pred_effects = self.last_model.predict(self.X2)
            self.pred_effects = pred_effects
            self.PDF[term] = pred_effects
            
        yhat2 = self.evaluation(
            self.TrueForm,
            self.YVar,
            'self.PDF',
            tvars=self.TermList+[self.YVar]
        )
        return yhat2
    
    
    def initialize_tdf(self):
        """
        Initialize the Transient DataFrame (TDF) that holds all the currently learned effect values.

        Builds ``self.RDF`` with the target column taken from ``self.df1``
        plus one column per term, then stores a copy of it as ``self.TDF``.
        Each term column ends up holding the term's configured 'ival',
        except a term named 'const', which ends up holding 1.0.

        Side effects: also sets ``self.rdf`` and several inspection
        attributes (``term``, ``form2``, ``form3a``, ``form3``, ``expr``,
        ``last_term``, ``last_form2``).
        """

        # Result frame: starts with only the target column.
        self.RDF = pd.DataFrame()
        self.RDF[self.YVar] = self.df1[self.YVar].values

        # Scratch copy of the training data in which every term is a column
        # filled with its initial value; the expressions built below read
        # from this frame.
        self.rdf = self.df1.copy()
        for term in self.TermList:
            self.rdf[term] = self.tparams[term]['ival']

        for term in self.TermList:
            if term==self.YVar: continue

            form2 = str(self.TrueForm)

            #for term2 in self.TermList:
            #    if term2 == term or term2 == self.YVar: continue
            #    null_val = self.tparams[term2]['ival']
            #    form2 = form2.replace(term2, str(null_val))

            #--------------------------------
            self.term = term
            self.form2 = form2
            #--------------------------------

            # Solve the master formula for this term, switch to NumPy
            # function names and rename the term itself to 'term_vals'.
            self.form3a = self.reform(form2, term)
            self.form3 = self.eq2np(self.form3a).replace(term, 'term_vals')

            #--------------------------------
            self.last_term = term
            self.last_form2 = form2
            #--------------------------------


            # Template for a column lookup in the scratch frame.
            df_vals_base = "self.rdf['TERM'].values"

            # Replace the target name, then every other term name still
            # present, by its column lookup.
            # NOTE(review): these are plain substring replacements; a name
            # that is contained in another name (or in the inserted lookup
            # text) would be replaced in the wrong place - confirm that term
            # names cannot overlap.
            df_vals = df_vals_base.replace('TERM',self.YVar)
            self.form3 = self.form3.replace(self.YVar, df_vals)
            for term2 in self.TermList:
                if term2 not in self.form3: continue
                df_vals = df_vals_base.replace('TERM',term2)
                self.form3 = self.form3.replace(term2, df_vals)

            # Keep only the right-hand side of "term_vals = <expression>".
            expr = self.form3.split(' = ')[1]

            self.expr = expr

            # Evaluate the implied values of this term from the scratch frame.
            term_vals = eval(expr)
            term_vals = list(term_vals)

            # NOTE(review): `shuffle` is not defined in this notebook;
            # presumably it comes from `from cgem import *` - confirm.
            for _ in range(5): shuffle(term_vals)
            self.RDF[term] = term_vals

            # Special Provision
            # NOTE(review): the column written just above is overwritten
            # here, so the evaluated and shuffled values never reach TDF.
            if term == 'const': self.RDF[term] = 1.0
            else: self.RDF[term] = self.tparams[term]['ival']

        # The 'const' column is replaced by its own mean.
        if 'const' in self.TermList: self.RDF['const'] = self.RDF['const'].mean()

        self.TDF = self.RDF.copy()
        #print('Done Initializing Effects.')


    def calc_r2(self,actual, predictions):
        """
        Coefficient of determination (R-squared) of predictions against actual values.

        Parameters:
        actual (numpy array): The actual values.
        predictions (numpy array): The predicted values.

        Returns:
        float: 1 - (residual sum of squares / total sum of squares).
        """
        def _sum_sq_diff(lhs, rhs):
            # Sum of squared element-wise differences.
            return np.sum(np.square(np.subtract(lhs, rhs)))

        # Spread of the actual values around their own mean.
        total_ss = _sum_sq_diff(actual, np.mean(actual))
        # Spread of the actual values around the predictions.
        residual_ss = _sum_sq_diff(actual, predictions)
        return 1 - (residual_ss / total_ss)

    def calc_rmse(self,actual, predictions):
        """
        Root Mean Square Error (RMSE) of predictions against actual values.

        Parameters:
        actual (numpy array): The actual values.
        predictions (numpy array): The predicted values.

        Returns:
        float: Square root of the mean squared difference.
        """
        squared_errors = np.square(np.subtract(actual, predictions))
        return np.sqrt(np.mean(squared_errors))



In [139]:
#############################################################################
#############################################################################

In [140]:
LEN = 10000
SEED = 42  # fixed seed so the synthetic data and the train/test split are reproducible
np.random.seed(SEED)

# Synthetic linear data: Y = 4*A + 5*B + 1 + noise
a = np.random.normal(0,2,LEN)
b = np.random.normal(0,1,LEN)
e = np.random.normal(0,0.5,LEN)
y = (4*a) + (5*b) + (1) + e

df = pd.DataFrame()
df['A'] = a
df['B'] = b
df['Y'] = y

#---------------------------------------------------------

# Random split: each row goes to the training half with probability 0.5.
is_train = (np.random.uniform(0,1,len(df))) <= 0.5
is_test  = ~is_train

DF1 = df[is_train].copy()
DF2 = df[is_test ].copy()

In [159]:
### MASTER EFFECTS FORMULA: the target is the sum of two linear effects.
Formula = "Y = LIN_REG_A + LIN_REG_B"

### TERM MODEL PARAMETERS, one entry per term of the formula:
###   model - text of the expression that builds the term's model
###   xvars - independent variables for this effect
###   ival  - initial value of the effect
tparams = dict(
    LIN_REG_A=dict(model="OLS(fit_intercept=True)", xvars=['A'], ival=0),
    LIN_REG_B=dict(model="OLS(fit_intercept=False)", xvars=['B'], ival=0),
)

In [160]:
# Build the model on the training half, declare the formula and the
# per-term settings, then run 25 learning epochs with progress output.
model = CGEM()
model.load_df(DF1)
model.define_form(Formula)
model.define_terms(tparams)
model.fit(n_epochs=25, verbose=True);


##################################################
Learning Epoch: 1
--------------------------------------------------
RMSE 1: 9.593975241320093
RMSE 2: 8.6264275295496
DELTA: -0.9675477117704929
RSQ 1: -0.011996291751857457
RSQ 2: 0.18182973694798554
DELTA: 0.193826028699843
--------------------------------------------------

##################################################
Learning Epoch: 2
--------------------------------------------------
RMSE 1: 8.6264275295496
RMSE 2: 7.757048977366247
DELTA: -0.8693785521833535
RSQ 1: 0.18182973694798554
RSQ 2: 0.33843152589310643
DELTA: 0.1566017889451209
--------------------------------------------------

##################################################
Learning Epoch: 3
--------------------------------------------------
RMSE 1: 7.757048977366247
RMSE 2: 6.975943298265432
DELTA: -0.7811056791008149
RSQ 1: 0.33843152589310643
RSQ 2: 0.46495831614371264
DELTA: 0.1265267902506062
--------------------------------------------------

#########

In [161]:
# Score the fitted model on DF2, the rows that were not passed to load_df.
preds = model.predict(DF2) 
actuals = DF2['Y'].values
r2 = model.calc_r2(actuals, preds) 
# Report the hold-out R-squared rounded to 5 decimals.
print('CrosVal R-Squared:', round(r2, 5))  

CrosVal R-Squared: 0.99718


In [150]:
#############################################################################
#############################################################################

In [151]:
### MASTER EFFECTS FORMULA: explicit constant plus one linear term per input column.
Formula = "Y = const + LIN_REG_A + LIN_REG_B"

### TERM MODEL PARAMETERS: one entry per right-hand-side term of the formula.
# 'lr' is set per term here (presumably the term's learning rate — confirm).
tparams = dict(
    # Scalar term (not an OLS fit); xvars holds a single empty name.
    const=dict(model="ScalarTerm()", xvars=[''], ival=0, lr=0.3),
    # Intercept-free OLS terms, one per input column.
    LIN_REG_A=dict(model="OLS(fit_intercept=False)", xvars=['A'], ival=0, lr=0.1),
    LIN_REG_B=dict(model="OLS(fit_intercept=False)", xvars=['B'], ival=0, lr=0.1),
)

In [152]:
# Rebuild the model from scratch for the formula/tparams currently in scope
# and fit it on the training split DF1. This rebinds the global `model`.
model = CGEM() 
model.load_df(DF1)  
model.define_form(Formula) 
model.define_terms(tparams)  
# Trailing ';' suppresses display of the call's return value.
model.fit(25,verbose=True); 


##################################################
Learning Epoch: 1
--------------------------------------------------
RMSE 1: 9.53704590175917
RMSE 2: 8.574293502204927
DELTA: -0.9627523995542422
RSQ 1: -2.1829158773467583e-05
RSQ 2: 0.19168911823348989
DELTA: 0.19171094739226335
--------------------------------------------------

##################################################
Learning Epoch: 2
--------------------------------------------------
RMSE 1: 8.574293502204927
RMSE 2: 7.7093524073243085
DELTA: -0.8649410948806189
RSQ 1: 0.19168911823348989
RSQ 2: 0.3465422225767474
DELTA: 0.1548531043432575
--------------------------------------------------

##################################################
Learning Epoch: 3
--------------------------------------------------
RMSE 1: 7.7093524073243085
RMSE 2: 6.932337222071886
DELTA: -0.7770151852524227
RSQ 1: 0.3465422225767474
RSQ 2: 0.4716264173871222
DELTA: 0.1250841948103748
--------------------------------------------------

###

In [153]:
# Hold-out check: predict on DF2 (rows excluded from DF1) and compare with
# the observed Y values.
preds = model.predict(DF2) 
actuals = DF2['Y'].values
r2 = model.calc_r2(actuals, preds) 
print('CrosVal R-Squared:', round(r2, 5)) 

CrosVal R-Squared: 0.99718


In [170]:
#############################################################################
#############################################################################

In [175]:
### DEFINE THE MASTER EFFECTS FORMULA:
# Explicit intercept/slope parameterisation: every coefficient is its own
# scalar term and the input columns enter the formula directly.
# (The additive "const + LIN_REG_A + LIN_REG_B" form is fitted in an earlier cell.)
Formula = "Y = intercept + (slope1 * A_VAR) + (slope2 * B_VAR)"

# NOTE(review): the recorded output of the fit that follows this cell diverges
# (RMSE grows from ~9 to ~1482 by epoch 3). The 'lr' / 'ival' values below are
# the ones that produced that run and likely need tuning — TODO confirm.

### DEFINE THE TERM MODEL PARAMETERS:
tparams = {
    'intercept': {
        'model': "ScalarTerm()",   # Scalar term (single fitted value)
        'xvars': [''],             # Single empty name: no input column listed
        'ival' : 0,                # Initial value
        'lr'   : 0.1,              # Per-term rate (presumably learning rate — confirm)
    },
    'slope1': {
        'model': "ScalarTerm()",   # Scalar term multiplying A_VAR in the formula
        'xvars': [''],             # Single empty name: no input column listed
        'ival' : 1,                # Initial value
        'lr'   : 0.1,
    },
    'slope2': {
        'model': "ScalarTerm()",   # Scalar term multiplying B_VAR in the formula
        'xvars': [''],             # Single empty name: no input column listed
        'ival' : 1,                # Initial value
        'lr'   : 0.1,
    },
    'A_VAR': {
        'model': "DirectVar()",    # Direct-variable term (presumably passes column A through — confirm)
        'xvars': ['A'],            # Input column for this term
        'ival' : 1,                # Initial value
        'lr'   : 0.1,
    },
    'B_VAR': {
        'model': "DirectVar()",    # Direct-variable term (presumably passes column B through — confirm)
        'xvars': ['B'],            # Input column for this term
        'ival' : 1,                # Initial value
        'lr'   : 0.1,
    },
}

In [176]:
# Fit the intercept/slope formulation on the training split DF1.
# NOTE(review): in the recorded output below this run diverges — RMSE goes
# 9.06 -> 247.2 -> 1482.2 over epochs 2-3 — so this configuration is not
# a working model as recorded.
model = CGEM() 
model.load_df(DF1)  
model.define_form(Formula) 
model.define_terms(tparams)  
model.fit(25,verbose=True); 


##################################################
Learning Epoch: 1
--------------------------------------------------
RMSE 1: 9.584681952650271
RMSE 2: 9.063362746845772
DELTA: -0.5213192058044989
RSQ 1: -0.010036683054285733
RSQ 2: 0.09684880899776671
DELTA: 0.10688549205205244
--------------------------------------------------

##################################################
Learning Epoch: 2
--------------------------------------------------
RMSE 1: 9.063362746845772
RMSE 2: 247.2470175440263
DELTA: 238.18365479718054
RSQ 1: 0.09684880899776671
RSQ 2: -671.1154171923724
DELTA: -671.2122660013702
--------------------------------------------------

##################################################
Learning Epoch: 3
--------------------------------------------------
RMSE 1: 247.2470175440263
RMSE 2: 1482.1982547571072
DELTA: 1234.951237213081
RSQ 1: -671.1154171923724
RSQ 2: -24153.292965034478
DELTA: -23482.177547842104
--------------------------------------------------

######

In [162]:
# Scratch notes kept as an inert string (nothing inside it is executed here):
# a fragment that rearranges the model formula for one term via reform/eq2np
# and evaluates the resulting expression, with debug attributes interleaved.
Notes = '''

form2 = str(self.TrueForm)

#--------------------------------
self.term = term 
self.form2 = form2
#--------------------------------

self.form3 = self.reform(form2, term)
self.form3 = self.eq2np(self.form3).replace(term, 'term_vals')

#--------------------------------
self.last_term = term 
self.last_form2 = form2
#--------------------------------

yvar_vals = f"self.df1['{self.YVar}'].values"
self.form3 = self.form3.replace(self.YVar, yvar_vals) 
expr = self.form3.split(' = ')[1]

#exec(self.form3) 
self.expr = expr

term_vals = eval(expr) 

'''

In [55]:
# Scratch list of debug assignments written against `self`; the print below
# shows the same text with every 'self.' replaced by 'model.'.
# Rebinds the global name `Notes`.
Notes = '''

self.last_left, self.last_right = left, right
self.last_sleft, self.last_sright = sleft, sright
self.last_eq1 = eq1 
self.last_eq2 = eq2
self.last_eq_str2 = eq_str2

self.left, self.right = left, right
self.sleft, self.sright = sleft, sright
self.eq1 = eq1 

#--------------------------------
self.term = term 
self.form2 = form2
#--------------------------------
self.last_term = term 
self.last_form2 = form2
#--------------------------------

'''
print(Notes.replace('self.','model.'))



model.last_left, model.last_right = left, right
model.last_sleft, model.last_sright = sleft, sright
model.last_eq1 = eq1 
model.last_eq2 = eq2
model.last_eq_str2 = eq_str2

model.left, model.right = left, right
model.sleft, model.sright = sleft, sright
model.eq1 = eq1 

#--------------------------------
model.term = term 
model.form2 = form2
#--------------------------------
model.last_term = term 
model.last_form2 = form2
#--------------------------------




In [56]:
#--------------------------------

In [57]:
# Debug inspection: the value stored in the model's `last_term` attribute.
model.last_term

'slope2'

In [59]:
# Debug inspection: the stored left/right sides (`last_sleft`, `last_sright`).
model.last_sleft, model.last_sright

(Y, A_VAR*slope1 + B_VAR*slope2 + const)

In [60]:
# Debug inspection of `last_eq1`; wrapped in a list, presumably to get the
# plain-text repr shown in the output rather than a rendered equation.
[model.last_eq1] 

[Eq(Y, A_VAR*slope1 + B_VAR*slope2 + const)]

In [61]:
# Debug inspection of `last_eq2`; wrapped in a list, presumably to get the
# plain-text repr shown in the output rather than a rendered equation.
[model.last_eq2]

[Eq(Y, A_VAR*slope1 + B_VAR*slope2 + const)]

In [62]:
# Debug inspection: the stored `last_eq_str2` string, shown inside a list.
[model.last_eq_str2]

['Y = A_VAR*slope1+B_VAR*slope2+const']

In [63]:
# Debug inspection: the stored `last_form2` string.
model.last_form2

'Y = 1*1+1*slope2+0'

In [64]:
#--------------------------------

In [76]:
# Debug inspection of `last_term` again. The recorded value here ('const')
# differs from the earlier cell ('slope2'), so `model` was refitted in between.
model.last_term

'const'

In [78]:
# Debug inspection: stored left/right sides for the model currently bound to
# `model` (the recorded output shows the const + LIN_REG_A + LIN_REG_B form).
model.last_sleft, model.last_sright

(Y, LIN_REG_A + LIN_REG_B + const)

In [None]:
    def reform(self, eq_str1="y=m*x+b", solve_for='x'):
        """Rearrange an equation string so that ``solve_for`` stands alone on the left.

        Parameters
        ----------
        eq_str1 : str
            Equation text with exactly one separator. ``=``, ``==`` and ``~``
            are all accepted as the separator; spaces are ignored.
        solve_for : str
            Name of the symbol to isolate.

        Returns
        -------
        str
            The first solution returned by ``solve``, formatted as
            ``"<solve_for> = <expression>"`` with spaces removed from the
            expression.

        Raises
        ------
        ValueError
            If the text does not split into exactly one left and one right side.
        IndexError
            If ``solve`` returns no solution for ``solve_for``.

        Side effects
        ------------
        Stores the parsed pieces on ``self.left``, ``self.right``,
        ``self.sleft``, ``self.sright`` and ``self.eq1``.
        """
        eq_str1 = eq_str1.replace(' ', '').replace('==', '=').replace('~', '=')
        sides = eq_str1.split('=')
        if len(sides) != 2:
            raise ValueError(
                f"expected exactly one '=' separator in equation, got {eq_str1!r}"
            )
        left, right = sides
        sleft, sright = sympify(left, evaluate=False), sympify(right, evaluate=False)

        # Map every identifier-like atom to a Symbol in an explicit namespace.
        # Injecting names into the function's locals with exec() depends on
        # locals() write-through (gone in Python 3.13) and lets formula terms
        # collide with this method's own variables; numeric atoms cannot be
        # assigned to at all, so they are simply skipped here.
        namespace = {}
        for atom in list(sleft.atoms()) + list(sright.atoms()):
            name = str(atom)
            if name.isidentifier():
                namespace[name] = Symbol(name)
        namespace.setdefault(solve_for, Symbol(solve_for))

        # NOTE(review): eval() executes the formula text as Python, so only
        # trusted formulas should be passed in. Module globals are kept as the
        # lookup scope so Eq and any functions used in the formula resolve as before.
        eq1 = eval(f"Eq({left}, {right})", globals(), namespace)

        #---------------------------------------------------------
        self.left, self.right = left, right
        self.sleft, self.sright = sleft, sright
        self.eq1 = eq1
        #---------------------------------------------------------

        solutions = solve(eq1, namespace[solve_for])
        if not solutions:
            raise IndexError(
                f"no solution found for {solve_for!r} in equation {eq_str1!r}"
            )

        # Build the text directly rather than slicing str(Eq(...)) and
        # replacing commas, which breaks when the solution contains a comma.
        return f"{solve_for} = {str(solutions[0]).replace(' ', '')}"

In [None]:
#model.epoch_logs[0]

In [None]:
# Display one entry of the model's `epoch_logs` list.
# NOTE(review): index 50 must exist; the fit cells above pass 25 to fit(), so
# confirm how many entries a fit appends before relying on this index.
model.epoch_logs[50]

In [None]:
# Pull the fitted object stored for the 'const' term out of one log entry.
# NOTE(review): this needs `model` to be fitted with a formula that has a
# 'const' term; the last formula defined above uses 'intercept' instead.
elog = model.epoch_logs[50]
m = elog['models']['const']['model'] 

In [None]:
# Show the `scalar` attribute of the term model extracted from the log entry.
m.scalar

In [None]:
# Probe the term model's predict() with a constant five-element input.
a1 = np.array([5,5,5,5,5])
a2 = m.predict(a1) 

In [None]:
# Display what predict() returned for the constant probe input.
a2