In [6]:
import numpy as np 
from numpy import random as rnd
from matplotlib import pyplot as plt
import sys,os,datetime,warnings

import tensorflow as tf
import pandas as pd
import seaborn as sns

from keras.models import Model,Sequential
from keras.layers import Dense,LSTM,Input,BatchNormalization,Conv2D,Conv2DTranspose,Activation,Concatenate,Bidirectional,CuDNNGRU,Dropout,Dot,Flatten,Embedding

from sklearn.model_selection import train_test_split

from scipy.stats import norm

sns.set()
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [26]:
# Creates a randomly shaped PDF from aggregated Gaussians
# Provides methods to sample the PDF
class RANDOM_PDF_GENERATOR():
    # Initialize
    def __init__(self,N_MAX=10,MU_MIN=-5,MU_MAX=5,SIGMA_MIN=0.1,SIGMA_MAX=5):
        self.N_MAX = N_MAX
        self.MU_MIN = MU_MIN
        self.MU_MAX = MU_MAX
        self.SIGMA_MIN = SIGMA_MIN
        self.SIGMA_MAX = SIGMA_MAX
        self.initialized = False
        self.generate_parameters()
        return
    # Once create random parameters
    def generate_parameters(self):
        self.n_gaussians = rnd.randint(1,self.N_MAX) 
        self.mu_set = rnd.rand(self.n_gaussians)*(self.MU_MAX - self.MU_MIN) + self.MU_MIN
        self.sigma_set = rnd.rand(self.n_gaussians)*(self.SIGMA_MAX - self.SIGMA_MIN) + self.SIGMA_MIN
        self.initialized = True
        return
    # Evaluate CDF at point x
    def eval_cdf(self,x):
        y = 0
        for k in range(self.n_gaussians):
            y += (self.sigma_set[k]*norm.cdf(x+self.mu_set[k]))
        return y 
    # Sample PDF with multiple samples
    def sample_pdf(self,num_samples=1000):
        x = np.arange(self.MU_MIN-5,self.MU_MAX+5,0.01)
        y = self.eval_cdf(x)
        s = [x[np.argmin((y - ((max(y) - min(y))*rnd.rand()+min(y)))**2)] for k in range(num_samples)]
        return s

In [27]:
class DEPENDENT_PDF():
    # Initialize
    def __init__(self,seed_pdf,f):
        self.seed_pdf = seed_pdf
        self.f = f
        return
    # Sample PDF with multiple samples
    def sample_pdf(self,num_samples=1000):
        X_o = self.seed_pdf.sample_pdf(num_samples=num_samples)
        return [self.f(x) for x in X_o]
    # Evaluate PDF with given input
    def eval_pdf(self,X_o):
        return [self.f(x) for x in X_o]

In [113]:
# Hidden variable, generates correlated Y0,Y1
X = RANDOM_PDF_GENERATOR()
# Correlated observables Y0,Y1
Y0 = DEPENDENT_PDF(X,f=lambda x: -2*x+1+0.1*rnd.randn())
Y1 = DEPENDENT_PDF(X,f=lambda x: x-2-0.1-0.5*rnd.randn())

# Causally related observables Z0,Z1
Z0 = RANDOM_PDF_GENERATOR()
Z1 = DEPENDENT_PDF(Z0,f=lambda x: -1*x+2+0.7*rnd.randn())

In [114]:
df = pd.DataFrame()

sX = X.sample_pdf()
sZ = Z0.sample_pdf()

df['Y0'] = Y0.eval_pdf(sX)
df['Y1'] = Y1.eval_pdf(sX)
df['Z0'] = sZ
df['Z1'] = Z1.eval_pdf(sZ)

In [121]:
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor,XGBRFRegressor

x,y = df['Y0'].values.reshape(-1,1),df['Y1'].values.reshape(-1,1)
#x,y = df['Z0'].values.reshape(-1,1),df['Z1'].values.reshape(-1,1)

model_zoo = [
    MLPRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    LinearRegression(),
    XGBRegressor(),
    XGBRFRegressor()
]

performance = {}
models = {}
for clf in model_zoo:
    x0,x1,y0,y1 = train_test_split(x,y,shuffle=True)
    clf.fit(x0,y0)
    clf_name = str(type(clf)).split('.')[-1].replace('>','').replace('\'','')
    performance[clf_name] = clf.score(x1,y1)
    models[clf_name] = [clf,clf.score(x1,y1)]



In [122]:
performance

{'MLPRegressor': 0.9601424512321858,
 'AdaBoostRegressor': 0.9590523817912279,
 'GradientBoostingRegressor': 0.9578255717820708,
 'RandomForestRegressor': 0.9399498049666769,
 'LinearRegression': 0.9611267801716805,
 'XGBRegressor': 0.960441462809174,
 'XGBRFRegressor': 0.9476610587186927}