# PCA on Covariates Simulations Notebook

In [3]:
# Import objects from the setup file
import os
import sys
sys.path.append(os.path.expanduser('~/repo/ECMA-31330-Project/Source'))
from ME_Setup import *

# Packages
import numpy as np
import pandas as pd
import seaborn as sns
from pca import pca
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import FactorAnalysis
import math
from tqdm import tqdm

In [4]:
# Supressing Output
from contextlib import contextmanager
import sys, os

@contextmanager
def suppress_stdout():
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:  
            yield
        finally:
            sys.stdout = old_stdout

**My main changes and what I want people to review in the code block below:**
    
1) I added an IV section

2) I added a new thing where we take half of the mismeasured covariate values and then use them as the exponent to raise e to (e.g. z4 = e**z4)

3) i got rid of the 1,000 value for beta1 and beta2

4) i got rid of the p = 10 value

**Comments on those changes:**

1) i'm curious if i did this right

2) i did this because of bonhomme's feedback about where PCA might outperform taking the simple avg. we've gotten some tentative signs that PCA does do better than the avg when you stretch out he observations

3) , 4) it seemed like we didn't really need those values

In [5]:
# Data frame to store output
output = pd.DataFrame()

# 2,000 observations
N = 2000

# Loop over combinations of: betas, covariances between variables, p numbers of parameters
for beta1 in [0.1,1,10]:
    for beta2 in [0.1,1,10]:
        for covariance in [-0.9,-0.5,0,0.5,0.9]:
            for p in [5,20,50]:
                # Select only the scenarios we actually want to run by default - betas of 1, covariances of 0.5, p of 5... require three of these conditions to be satisfied to run.
                counter = 0
                if beta1 == 1:
                    counter += 1
                if beta2 == 1:
                    counter += 1
                if covariance == 0.5:
                    counter += 1
                if p == 5:
                    counter+=1
                if counter >= 3:
                    # Run with and without transformations
                    for exp_of_var in ['yes','no']:
                        # 1000 simulations
                        for k in tqdm(range(1)):
                            # Initialize Lists to store coef values for all five methods and the true coef
                            pca_coef = []
                            mismeasured_coef = []
                            mismeasured_allvar_coef = []
                            mismeasured_avg_coef = []
                            iv_coef = []
                            true_val_coef =[]

                            # Create variables
                            vars_mean = [0,0,0]
                            vars_cov = np.array([[1,covariance,0],
                                                 [covariance,1,0],
                                                 [0,0,1]])
                            # Producing 3 variables: x for the variable of interest, the true Z covariate, the random error
                            vars_ = pd.DataFrame(np.random.multivariate_normal(vars_mean, vars_cov, N), columns = ['x','true_z','u'])
                            vars_['y'] = beta1 * vars_['x'] + beta2 * vars_['true_z'] + vars_['u']

                            # Create measurement errors for each of the p measurements of the covariates- mean zero and variance one
                            errors_mean = np.zeros(p)
                            errors_cov = np.zeros((p,p))
                            for i in range(p):
                                for j in range(p):
                                    if i == j:
                                        errors_cov[i,j] = 1

                            errors = np.random.multivariate_normal(errors_mean, errors_cov, N)
                            # Column labels for Z variables (covariates variables mismeasured)
                            z_vars = []
                            for i in range(p):
                                z_vars.append('z'+str(i+1))
                            # Add errors to the true_z to get mismeasured values
                            mismeasured_z = pd.DataFrame(errors, columns = z_vars)
                            for i in mismeasured_z.columns:
                                mismeasured_z[i] = mismeasured_z[i] + vars_['true_z']

                            # Take e to the power of the values for half of the measurements if log_of_var is true
                            if exp_of_var == 'yes':
                                mismeasured_z.iloc[:,int(len(mismeasured_z.columns)/2):] =np.exp(mismeasured_z.iloc[:,int(len(mismeasured_z.columns)/2):])
                                
                            # Do feature scaling (normalize to mean 0 and variance 1) for the mismeasured z
                            # Note that x and y are already normalized by construction
                            scaled_mismeasured_z = mismeasured_z.copy()
                            for i in mismeasured_z.columns:
                                scaled_mismeasured_z[i] = (mismeasured_z[i] - mismeasured_z[i].mean()) / mismeasured_z[i].std()

                            # Suppress output
                            with suppress_stdout():
                                # Use PCA on the mismeasured values
                                pca_model = pca()
                                pca_results = pca_model.fit_transform(scaled_mismeasured_z)
                                pca_z = pca_results['PC']['PC1']

                            # NOTE: in non-pca cases, no need to rescale or normalize since mismeasured variables and x and y have mean 0 and sd 1

                            # Average mismeasured variables:
                            vars_['avg_mismeasured_z'] = mismeasured_z[z_vars].mean(axis=1)

                            # Add relevant variables to vars_ dataframe
                            vars_[mismeasured_z.columns] = mismeasured_z
                            vars_['pca_z'] = pca_z

                            # Single mismeasured covariate results
                            model_mismeasured = sm.OLS(vars_['y'],vars_[['x','z1']])
                            results_mismeasured = model_mismeasured.fit()
                            mismeasured_coef.append(results_mismeasured.params[0])

                            # All Variables Mismeasured Results
                            # Create full list of items to include in regression
                            tot_vars = ['x']
                            tot_vars.extend(z_vars)
                            model_mismeasured_allvar = sm.OLS(vars_['y'],vars_[tot_vars])
                            results_mismeasured_allvar = model_mismeasured_allvar.fit()
                            mismeasured_allvar_coef.append(results_mismeasured_allvar.params[0])

                            # Average Mismeasured Variables Results
                            model_mismeasured_avg = sm.OLS(vars_['y'],vars_[['x','avg_mismeasured_z']])
                            results_mismeasured_avg = model_mismeasured_avg.fit()
                            mismeasured_avg_coef.append(results_mismeasured_avg.params[0])

                            # PCA Results
                            model_pca = sm.OLS(vars_['y'],vars_[['x','pca_z']])
                            results_pca = model_pca.fit()
                            pca_coef.append(results_pca.params[0])

                            # Instrumental Variables Results
                            # Instrument z1 on the other items in the mismeasured df
                            z_string = mismeasured_z.columns[1]
                            for z in mismeasured_z.columns[2:]:
                                z_string = z_string + ' + ' + z
                            # Create the predicted value of z1
                            vars_['pred_z1'] = smf.ols("z1 ~ " + z_string, data = mismeasured_z).fit().predict()
                            iv_results = smf.ols("y ~ x + pred_z1 -1", data = vars_).fit()
                            iv_coef.append(iv_results.params[0])

                            # True Results
                            model_true = sm.OLS(vars_['y'],vars_[['x','true_z']])
                            results_true = model_true.fit()
                            true_val_coef.append(results_true.params[0])

                            # Output Findings
                            new_output = pd.DataFrame()
                            new_output['mismeasured_coef'] = mismeasured_coef
                            new_output['mismeasured_allvar_coef'] = mismeasured_allvar_coef
                            new_output['mismeasured_avg_coef'] = mismeasured_avg_coef
                            new_output['pca_coef'] = pca_coef
                            new_output['true_val_coef'] = true_val_coef
                            new_output['iv_coef'] = iv_coef
                            new_output['covariance'] = vars_cov[0][1]
                            new_output['beta1'] = beta1
                            new_output['beta2'] = beta2
                            new_output['p'] = p
                            new_output['exp_of_var'] = exp_of_var
                            output = output.append(new_output)

output

100%|██████████| 1/1 [00:00<00:00,  1.62it/s]
100%|██████████| 1/1 [00:00<00:00,  2.03it/s]
100%|██████████| 1/1 [00:00<00:00,  2.06it/s]
100%|██████████| 1/1 [00:00<00:00,  2.02it/s]
100%|██████████| 1/1 [00:00<00:00,  2.10it/s]
100%|██████████| 1/1 [00:00<00:00,  1.91it/s]
100%|██████████| 1/1 [00:00<00:00,  2.13it/s]
100%|██████████| 1/1 [00:00<00:00,  2.26it/s]
100%|██████████| 1/1 [00:00<00:00,  2.24it/s]
100%|██████████| 1/1 [00:00<00:00,  2.26it/s]
100%|██████████| 1/1 [00:00<00:00,  2.18it/s]
100%|██████████| 1/1 [00:00<00:00,  2.25it/s]
100%|██████████| 1/1 [00:00<00:00,  2.03it/s]
100%|██████████| 1/1 [00:00<00:00,  2.10it/s]
100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
100%|██████████| 1/1 [00:00<00:00,  1.82it/s]
100%|██████████| 1/1 [00:00<00:00,  2.19it/s]
100%|██████████| 1/1 [00:00<00:00,  2.19it/s]
100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
100%|██████████| 1/1 [00:00<00:00,  2.12it/s]
100%|██████████| 1/1 [00:00<00:00,  2.04it/s]
100%|██████████| 1/1 [00:00<00:00,

Unnamed: 0,mismeasured_coef,mismeasured_allvar_coef,mismeasured_avg_coef,pca_coef,true_val_coef,iv_coef,covariance,beta1,beta2,p,exp_of_var
0,0.377291,0.279092,0.431273,0.303882,0.10198,0.327421,0.5,0.1,1.0,5,yes
0,0.372929,0.199632,0.200331,0.200412,0.094372,0.22004,0.5,0.1,1.0,5,no
0,1.051855,1.038778,1.069775,1.046737,1.021211,1.048326,0.5,1.0,0.1,5,yes
0,0.991493,0.967623,0.968608,0.967959,0.953142,0.969589,0.5,1.0,0.1,5,no
0,0.2311,0.378299,0.182787,0.337343,1.017201,0.315554,-0.9,1.0,1.0,5,yes
0,0.205285,0.473921,0.474957,0.47539,0.894709,0.437787,-0.9,1.0,1.0,5,no
0,0.741761,0.841806,0.627834,0.807872,1.026095,0.774325,-0.5,1.0,1.0,5,yes
0,0.716063,0.913958,0.912209,0.912553,1.007239,0.89371,-0.5,1.0,1.0,5,no
0,1.004262,1.01385,1.019553,1.015938,1.029424,1.023411,0.0,1.0,1.0,5,yes
0,0.977675,0.988048,0.988668,0.987748,0.996131,0.995471,0.0,1.0,1.0,5,no


In [6]:
output

Unnamed: 0,mismeasured_coef,mismeasured_allvar_coef,mismeasured_avg_coef,pca_coef,true_val_coef,iv_coef,covariance,beta1,beta2,p,exp_of_var
0,0.377291,0.279092,0.431273,0.303882,0.10198,0.327421,0.5,0.1,1.0,5,yes
0,0.372929,0.199632,0.200331,0.200412,0.094372,0.22004,0.5,0.1,1.0,5,no
0,1.051855,1.038778,1.069775,1.046737,1.021211,1.048326,0.5,1.0,0.1,5,yes
0,0.991493,0.967623,0.968608,0.967959,0.953142,0.969589,0.5,1.0,0.1,5,no
0,0.2311,0.378299,0.182787,0.337343,1.017201,0.315554,-0.9,1.0,1.0,5,yes
0,0.205285,0.473921,0.474957,0.47539,0.894709,0.437787,-0.9,1.0,1.0,5,no
0,0.741761,0.841806,0.627834,0.807872,1.026095,0.774325,-0.5,1.0,1.0,5,yes
0,0.716063,0.913958,0.912209,0.912553,1.007239,0.89371,-0.5,1.0,1.0,5,no
0,1.004262,1.01385,1.019553,1.015938,1.029424,1.023411,0.0,1.0,1.0,5,yes
0,0.977675,0.988048,0.988668,0.987748,0.996131,0.995471,0.0,1.0,1.0,5,no


In [7]:
#output.to_csv('C://Users//paulo//Documents//Econometrics and ML//Final Project//n_2000_results.csv')

In [8]:
pd.set_option('display.max_rows', 10)

In [9]:
output['pca_ape'] = (output['pca_coef'] - output['beta1']).abs()/output['beta1']
output['mismeasured_coef_ape'] = (output['mismeasured_coef'] - output['beta1']).abs()/output['beta1']
output['mismeasured_allvar_coef_ape'] = (output['mismeasured_allvar_coef'] - output['beta1']).abs()/output['beta1']
output['mismeasured_avg_coef_ape'] = (output['mismeasured_avg_coef'] - output['beta1']).abs()/output['beta1']
output['iv_coef_ape'] = (output['iv_coef'] - output['beta1']).abs()/output['beta1']

In [10]:
output.groupby(['covariance','beta1','beta2','p','exp_of_var']).mean().reset_index()

Unnamed: 0,covariance,beta1,beta2,p,exp_of_var,mismeasured_coef,mismeasured_allvar_coef,mismeasured_avg_coef,pca_coef,true_val_coef,iv_coef,pca_ape,mismeasured_coef_ape,mismeasured_allvar_coef_ape,mismeasured_avg_coef_ape,iv_coef_ape
0,-0.9,1.0,1.0,5,no,0.205285,0.473921,0.474957,0.475390,0.894709,0.437787,0.524610,0.794715,0.526079,0.525043,0.562213
1,-0.9,1.0,1.0,5,yes,0.231100,0.378299,0.182787,0.337343,1.017201,0.315554,0.662657,0.768900,0.621701,0.817213,0.684446
2,-0.5,1.0,1.0,5,no,0.716063,0.913958,0.912209,0.912553,1.007239,0.893710,0.087447,0.283937,0.086042,0.087791,0.106290
3,-0.5,1.0,1.0,5,yes,0.741761,0.841806,0.627834,0.807872,1.026095,0.774325,0.192128,0.258239,0.158194,0.372166,0.225675
4,0.0,1.0,1.0,5,no,0.977675,0.988048,0.988668,0.987748,0.996131,0.995471,0.012252,0.022325,0.011952,0.011332,0.004529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,0.5,1.0,10.0,5,yes,4.018535,2.814896,4.802446,3.191262,0.934235,3.411628,2.191262,3.018535,1.814896,3.802446,2.411628
18,0.5,10.0,1.0,5,no,10.309383,10.124738,10.123779,10.124320,10.009970,10.137428,0.012432,0.030938,0.012474,0.012378,0.013743
19,0.5,10.0,1.0,5,yes,10.285971,10.174535,10.427017,10.223802,9.988553,10.238272,0.022380,0.028597,0.017453,0.042702,0.023827
20,0.9,1.0,1.0,5,no,1.750883,1.469735,1.469053,1.473203,0.956816,1.519797,0.473203,0.750883,0.469735,0.469053,0.519797


In [11]:
output.groupby(['covariance','beta1','beta2','p','exp_of_var']).std().reset_index()

Unnamed: 0,covariance,beta1,beta2,p,exp_of_var,mismeasured_coef,mismeasured_allvar_coef,mismeasured_avg_coef,pca_coef,true_val_coef,iv_coef,pca_ape,mismeasured_coef_ape,mismeasured_allvar_coef_ape,mismeasured_avg_coef_ape,iv_coef_ape
0,-0.9,1.0,1.0,5,no,,,,,,,,,,,
1,-0.9,1.0,1.0,5,yes,,,,,,,,,,,
2,-0.5,1.0,1.0,5,no,,,,,,,,,,,
3,-0.5,1.0,1.0,5,yes,,,,,,,,,,,
4,0.0,1.0,1.0,5,no,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,0.5,1.0,10.0,5,yes,,,,,,,,,,,
18,0.5,10.0,1.0,5,no,,,,,,,,,,,
19,0.5,10.0,1.0,5,yes,,,,,,,,,,,
20,0.9,1.0,1.0,5,no,,,,,,,,,,,


In [12]:
output.loc[(output['beta1']==1)&(output['beta2']==1)&(output['covariance'] == 0.5)&(output['exp_of_var']=='yes')].groupby(['covariance','beta1','beta2','p']).mean().reset_index()

Unnamed: 0,covariance,beta1,beta2,p,mismeasured_coef,mismeasured_allvar_coef,mismeasured_avg_coef,pca_coef,true_val_coef,iv_coef,pca_ape,mismeasured_coef_ape,mismeasured_allvar_coef_ape,mismeasured_avg_coef_ape,iv_coef_ape
0,0.5,1.0,1.0,5,1.249019,1.148227,1.346894,1.180911,0.964079,1.21286,0.180911,0.249019,0.148227,0.346894,0.21286
1,0.5,1.0,1.0,20,1.30606,1.065254,1.339793,1.102144,1.009884,1.070822,0.102144,0.30606,0.065254,0.339793,0.070822
2,0.5,1.0,1.0,50,1.300459,1.019558,1.295226,1.053262,0.993733,1.030565,0.053262,0.300459,0.019558,0.295226,0.030565


In [13]:
output.loc[(output['beta1']==1)&(output['beta2']==1)&(output['covariance'] == 0.5)&(output['exp_of_var']=='yes')].groupby(['covariance','beta1','beta2','p']).std().reset_index()

Unnamed: 0,covariance,beta1,beta2,p,mismeasured_coef,mismeasured_allvar_coef,mismeasured_avg_coef,pca_coef,true_val_coef,iv_coef,pca_ape,mismeasured_coef_ape,mismeasured_allvar_coef_ape,mismeasured_avg_coef_ape,iv_coef_ape
0,0.5,1.0,1.0,5,,,,,,,,,,,
1,0.5,1.0,1.0,20,,,,,,,,,,,
2,0.5,1.0,1.0,50,,,,,,,,,,,


In [14]:
def std_parenth_padder(type_col, col_to_pad):
    if type_col == 'std':
        return('(' + col_to_pad + ')')
    else:
        return(col_to_pad)

def format_level_0(type_col, col_to_format):
    if type_col == 'std':
        return('')
    else:
        return(col_to_format)

In [15]:
import regex as re

# Pass this function a pandas dataframe of which to create the LaTeX file
# dataframe is the df
# filename is the save location
# index tells if you want the index saved to the file or not
def Minimal_Latex(dataframe, filename, index = False):

    # Open file
    with open(filename, "w") as f:
        
        # Start from the pandas
        corrected_table = dataframe.to_latex(header = False, index = index)

        # Strip all material that pandas inserts- the tabular environment and toprule and bottomrule
        corrected_table = re.sub(r'\\begin.+', '', corrected_table)
        corrected_table = re.sub(r'\\toprule', '', corrected_table)
        corrected_table = re.sub(r'\\bottomrule', '', corrected_table)
        corrected_table = re.sub(r'\\end.+', '', corrected_table)

        # Write to file and skip blank lines
        f.write(corrected_table.strip())

In [16]:
coef_names_mapper = {'mismeasured_coef':'Single Measurement', 'mismeasured_allvar_coef':'All Measurements', 'pca_coef':'PCA', 'mismeasured_avg_coef':'Average of Measurements', 'iv_coef':'Instrumental Variable'}
ape_names_mapper = {'mismeasured_coef_ape':'Single Measurement', 'mismeasured_allvar_coef_ape':'All Measurements', 'pca_ape':'PCA', 'mismeasured_avg_coef_ape':'Average of Measurements', 'iv_coef_ape':'Instrumental Variable'}

In [17]:
output

Unnamed: 0,mismeasured_coef,mismeasured_allvar_coef,mismeasured_avg_coef,pca_coef,true_val_coef,iv_coef,covariance,beta1,beta2,p,exp_of_var,pca_ape,mismeasured_coef_ape,mismeasured_allvar_coef_ape,mismeasured_avg_coef_ape,iv_coef_ape
0,0.377291,0.279092,0.431273,0.303882,0.101980,0.327421,0.5,0.1,1.0,5,yes,2.038820,2.772914,1.790919,3.312727,2.274208
0,0.372929,0.199632,0.200331,0.200412,0.094372,0.220040,0.5,0.1,1.0,5,no,1.004116,2.729287,0.996316,1.003306,1.200398
0,1.051855,1.038778,1.069775,1.046737,1.021211,1.048326,0.5,1.0,0.1,5,yes,0.046737,0.051855,0.038778,0.069775,0.048326
0,0.991493,0.967623,0.968608,0.967959,0.953142,0.969589,0.5,1.0,0.1,5,no,0.032041,0.008507,0.032377,0.031392,0.030411
0,0.231100,0.378299,0.182787,0.337343,1.017201,0.315554,-0.9,1.0,1.0,5,yes,0.662657,0.768900,0.621701,0.817213,0.684446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1.750883,1.469735,1.469053,1.473203,0.956816,1.519797,0.9,1.0,1.0,5,no,0.473203,0.750883,0.469735,0.469053,0.519797
0,4.018535,2.814896,4.802446,3.191262,0.934235,3.411628,0.5,1.0,10.0,5,yes,2.191262,3.018535,1.814896,3.802446,2.411628
0,3.895511,2.086191,2.076864,2.086234,0.982115,2.272068,0.5,1.0,10.0,5,no,1.086234,2.895511,1.086191,1.076864,1.272068
0,10.285971,10.174535,10.427017,10.223802,9.988553,10.238272,0.5,10.0,1.0,5,yes,0.022380,0.028597,0.017453,0.042702,0.023827


In [23]:
all_params = ['beta1', 'beta2', 'covariance', 'p']

def make_param_tables(dataframe, parameter, exp):

    if parameter != 'beta1':
        dataframe = dataframe.loc[(dataframe['beta1']==1)]
    if parameter != 'beta2':
        dataframe = dataframe.loc[(dataframe['beta2']==1)]
    if parameter != 'covariance':
        dataframe = dataframe.loc[(dataframe['covariance']==0.5)]
    if parameter != 'p':
        dataframe = dataframe.loc[(dataframe['p']==5)]
    dataframe = dataframe.loc[(dataframe['exp_of_var']==exp)]

    exp_mapper = {'yes':'exp', 'no':'no_exp'}

    dataframe = (dataframe.drop(columns = [variable for variable in all_params if variable != parameter])
                          .groupby(parameter)
                          .agg(['mean', 'std'])
                          .transpose()
                          .reset_index())

    dataframe.columns = dataframe.columns.map(str)

    dataframe_coefs = (dataframe.query('level_0.str.contains("coef")')
                                .query('level_0 != "true_val_coef"')
                                .query('not level_0.str.contains("ape")')
                                .round(3)
                                .astype(str))

    for variable in dataframe_coefs.columns:
        if variable != 'level_0' and variable != 'level_1':
            dataframe_coefs[variable] = dataframe_coefs.apply(lambda x: std_parenth_padder(x['level_1'], x[variable]), axis = 1)

    dataframe_coefs['level_0'] = dataframe_coefs.apply(lambda x: format_level_0(x.level_1, x.level_0), axis = 1)

    dataframe_coefs = (dataframe_coefs.drop(columns = 'level_1')
                                    .replace(coef_names_mapper))

    Minimal_Latex(dataframe_coefs, tables_dir + '/' + parameter + '_coefs_' + exp_mapper[exp] + '.tex')

    dataframe_apes = (dataframe.query('level_0.str.contains("ape")')
                           .query('level_1 != "std"')
                           .drop(columns = 'level_1')
                           .replace(ape_names_mapper)
                           .set_index('level_0')
                           .multiply(100)
                           .round(1)
                           .astype(str)
                           .add('%')
                           .reset_index())

    Minimal_Latex(dataframe_apes, tables_dir + '/' + parameter + '_apes_' + exp_mapper[exp] + '.tex')

In [26]:
make_param_tables(output, 'p', 'yes')
make_param_tables(output, 'p', 'no')
make_param_tables(output, 'covariance', 'yes')
make_param_tables(output, 'covariance', 'no')
make_param_tables(output, 'beta1', 'yes')
make_param_tables(output, 'beta1', 'no')
make_param_tables(output, 'beta2', 'yes')
make_param_tables(output, 'beta2', 'no')