In [None]:
pip install pca

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from pca import pca
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [None]:
# Simulation configuration.
SEED = 0              # arbitrary fixed seed so a fresh Run All reproduces every draw
np.random.seed(SEED)  # the simulations draw from NumPy's global random state

N = 100    # number of observations in each simulated sample
beta1 = 1  # coefficient on x in the data-generating process for y
beta2 = 1  # coefficient on true_z in the data-generating process for y

In [None]:
# Experiment 1: sampling distribution of the estimated coefficient on x when the
# control true_z is only observed through five noisy proxies z1..z5.
# For each cov(x, true_z) value, 1000 samples of size N are drawn and y is
# regressed (without intercept) on x plus, in turn: one proxy, all proxies, the
# first principal component of the proxies, and the true control.
# Relies on N, beta1 and beta2 being defined by an earlier cell.
# NOTE(review): at covariance = -1 and 1 the variables x and true_z are perfectly
# collinear, so the "true" regressions cannot separate the two coefficients.
covariances = [-1, -0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75, 1]
n_replications = 1000
proxy_names = ['z1', 'z2', 'z3', 'z4', 'z5']

# One dict per replication; the DataFrame is built once at the end rather than
# grown row by row (DataFrame.append no longer exists in pandas 2.x).
records = []
for covariance in covariances:
    # Joint covariance of (x, true_z, u); only cov(x, true_z) changes.
    vars_cov = np.array([[1, covariance, 0],
                         [covariance, 1, 0],
                         [0, 0, 1]])
    for k in range(n_replications):
        # Create variables
        vars_ = pd.DataFrame(np.random.multivariate_normal([0, 0, 0], vars_cov, N),
                             columns=['x', 'true_z', 'u'])
        vars_['y'] = beta1 * vars_['x'] + beta2 * vars_['true_z'] + vars_['u']

        # Create the proxies: true_z plus independent unit-variance noise
        errors = np.random.multivariate_normal(np.zeros(5), np.eye(5), N)
        mismeasured_z = pd.DataFrame(errors, columns=proxy_names).add(vars_['true_z'], axis=0)

        # Feature scaling (mean 0, sample standard deviation 1), column by column
        scaled_mismeasured_z = (mismeasured_z - mismeasured_z.mean()) / mismeasured_z.std()

        # Use PCA on the scaled proxies and keep the first component
        pca_model = pca()
        pca_results = pca_model.fit_transform(scaled_mismeasured_z)
        pca_z = pca_results['PC']['PC1']

        # Standardize the remaining regression variables the same way
        raw = vars_[['y', 'x', 'true_z']]
        standardized = (raw - raw.mean()) / raw.std()
        scaled_df = scaled_mismeasured_z.assign(
            scaled_y=standardized['y'],
            scaled_x=standardized['x'],
            scaled_pca_z=(pca_z - pca_z.mean()) / pca_z.std(),
            scaled_true_z=standardized['true_z'],
        )
        scaled_y = scaled_df['scaled_y']

        # Each entry is the fitted coefficient on x, looked up by label
        # (positional params[0] on a labelled Series is deprecated in pandas).
        records.append({
            # single noisy proxy
            'mismeasured_coef':
                sm.OLS(scaled_y, scaled_df[['scaled_x', 'z1']]).fit().params['scaled_x'],
            # all noisy proxies
            'mismeasured_allvar_coef':
                sm.OLS(scaled_y, scaled_df[['scaled_x'] + proxy_names]).fit().params['scaled_x'],
            # first principal component of the proxies
            'pca_coef':
                sm.OLS(scaled_y, scaled_df[['scaled_x', 'scaled_pca_z']]).fit().params['scaled_x'],
            # true control, standardized data
            'true_val_coef':
                sm.OLS(scaled_y, scaled_df[['scaled_x', 'scaled_true_z']]).fit().params['scaled_x'],
            # true control, raw data
            'nonscaled_true_val_coef':
                sm.OLS(vars_['y'], vars_[['x', 'true_z']]).fit().params['x'],
            'covariance': float(covariance),
            'beta1': beta1,
        })

# Same columns as before; rows now carry a unique 0..n-1 index.
output = pd.DataFrame(records)
output

In [None]:
# Summary statistics of every column of the simulation results.
output.describe()

In [None]:
# Density of the PCA-based estimate of the x coefficient, one curve per cov(x, true_z).
# The index is reset because the rows of `output` may all share the same index
# label, which some seaborn versions cannot align on (NOTE(review): confirm).
ax = sns.kdeplot(data=output.reset_index(drop=True), x='pca_coef', hue='covariance')
ax.set(title='Coefficient on x: first principal component of the proxies as control',
       xlabel='pca_coef');

In [None]:
# Density of the single-proxy (z1) estimate of the x coefficient, one curve per cov(x, true_z).
# The index is reset because the rows of `output` may all share the same index
# label, which some seaborn versions cannot align on (NOTE(review): confirm).
ax = sns.kdeplot(data=output.reset_index(drop=True), x='mismeasured_coef', hue='covariance')
ax.set(title='Coefficient on x: single noisy proxy z1 as control',
       xlabel='mismeasured_coef');

In [None]:
# Density of the true-control estimate of the x coefficient, one curve per cov(x, true_z).
# The index is reset because the rows of `output` may all share the same index
# label, which some seaborn versions cannot align on (NOTE(review): confirm).
ax = sns.kdeplot(data=output.reset_index(drop=True), x='true_val_coef', hue='covariance')
ax.set(title='Coefficient on x: true z as control',
       xlabel='true_val_coef');

In [None]:
# Average estimated x coefficient per cov(x, true_z) for the PCA, single-proxy
# and true-control regressions (columns are selected before aggregating).
(
    output
    .groupby('covariance')[['pca_coef', 'mismeasured_coef', 'true_val_coef']]
    .mean()
)

In [None]:
# Experiment 2: one replication per (beta1, beta2, covariance) combination on
# unscaled data, adding the row-wise average of the proxies as a further control.
# Relies on N being defined by an earlier cell.
# NOTE: the loop variables beta1 and beta2 overwrite the notebook-level values
# of the same name and keep their last value (100) after this cell has run.
proxy_names = ['z1', 'z2', 'z3', 'z4', 'z5']

# One dict per replication; the DataFrame is built once at the end rather than
# grown row by row (DataFrame.append no longer exists in pandas 2.x).
records = []
for beta1 in [0.1, 1, 10, 100]:
    for beta2 in [0.1, 1, 10, 100]:
        for covariance in [-1, -0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75, 1]:
            # Joint covariance of (x, true_z, u); only cov(x, true_z) changes.
            vars_cov = np.array([[1, covariance, 0],
                                 [covariance, 1, 0],
                                 [0, 0, 1]])
            for k in range(1):
                # Create variables
                vars_ = pd.DataFrame(np.random.multivariate_normal([0, 0, 0], vars_cov, N),
                                     columns=['x', 'true_z', 'u'])
                vars_['y'] = beta1 * vars_['x'] + beta2 * vars_['true_z'] + vars_['u']

                # Create the proxies: true_z plus independent unit-variance noise
                errors = np.random.multivariate_normal(np.zeros(5), np.eye(5), N)
                mismeasured_z = pd.DataFrame(errors, columns=proxy_names).add(vars_['true_z'], axis=0)

                # Feature scaling (mean 0, sample standard deviation 1) before PCA
                scaled_mismeasured_z = (mismeasured_z - mismeasured_z.mean()) / mismeasured_z.std()

                # Use PCA on the scaled proxies and keep the first component
                pca_model = pca()
                pca_results = pca_model.fit_transform(scaled_mismeasured_z)
                pca_z = pca_results['PC']['PC1']

                # Collect every regressor in one frame
                vars_ = pd.concat([vars_, mismeasured_z], axis=1)
                vars_['avg_mismeasured_z'] = mismeasured_z.mean(axis=1)
                vars_['pca_z'] = pca_z
                y = vars_['y']

                # Each entry is the fitted coefficient on x, looked up by label
                # (positional params[0] on a labelled Series is deprecated in pandas).
                records.append({
                    # single noisy proxy
                    'mismeasured_coef':
                        sm.OLS(y, vars_[['x', 'z1']]).fit().params['x'],
                    # all noisy proxies
                    'mismeasured_allvar_coef':
                        sm.OLS(y, vars_[['x'] + proxy_names]).fit().params['x'],
                    # average of the proxies
                    'mismeasured_avg_coef':
                        sm.OLS(y, vars_[['x', 'avg_mismeasured_z']]).fit().params['x'],
                    # first principal component of the proxies
                    'pca_coef':
                        sm.OLS(y, vars_[['x', 'pca_z']]).fit().params['x'],
                    # true control
                    'true_val_coef':
                        sm.OLS(y, vars_[['x', 'true_z']]).fit().params['x'],
                    'covariance': float(covariance),
                    'beta1': beta1,
                    'beta2': beta2,
                })

# Same columns as before; rows now carry a unique 0..n-1 index.
output = pd.DataFrame(records)
output

In [None]:
# Build the proxy column names z1..zp.
# p is set here explicitly: otherwise it is only created by the loop of a later
# cell, so this cell would raise NameError on a freshly started kernel.
p = 5
z_vars = ['z' + str(i + 1) for i in range(p)]
z_vars

In [None]:
    # Single replication of the simulation body with p proxies, used to check the
    # construction of the regressor list `tot_vars`.
    # NOTE(review): relies on `covariance`, `beta1`, `beta2`, `p` and `N` already
    # existing in the kernel (constants or leftover loop variables of earlier cells).
    # NOTE(review): the whole cell is indented one level; presumably IPython strips
    # the common leading indent before running it - consider dedenting the cell.

    # Create variables
    vars_cov = np.array([[1, covariance, 0],
                         [covariance, 1, 0],
                         [0, 0, 1]])
    vars_ = pd.DataFrame(np.random.multivariate_normal([0, 0, 0], vars_cov, N),
                         columns=['x', 'true_z', 'u'])
    vars_['y'] = beta1 * vars_['x'] + beta2 * vars_['true_z'] + vars_['u']

    # Create the proxies: true_z plus independent unit-variance noise
    errors = np.random.multivariate_normal(np.zeros(p), np.eye(p), N)
    z_vars = ['z' + str(i + 1) for i in range(p)]
    mismeasured_z = pd.DataFrame(errors, columns=z_vars).add(vars_['true_z'], axis=0)

    # Feature scaling (mean 0, sample standard deviation 1) before PCA
    scaled_mismeasured_z = (mismeasured_z - mismeasured_z.mean()) / mismeasured_z.std()

    # Use PCA on the scaled proxies and keep the first component
    pca_model = pca()
    pca_results = pca_model.fit_transform(scaled_mismeasured_z)
    pca_z = pca_results['PC']['PC1']

    # Average of the proxies, then add all regressors to the vars_ dataframe
    vars_['avg_mismeasured_z'] = mismeasured_z[z_vars].mean(axis=1)
    vars_[mismeasured_z.columns] = mismeasured_z
    vars_['pca_z'] = pca_z

    # Single-proxy regression; the coefficient on x is looked up by label
    # (positional params[0] on a labelled Series is deprecated in pandas).
    mismeasured_coef = [sm.OLS(vars_['y'], vars_[['x', 'z1']]).fit().params['x']]

    # All-proxy regression: OLS needs the data columns, not the list of names
    tot_vars = ['x'] + z_vars
    mismeasured_allvar_coef = [sm.OLS(vars_['y'], vars_[tot_vars]).fit().params['x']]

    tot_vars

In [None]:
# Experiment 3: 100 replications for every (beta1, beta2, covariance, p)
# combination, where p is the number of noisy proxies of true_z. Regressions are
# run on unscaled data without intercept.
# Relies on N being defined by an earlier cell.
# NOTE: the loop variables beta1 and beta2 overwrite the notebook-level values
# of the same name and keep their last value (100) after this cell has run.

# One dict per replication; the DataFrame is built once at the end rather than
# grown row by row (DataFrame.append no longer exists in pandas 2.x, and
# appending inside 57,600 iterations copies the whole frame every time).
records = []
for beta1 in [0.1, 1, 10, 100]:
    for beta2 in [0.1, 1, 10, 100]:
        for covariance in [-1, -0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75, 1]:
            # Joint covariance of (x, true_z, u); only cov(x, true_z) changes.
            vars_cov = np.array([[1, covariance, 0],
                                 [covariance, 1, 0],
                                 [0, 0, 1]])
            for p in [5, 10, 20, 50]:
                z_vars = ['z' + str(i + 1) for i in range(p)]  # proxy names z1..zp
                tot_vars = ['x'] + z_vars                      # x plus every proxy
                for k in range(100):
                    # Create variables
                    vars_ = pd.DataFrame(np.random.multivariate_normal([0, 0, 0], vars_cov, N),
                                         columns=['x', 'true_z', 'u'])
                    vars_['y'] = beta1 * vars_['x'] + beta2 * vars_['true_z'] + vars_['u']

                    # Create the proxies: true_z plus independent unit-variance noise
                    errors = np.random.multivariate_normal(np.zeros(p), np.eye(p), N)
                    mismeasured_z = pd.DataFrame(errors, columns=z_vars).add(vars_['true_z'], axis=0)

                    # Feature scaling (mean 0, sample standard deviation 1) before PCA
                    scaled_mismeasured_z = (mismeasured_z - mismeasured_z.mean()) / mismeasured_z.std()

                    # Use PCA on the scaled proxies and keep the first component
                    pca_model = pca()
                    pca_results = pca_model.fit_transform(scaled_mismeasured_z)
                    pca_z = pca_results['PC']['PC1']

                    # Collect every regressor in one frame; a single concat avoids
                    # inserting up to 50 proxy columns one at a time.
                    vars_ = pd.concat([vars_, mismeasured_z], axis=1)
                    vars_['avg_mismeasured_z'] = mismeasured_z.mean(axis=1)
                    vars_['pca_z'] = pca_z
                    y = vars_['y']

                    # Each entry is the fitted coefficient on x, looked up by label
                    # (positional params[0] on a labelled Series is deprecated in pandas).
                    records.append({
                        # single noisy proxy
                        'mismeasured_coef':
                            sm.OLS(y, vars_[['x', 'z1']]).fit().params['x'],
                        # all noisy proxies
                        'mismeasured_allvar_coef':
                            sm.OLS(y, vars_[tot_vars]).fit().params['x'],
                        # average of the proxies
                        'mismeasured_avg_coef':
                            sm.OLS(y, vars_[['x', 'avg_mismeasured_z']]).fit().params['x'],
                        # first principal component of the proxies
                        'pca_coef':
                            sm.OLS(y, vars_[['x', 'pca_z']]).fit().params['x'],
                        # true control
                        'true_val_coef':
                            sm.OLS(y, vars_[['x', 'true_z']]).fit().params['x'],
                        'covariance': float(covariance),
                        'beta1': beta1,
                        'beta2': beta2,
                        'p': p,
                    })

# Same columns as before; rows now carry a unique 0..n-1 index.
output = pd.DataFrame(records)
output

In [None]:
# Limit how many rows pandas prints for the large tables shown in later cells.
pd.options.display.max_rows = 10

In [None]:
# Mean of each estimator's x coefficient within every (covariance, beta1, beta2, p) cell.
(
    output
    .groupby(['covariance', 'beta1', 'beta2', 'p'])[
        ['pca_coef', 'mismeasured_coef', 'mismeasured_allvar_coef',
         'mismeasured_avg_coef', 'true_val_coef']
    ]
    .mean()
    .reset_index()
)

In [None]:
# Standard deviation of each estimator's x coefficient within every
# (covariance, beta1, beta2, p) cell.
(
    output
    .groupby(['covariance', 'beta1', 'beta2', 'p'])[
        ['pca_coef', 'mismeasured_coef', 'mismeasured_allvar_coef',
         'mismeasured_avg_coef', 'true_val_coef']
    ]
    .std()
    .reset_index()
)

In [None]:
# Same mean table, restricted to the replications generated with beta1 == beta2 == 1.
unit_betas = output['beta1'].eq(1) & output['beta2'].eq(1)
(
    output
    .loc[unit_betas]
    .groupby(['covariance', 'beta1', 'beta2', 'p'])[
        ['pca_coef', 'mismeasured_coef', 'mismeasured_allvar_coef',
         'mismeasured_avg_coef', 'true_val_coef']
    ]
    .mean()
    .reset_index()
)

In [None]:
# Keep the per-cell mean coefficients in `means` for the error metrics computed next.
means = (
    output
    .groupby(['covariance', 'beta1', 'beta2', 'p'])[
        ['pca_coef', 'mismeasured_coef', 'mismeasured_allvar_coef',
         'mismeasured_avg_coef', 'true_val_coef']
    ]
    .mean()
    .reset_index()
)
means

In [None]:
# Absolute percentage error of each estimator's mean coefficient against the
# beta1 used to generate the data: |mean estimate - beta1| / beta1.
ape_sources = {
    'pca_ape': 'pca_coef',
    'mismeasured_coef_ape': 'mismeasured_coef',
    'mismeasured_allvar_coef_ape': 'mismeasured_allvar_coef',
    'mismeasured_avg_coef_ape': 'mismeasured_avg_coef',
}
for ape_col, coef_col in ape_sources.items():
    means[ape_col] = means[coef_col].sub(means['beta1']).abs().div(means['beta1'])
means

In [None]:
# Summary statistics of the four error columns across all design cells.
means[['pca_ape', 'mismeasured_coef_ape',
       'mismeasured_allvar_coef_ape', 'mismeasured_avg_coef_ape']].describe()

In [None]:
# Error columns averaged over all design cells that share the same covariance.
(
    means
    .groupby('covariance')[
        ['pca_ape', 'mismeasured_coef_ape',
         'mismeasured_allvar_coef_ape', 'mismeasured_avg_coef_ape']
    ]
    .mean()
)

In [None]:
# Error columns averaged over all design cells that share the same beta1.
(
    means
    .groupby('beta1')[
        ['pca_ape', 'mismeasured_coef_ape',
         'mismeasured_allvar_coef_ape', 'mismeasured_avg_coef_ape']
    ]
    .mean()
)

In [None]:
# Error columns averaged over all design cells that share the same beta2.
(
    means
    .groupby('beta2')[
        ['pca_ape', 'mismeasured_coef_ape',
         'mismeasured_allvar_coef_ape', 'mismeasured_avg_coef_ape']
    ]
    .mean()
)

In [None]:
# Error columns averaged over all design cells that share the same number of proxies p.
(
    means
    .groupby('p')[
        ['pca_ape', 'mismeasured_coef_ape',
         'mismeasured_allvar_coef_ape', 'mismeasured_avg_coef_ape']
    ]
    .mean()
)

In [None]:
# Keep the per-cell standard deviations of the coefficients in `stds`.
stds = (
    output
    .groupby(['covariance', 'beta1', 'beta2', 'p'])[
        ['pca_coef', 'mismeasured_coef', 'mismeasured_allvar_coef',
         'mismeasured_avg_coef', 'true_val_coef']
    ]
    .std()
    .reset_index()
)
stds

In [None]:
# Applies the formula |value - beta1| / beta1 to the per-cell standard deviations.
# NOTE(review): here the value is a standard deviation, not an estimate of beta1,
# so this measures the distance between a spread and the coefficient itself.
# Possibly a relative spread (std / beta1) was intended - confirm before
# interpreting the *_ape columns of `stds`.
std_ape_sources = {
    'pca_ape': 'pca_coef',
    'mismeasured_coef_ape': 'mismeasured_coef',
    'mismeasured_allvar_coef_ape': 'mismeasured_allvar_coef',
    'mismeasured_avg_coef_ape': 'mismeasured_avg_coef',
}
for ape_col, std_col in std_ape_sources.items():
    stds[ape_col] = stds[std_col].sub(stds['beta1']).abs().div(stds['beta1'])
stds

In [None]:
# Summary statistics of the four *_ape columns of `stds` across all design cells.
stds[['pca_ape', 'mismeasured_coef_ape',
      'mismeasured_allvar_coef_ape', 'mismeasured_avg_coef_ape']].describe()

In [None]:
# *_ape columns of `stds` averaged over all design cells that share the same covariance.
(
    stds
    .groupby('covariance')[
        ['pca_ape', 'mismeasured_coef_ape',
         'mismeasured_allvar_coef_ape', 'mismeasured_avg_coef_ape']
    ]
    .mean()
)

In [None]:
# *_ape columns of `stds` averaged over all design cells that share the same beta1.
(
    stds
    .groupby('beta1')[
        ['pca_ape', 'mismeasured_coef_ape',
         'mismeasured_allvar_coef_ape', 'mismeasured_avg_coef_ape']
    ]
    .mean()
)

In [None]:
# *_ape columns of `stds` averaged over all design cells that share the same beta2.
(
    stds
    .groupby('beta2')[
        ['pca_ape', 'mismeasured_coef_ape',
         'mismeasured_allvar_coef_ape', 'mismeasured_avg_coef_ape']
    ]
    .mean()
)

In [None]:
# *_ape columns of `stds` averaged over all design cells that share the same number of proxies p.
(
    stds
    .groupby('p')[
        ['pca_ape', 'mismeasured_coef_ape',
         'mismeasured_allvar_coef_ape', 'mismeasured_avg_coef_ape']
    ]
    .mean()
)

In [None]:
# Write the per-design-cell summary tables `means` and `stds` to CSV files.
# NOTE(review): the destination is an absolute folder on one user's machine, so
# this cell is expected to fail wherever that folder does not exist - consider
# a configurable output directory.
means.to_csv('C://Users//paulo//Documents//Econometrics and ML//Final Project//means_coefs.csv')
stds.to_csv('C://Users//paulo//Documents//Econometrics and ML//Final Project//stds_coefs.csv')