In [1]:
import csv
import numpy as np
import pandas as pd
from pandas import DataFrame
from DATA225utils import make_connection, dataframe_query

In [2]:
conn = make_connection(config_file = 'salary.ini')
cursor = conn.cursor()

In [3]:
def execute_multiple_SQL(cursor, sql, trace=False):
    """
    Use the cursor to execute multiple SQL statements.
    Print an execution trace if trace=True.
    """
    for crsr in cursor.execute(sql, multi=True):
        if crsr.with_rows:
            results = crsr.fetchall()
            if trace:
                print(crsr.statement)
                print('  ==> ', results)
        else:
            if trace:
                print(crsr.statement)
                if crsr.rowcount > 0:
                    print(f'  ==> {crsr.rowcount} row(s) affected.')

# 1. On the client side, without the records containing the missing values, calculate and print a pairwise correlation matrix.

In [4]:
df = pd.read_csv('Salary.csv')
values = list(df.itertuples(index=False, name=None))

In [5]:
df2 = df.dropna(subset=['Salary'])
correlation_matrix = df2.corr()
correlation_matrix

Unnamed: 0,Age,StartAge,Exp,Salary
Age,1.0,0.420664,0.979881,0.92315
StartAge,0.420664,1.0,0.231135,0.27164
Exp,0.979881,0.231135,1.0,0.930249
Salary,0.92315,0.27164,0.930249,1.0


# 2. On the server side, use SQL to calculate the linear regression coefficients for the data without the missing values. Query for and print the coefficients on the client side.

In [6]:
sql = ( """
    START TRANSACTION;

    DROP VIEW IF EXISTS base;
    DROP VIEW IF EXISTS regression_1;
    DROP VIEW IF EXISTS residuals_1;
    DROP VIEW IF EXISTS regression_2;
    DROP VIEW IF EXISTS residuals_2;
    DROP VIEW IF EXISTS multiple_regression_3;

    CREATE VIEW base AS
        SELECT
            Age,
            Age - AVG(Age) OVER() AS x1_centered,

            StartAge,
            StartAge - AVG(StartAge) OVER() AS x2_centered,

            Exp,
            Exp - AVG(Exp) OVER() AS x3_centered,

            Salary,
            Salary - AVG(Salary) OVER()   AS y_centered

        FROM salaryb;

    CREATE VIEW regression_1 AS
        SELECT
            AVG(Age) - AVG(StartAge) * SUM(x1_centered * x2_centered) 
                / SUM(x2_centered * x2_centered)                AS x1x2_const,
            SUM(x1_centered * x2_centered) 
                / SUM(x2_centered * x2_centered)                AS x1x2_coef,
            AVG(Age) - AVG(Exp) * SUM(x1_centered * x3_centered) 
                / SUM(x3_centered * x3_centered)                AS x1x3_const,
            SUM(x1_centered * x3_centered) 
                / SUM(x3_centered * x3_centered)                AS x1x3_coef,
            AVG(StartAge) - AVG(Age) * SUM(x2_centered * x1_centered) 
                / SUM(x1_centered * x1_centered)                AS x2x1_const,
            SUM(x2_centered * x1_centered) 
                / SUM(x1_centered * x1_centered)                AS x2x1_coef,
            AVG(StartAge) - AVG(Exp) * SUM(x2_centered * x3_centered) 
                / SUM(x3_centered * x3_centered)                AS x2x3_const,
            SUM(x2_centered * x3_centered) 
                / SUM(x3_centered * x3_centered)                AS x2x3_coef,
            AVG(Exp) - AVG(Age) * SUM(x3_centered * x1_centered) 
                / SUM(x1_centered * x1_centered)                AS x3x1_const,
            SUM(x3_centered * x1_centered) 
                / SUM(x1_centered * x1_centered)                AS x3x1_coef,
            AVG(Exp) - AVG(StartAge) * SUM(x3_centered * x2_centered) 
                / SUM(x2_centered * x2_centered)                AS x3x2_const,
            SUM(x3_centered * x2_centered) 
                / SUM(x2_centered * x2_centered)                AS x3x2_coef

        FROM base;

    CREATE VIEW residuals_1 AS
        SELECT
            Salary,
            y_centered,

            Age,
            x1_centered,
            StartAge,
            x2_centered,
            Exp,
            x3_centered,

            Age - (SELECT x1x2_coef  FROM regression_1) * StartAge 
               - (SELECT x1x2_const FROM regression_1)       AS x1x2_resid,
            Age - (SELECT x1x3_coef  FROM regression_1) * Exp 
               - (SELECT x1x3_const FROM regression_1)       AS x1x3_resid,
            StartAge - (SELECT x2x1_coef  FROM regression_1) * Age 
               - (SELECT x2x1_const FROM regression_1)       AS x2x1_resid,
            StartAge - (SELECT x2x3_coef  FROM regression_1) * Exp 
               - (SELECT x2x3_const FROM regression_1)       AS x2x3_resid,
            Exp - (SELECT x3x1_coef  FROM regression_1) * Age 
               - (SELECT x3x1_const FROM regression_1)       AS x3x1_resid,
            Exp - (SELECT x3x2_coef  FROM regression_1) * StartAge 
               - (SELECT x3x2_const FROM regression_1)       AS x3x2_resid

        FROM base;

    CREATE VIEW regression_2 AS
        SELECT
            SUM(x1_centered * x2x3_resid) 
                / SUM(x2x3_resid * x2x3_resid)                  AS x1_x2x3_coef,
            SUM(x1_centered * x3x2_resid) 
                / SUM(x3x2_resid * x3x2_resid)                  AS x1_x3x2_coef,
            AVG(Age) - AVG(StartAge) * SUM(x1_centered * x2x3_resid) 
                          / SUM(x2x3_resid * x2x3_resid)
                    - AVG(Exp) * SUM(x1_centered * x3x2_resid) 
                          / SUM(x3x2_resid * x3x2_resid)        AS x1_const,
            SUM(x2_centered * x1x3_resid) 
                / SUM(x1x3_resid * x1x3_resid)                  AS x2_x1x3_coef,
            SUM(x2_centered * x3x1_resid) 
                / SUM(x3x1_resid * x3x1_resid)                  AS x2_x3x1_coef,
            AVG(StartAge) - AVG(Age) * SUM(x2_centered * x1x3_resid) 
                          / SUM(x1x3_resid * x1x3_resid)
                    - AVG(Exp) * SUM(x2_centered * x3x1_resid) 
                          / SUM(x3x1_resid * x3x1_resid)        AS x2_const,
            SUM(x3_centered * x2x1_resid) 
                / SUM(x2x1_resid * x2x1_resid)                  AS x3_x2x1_coef,
            SUM(x3_centered * x1x2_resid) 
                / SUM(x1x2_resid * x1x2_resid)                  AS x3_x1x2_coef,
            AVG(Exp) - AVG(Age) * SUM(x3_centered * x1x2_resid) 
                          / SUM(x1x2_resid * x1x2_resid)
                    - AVG(StartAge) * SUM(x3_centered * x2x1_resid) 
                          / SUM(x2x1_resid * x2x1_resid)        AS x3_const

        FROM residuals_1;

    CREATE VIEW residuals_2 AS
        SELECT
            Salary,
            y_centered,

            Age,
            x1_centered,
            StartAge,
            x2_centered,
            Exp,
            x3_centered,

            Age - (SELECT x1_x2x3_coef FROM regression_2) * StartAge
               - (SELECT x1_x3x2_coef FROM regression_2) * Exp
               - (SELECT x1_const FROM regression_2)           AS x1_resid,
            StartAge - (SELECT x2_x1x3_coef FROM regression_2) * Age
               - (SELECT x2_x3x1_coef FROM regression_2) * Exp
               - (SELECT x2_const FROM regression_2)           AS x2_resid,
            Exp - (SELECT x3_x1x2_coef FROM regression_2) * Age
               - (SELECT x3_x2x1_coef FROM regression_2) * StartAge
               - (SELECT x3_const FROM regression_2)           AS x3_resid

        FROM base;

    CREATE VIEW multiple_regression_3 AS
        SELECT
            AVG(Salary) - AVG(Age) * SUM(y_centered * x1_resid) 
                         / SUM(x1_resid * x1_resid)
                   - AVG(StartAge) * SUM(y_centered * x2_resid) 
                         / SUM(x2_resid * x2_resid)
                   - AVG(Exp) * SUM(y_centered * x3_resid) 
                        / SUM(x3_resid * x3_resid)        AS beta0,
            SUM(y_centered * x1_resid) 
                / SUM(x1_resid * x1_resid)                AS beta1,
            SUM(y_centered * x2_resid) 
                / SUM(x2_resid * x2_resid)                AS beta2,
            SUM(y_centered * x3_resid) 
                / SUM(x3_resid * x3_resid)                AS beta3

        FROM residuals_2;
        
    SELECT * 
    FROM multiple_regression_3
    INTO @beta_0, @beta_1, @beta_2, @beta_3;

    UPDATE salaryb
    SET salary := Round(@beta_0 + @beta_1*Age + @beta_2*StartAge + @beta_3*Exp, 2)
    WHERE salary IS NULL;
        
    COMMIT;
    """
      )

In [7]:
execute_multiple_SQL(cursor, sql)
cursor.execute('SELECT * FROM multiple_regression_3')
result = cursor.fetchone()

result

(-11389.939794929218, 8963.079132839946, -7284.84216874243, -2402.595698899126)

In [8]:
β0, β1, β2, β3 = result

df = DataFrame([[β0, β1, β2, β3]])
df.columns = ['β0', 'β1', 'β2', 'β3']

display(df)

Unnamed: 0,β0,β1,β2,β3
0,-11389.939795,8963.079133,-7284.842169,-2402.595699


# 3. On the server side, use SQL code to replace each missing value with an estimate calculated from the regression equation

In [10]:
'''
sql = ("""
        SELECT * 
        FROM multiple_regression_3
        INTO @beta_0, @beta_1, @beta_2, @beta_3;
        
        UPDATE salaryb
        SET salary := Round(@beta_0 + @beta_1*Age + @beta_2*StartAge + @beta_3*Exp,2)
        WHERE salary IS NULL;
        
        COMMIT;
        """)
''';
# We ran this code at the end of question 2.


# 4. Download the cleaned data and redo step 1. Note any changes in the results.

In [11]:
df = pd.read_csv('SalaryB.csv')
values = list(df.itertuples(index=False, name=None))
df2 = df.dropna(subset=['Salary'])
correlation_matrix = df2.corr()
correlation_matrix

Unnamed: 0,Age,StartAge,Exp,Salary
Age,1.0,0.424971,0.978703,0.926727
StartAge,0.424971,1.0,0.231721,0.26848
Exp,0.978703,0.231721,1.0,0.934924
Salary,0.926727,0.26848,0.934924,1.0


#### If we replace the null values, the relationship between startage and salary decreased at a smaller number. The relationship between Age and salary and Years of Experience and Salary remained roughly the same.