In [1]:
import pandas as pd
from DATA225utils import make_connection, dataframe_query

In [2]:
conn = make_connection(config_file = 'salary.ini')
cursor = conn.cursor()

# 1. On the client side, without the records containing the missing values, calculate and print a pairwise correlation matrix of your four variables

In [30]:
df = pd.read_csv('Salary Data.csv')
# change Gender column to binary variables
df['Gender'] = df['Gender'].apply(lambda x: 1 if x == "Male" else 0)
df.head()

Unnamed: 0,Age,Gender,Years of Experience,Salary
0,32,1,5.0,90000.0
1,28,0,3.0,65000.0
2,45,1,15.0,
3,36,0,7.0,60000.0
4,52,1,20.0,200000.0


In [31]:
df2 = df.dropna(subset=['Salary'])
correlation_matrix = df2.corr()
correlation_matrix

Unnamed: 0,Age,Gender,Years of Experience,Salary
Age,1.0,-0.007703,0.979881,0.92315
Gender,-0.007703,1.0,0.017231,0.069547
Years of Experience,0.979881,0.017231,1.0,0.930249
Salary,0.92315,0.069547,0.930249,1.0


# 2. On the server side, without the missing values, use SQL to calculate the overall average of your target variable and its averages within the major subgroups (such as the Titanic passenger classes). Query for and print the averages on the client side.

In [32]:
def print_overall_salary_average():
    _, df = dataframe_query(conn, 
        """
        SELECT round(AVG(Salary),2) AS "Overall Salary Average"
        FROM `salary data`
        WHERE Salary IS NOT NULL
        """
                           )

    display(df)
print_overall_salary_average()

Unnamed: 0,Overall Salary Average
0,99909.33


In [33]:
def print_sub_average():
    _, df = dataframe_query(conn, 
        """
        SELECT Age, Round(AVG(Salary), 2) AS "Salary Average"
        FROM `salary data`
        WHERE Salary IS NOT NULL
        GROUP BY Age
        ORDER BY Age
        """
                           )
    display(df)
print_sub_average()

Unnamed: 0,Age,Salary Average
0,23,35000.0
1,25,35000.0
2,26,39166.67
3,27,44444.44
4,28,41666.67
5,29,41597.37
6,30,46666.67
7,31,55000.0
8,32,61000.0
9,33,71666.67


# 3. On the server side, use SQL to calculate the percentages of missing target variable values in ever smaller subgroups. Query for and print the percentages on the client side.

In [34]:
def print_age_counts_by_survived():
    _, df = dataframe_query(conn, 
        """
        SELECT gender, age,
        COUNT(*) AS total_records,
        SUM(CASE WHEN salary IS NULL THEN 1 ELSE 0 END) AS missing_values,
        (SUM(CASE WHEN salary IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*)) AS percentage_missing
        FROM `salary data`
        GROUP BY gender,age
        ORDER BY gender, age;
        """
                           )

    display(df)
print_age_counts_by_survived()

Unnamed: 0,gender,age,total_records,missing_values,percentage_missing
0,Female,23,1,0,0.0
1,Female,25,2,0,0.0
2,Female,26,3,0,0.0
3,Female,27,3,0,0.0
4,Female,28,4,0,0.0
5,Female,29,14,0,0.0
6,Female,30,10,0,0.0
7,Female,31,11,3,27.27273
8,Female,32,4,0,0.0
9,Female,33,6,1,16.66667


# 4. On the client side, determine the smallest subgroup whose averages you can use to replace the missing values

In [36]:
subgroup_missing_percentage = df.groupby('Gender')['Salary'].apply(lambda x: x.isnull().mean())

smallest_subgroup = subgroup_missing_percentage.idxmin()

print(f"The smallest subgroup for imputation is: {smallest_subgroup}")

The smallest subgroup for imputation is: 0


From the results of this code as well as the code from part 3, the smallest subgroup for which we are imputing the missing values does not exist, so we decide to use the next smallest group of Gender.

# 5. On the server side, use SQL code to replace each missing value with the appropriate average.

In [35]:
def replace_missing_salaries():
    cursor.execute('SET SQL_SAFE_UPDATES = 0')

    cursor.execute( 
        """
        UPDATE `salary data`
        SET salary = (
            SELECT Round(AVG(salary),2) 
            FROM (SELECT * FROM `salary data`) AS s2 
            WHERE s2.gender = `salary data`.gender
        )
        WHERE salary IS NULL;
        """
                  )

    conn.commit()
replace_missing_salaries()

# 6. Download the cleaned data and redo steps 1 and 2. Note any changes in the results

### Step 1:

In [39]:
df = pd.read_csv('Salary2.csv')
# change Gender column to binary variables
df['Gender'] = df['Gender'].apply(lambda x: 1 if x == "Male" else 0)
correlation_matrix = df.corr()
correlation_matrix

Unnamed: 0,Age,Gender,Years of Experience,Salary
Age,1.0,-0.020324,0.978703,0.864235
Gender,-0.020324,1.0,0.002169,0.074201
Years of Experience,0.978703,0.002169,1.0,0.875101
Salary,0.864235,0.074201,0.875101,1.0


#### After replacing the null values, the correlation between Age and Salary decreased, the correlation between Years of Experience and Salary decreased, and the correlation between Gender and Salary increased.

### Step 2:

In [40]:
def print_overall_salary_average():
    _, df = dataframe_query(conn, 
        """
        SELECT round(AVG(Salary),2) AS "Overall Salary Average"
        FROM `salary data`
        WHERE Salary IS NOT NULL
        """
                           )

    display(df)
print_overall_salary_average()

Unnamed: 0,Overall Salary Average
0,99992.49


In [41]:
def print_sub_average():
    _, df = dataframe_query(conn, 
        """
        SELECT Age, Round(AVG(Salary), 2) AS "Salary Average"
        FROM `salary data`
        WHERE Salary IS NOT NULL
        GROUP BY Age
        ORDER BY Age
        """
                           )
    display(df)
print_sub_average()

Unnamed: 0,Age,Salary Average
0,23,35000.0
1,24,103194.88
2,25,35000.0
3,26,48313.55
4,27,44444.44
5,28,46399.61
6,29,52309.98
7,30,46666.67
8,31,63226.67
9,32,68032.48


#### Both the overall average and the average by age has increased after replacing our null values