In [11]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import boxcox
from scipy.stats import jarque_bera
from scipy.stats import normaltest
from scipy.stats import ttest_ind
from scipy.stats.mstats import winsorize
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

%matplotlib inline
sns.set()

warnings.filterwarnings('ignore')

In [12]:
# Connection settings for the PostgreSQL database that holds the
# `useducation` table. Each value can be overridden with an environment
# variable so credentials do not have to be edited in (or committed with) the
# notebook; the defaults reproduce the notebook's original settings.
postgres_user = os.environ.get('USEDUCATION_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('USEDUCATION_DB_PASSWORD', '7*.8G9QH21')
postgres_host = os.environ.get('USEDUCATION_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('USEDUCATION_DB_PORT', '5432')
postgres_db = os.environ.get('USEDUCATION_DB_NAME', 'useducation')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

# Load the whole table into a DataFrame. The engine is disposed in `finally`
# so its connections are released even when the query raises.
try:
    df_used = pd.read_sql_query('select * from useducation', con=engine)
finally:
    engine.dispose()

In [17]:
# Work on a copy so the raw query result in `df_used` stays untouched and this
# cell gives the same result every time it is re-run.
df_used2 = df_used.copy()

# Columns whose missing values are filled by interpolation.
columns = ["ENROLL", "TOTAL_REVENUE", "FEDERAL_REVENUE", "STATE_REVENUE", "LOCAL_REVENUE", "TOTAL_EXPENDITURE",
           "INSTRUCTION_EXPENDITURE", "SUPPORT_SERVICES_EXPENDITURE", "OTHER_EXPENDITURE", "CAPITAL_OUTLAY_EXPENDITURE", "GRADES_PK_G",
           "GRADES_KG_G", "GRADES_4_G", "GRADES_8_G", "GRADES_12_G", "GRADES_1_8_G", "GRADES_9_12_G", "GRADES_ALL_G"]

# Interpolate missing values and assign the result back to the frame.
# Calling fillna(..., inplace=True) on a `.loc` selection acts on an
# intermediate Series and is not guaranteed to write through to `df_used2`
# (pandas warns about this pattern, and under Copy-on-Write it has no effect).
# interpolate() only replaces NaNs, so explicit assignment yields the values
# the fillna(interpolate()) form was meant to produce.
# NOTE(review): interpolation follows the frame's row order; confirm that
# adjacent rows are comparable observations before relying on filled values.
for column in columns:
    df_used2[column] = df_used2[column].interpolate()

# Drop every row that still has a missing value in any column.
df_used2 = df_used2.dropna()


## 1. Create a new score variable from the weighted averages of all score variables in the dataset. Notice that the number of students in the 4th grade isn't the same as the number of students in the 8th grade, so you should weight the scores appropriately!

In [18]:
# Per-grade score: the simple mean of the math and reading averages.
grade_4_score = (df_used2["AVG_MATH_4_SCORE"] + df_used2["AVG_READING_4_SCORE"]) * 0.5
grade_8_score = (df_used2["AVG_MATH_8_SCORE"] + df_used2["AVG_READING_8_SCORE"]) * 0.5

# The GRADES_4_G / GRADES_8_G columns act as the weights of the two grades.
grade_4_weight = df_used2["GRADES_4_G"]
grade_8_weight = df_used2["GRADES_8_G"]

# Weighted average of the two per-grade scores.
df_used2["OVERALL_SCORE"] = (
    grade_4_weight * grade_4_score + grade_8_weight * grade_8_score
) / (grade_4_weight + grade_8_weight)


## 2. What are the correlations between this newly created score variable and the expenditure types? Which 1 of the expenditure types is more correlated than the others?

In [19]:
# Correlation matrix between the overall score and each expenditure variable.
score_and_expenditure_cols = [
    "OVERALL_SCORE",
    "TOTAL_EXPENDITURE",
    "INSTRUCTION_EXPENDITURE",
    "SUPPORT_SERVICES_EXPENDITURE",
    "OTHER_EXPENDITURE",
    "CAPITAL_OUTLAY_EXPENDITURE",
]
df_used2.loc[:, score_and_expenditure_cols].corr()

Unnamed: 0,OVERALL_SCORE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,SUPPORT_SERVICES_EXPENDITURE,OTHER_EXPENDITURE,CAPITAL_OUTLAY_EXPENDITURE
OVERALL_SCORE,1.0,0.085529,0.09854,0.090792,0.018222,0.005444
TOTAL_EXPENDITURE,0.085529,1.0,0.99117,0.992755,0.947953,0.925651
INSTRUCTION_EXPENDITURE,0.09854,0.99117,1.0,0.977085,0.910612,0.887075
SUPPORT_SERVICES_EXPENDITURE,0.090792,0.992755,0.977085,1.0,0.95152,0.906012
OTHER_EXPENDITURE,0.018222,0.947953,0.910612,0.95152,1.0,0.921163
CAPITAL_OUTLAY_EXPENDITURE,0.005444,0.925651,0.887075,0.906012,0.921163,1.0


INSTRUCTION_EXPENDITURE is the variable most highly correlated with our new OVERALL_SCORE variable, but the correlation is still low (~0.099).

## 3. Now, apply PCA to the 4 expenditure types. How much of the total variance is explained by the 1st component?

In [22]:
# The four expenditure types that feed the PCA.
expenditure_cols = [
    "INSTRUCTION_EXPENDITURE",
    "SUPPORT_SERVICES_EXPENDITURE",
    "OTHER_EXPENDITURE",
    "CAPITAL_OUTLAY_EXPENDITURE",
]

# Standardize the inputs before fitting the PCA.
X = StandardScaler().fit_transform(df_used2[expenditure_cols])

# Keep only the first principal component and store it as a new column.
sklearn_pca = PCA(n_components=1)
first_component = sklearn_pca.fit_transform(X)
df_used2["PCA_1"] = first_component

# Share of the total variance captured by the retained component.
print(sklearn_pca.explained_variance_ratio_)

[0.94430736]


Approximately 94% of the total variance is explained by the first component.

## 4. What is the correlation between the overall score variable and the 1st principal component?

In [23]:
# Correlation between the overall score and the first principal component.
score_and_component_cols = ["OVERALL_SCORE", "PCA_1"]
df_used2.loc[:, score_and_component_cols].corr()

Unnamed: 0,OVERALL_SCORE,PCA_1
OVERALL_SCORE,1.0,0.055135
PCA_1,0.055135,1.0


The correlation between the overall score variable and the 1st principal component is ~ 0.055.

## 5. If you were to choose the best variables for your model, would you prefer using the 1st principal component instead of the expenditure variables? Why?

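I don't think so, although there is a trade-off. On one hand, the expenditure variables are all highly correlated with each other, which can make a model's results unstable, and the 1st principal component avoids that by summarizing them in a single variable. On the other hand, the expenditure variables show better correlation with the overall score variable (up to ~0.099 for INSTRUCTION_EXPENDITURE) than the 1st principal component does (~0.055). For that reason I would keep a single expenditure variable such as INSTRUCTION_EXPENDITURE rather than use the 1st principal component.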