# Analysis by LEA
In this notebook, about 7,800 school districts (local education agencies, LEAs) are analysed. The data include male and female enrollment in various programs and high-school-level courses. Districts with low enrollment are removed from the analysis. The analysis consists of principal component analysis (PCA), K-Means clustering, linear discriminant analysis (LDA), correlation and covariance matrices, and a multiple regression that predicts the poverty rate from enrollment.

In [None]:
# SQL that builds one row per LEA, grouped by rls.leaid.
# - Each male/female course or program count is summed over the LEA's rows;
#   GREATEST(col, 0) clamps negative values to 0 before summing
#   (presumably negatives are reserved/missing-data codes -- confirm against
#   the data dictionary).
# - The saipe population columns are combined with avg().
# - Every join is an inner join (on combokey, and on leaid for saipe), so a
#   combokey missing from any joined table drops out of the result.
# - Only rows with chr.hs_only = TRUE are kept.
query="""
SELECT 
	rls.leaid
	,min(rls.lea_name) AS lea_name
	,min(rls.lea_state) as lea_state
	,sum(GREATEST(advmath.tot_mathenr_advm_m,0)) AS advmath_m_enr
	,sum(GREATEST(advmath.tot_mathenr_advm_f,0)) AS advmath_f_enr
	,sum(GREATEST(advpl.TOT_APEXAM_NONE_M,0)) AS advpl_m_noexam
	,sum(GREATEST(advpl.TOT_APEXAM_NONE_F,0)) AS advpl_f_noexam
	,sum(GREATEST(alg1.TOT_ALGPASS_GS0910_M,0)) AS alg1_m_0910_passed
	,sum(GREATEST(alg1.TOT_ALGPASS_GS1112_M,0)) AS alg1_m_1112_passed
	,sum(GREATEST(alg1.TOT_ALGPASS_GS0910_F,0)) AS alg1_f_0910_passed
	,sum(GREATEST(alg1.TOT_ALGPASS_GS1112_F,0)) AS alg1_f_1112_passed
	,sum(GREATEST(alg2.tot_mathenr_alg2_m,0)) AS alg2_m_enr
	,sum(GREATEST(alg2.tot_mathenr_alg2_f,0)) AS alg2_f_enr
	,sum(GREATEST(bio.TOT_SCIENR_BIOL_M,0)) AS bio_m_enr
	,sum(GREATEST(bio.TOT_SCIENR_BIOL_F,0)) AS bio_f_enr
	,sum(GREATEST(calc.TOT_MATHENR_CALC_M,0)) AS calc_m_enr
	,sum(GREATEST(calc.TOT_MATHENR_CALC_F,0)) AS calc_f_enr
	,sum(GREATEST(chem.TOT_SCIENR_CHEM_M,0)) AS chem_m_enr
	,sum(GREATEST(chem.TOT_SCIENR_CHEM_F,0)) AS chem_f_enr
	,sum(GREATEST(dual.TOT_DUAL_M,0)) AS dual_m_enr
	,sum(GREATEST(dual.TOT_DUAL_F,0)) AS dual_f_enr
	,sum(GREATEST(enr.tot_enr_m,0)) AS total_m_enr
	,sum(GREATEST(enr.tot_enr_f,0)) AS total_f_enr
	,sum(GREATEST(enr.SCH_ENR_LEP_M,0)) AS enr_lep_m
	,sum(GREATEST(enr.SCH_ENR_LEP_F,0)) AS enr_lep_f
	,sum(GREATEST(enr.SCH_ENR_504_M,0)) AS enr_504_m
	,sum(GREATEST(enr.SCH_ENR_504_F,0)) AS enr_504_f
	,sum(GREATEST(enr.SCH_ENR_IDEA_M,0)) AS enr_idea_m
	,sum(GREATEST(enr.SCH_ENR_IDEA_F,0)) AS enr_idea_f
	,sum(GREATEST(geo.TOT_MATHENR_GEOM_M,0)) AS geo_m_enr
	,sum(GREATEST(geo.TOT_MATHENR_GEOM_F,0)) AS geo_f_enr
	,sum(GREATEST(phys.TOT_SCIENR_PHYS_M,0)) AS phys_m_enr
	,sum(GREATEST(phys.TOT_SCIENR_PHYS_F,0)) AS phys_f_enr
	,sum(GREATEST(satact.TOT_SATACT_M,0)) AS satact_m
	,sum(GREATEST(satact.TOT_SATACT_F,0)) AS satact_f
	,avg(saipe.totalpopulation) AS totalpopulation 
	,avg(saipe.population5_17) AS population5_17
	,avg(saipe.population5_17inpoverty) AS population5_17inpoverty
FROM ref_schema.ref_lea_sch rls
JOIN data_schema.sch_advancedmathematics advmath ON advmath.combokey = rls.combokey
JOIN data_schema.sch_advancedplacement advpl ON advpl.combokey = rls.combokey
JOIN data_schema.sch_algebrai alg1 ON alg1.combokey = rls.combokey
JOIN data_schema.sch_algebraii alg2 ON alg2.combokey = rls.combokey 
JOIN data_schema.sch_biology bio ON bio.combokey = rls.combokey 
JOIN data_schema.sch_calculus calc ON calc.combokey = rls.combokey 
JOIN data_schema.sch_chemistry chem ON chem.combokey = rls.combokey 
JOIN data_schema.sch_dualenrollment dual ON dual.combokey = rls.combokey 
JOIN data_schema.sch_enrollment enr ON enr.combokey = rls.combokey 
JOIN data_schema.sch_geometry geo ON geo.combokey = rls.combokey 
JOIN data_schema.sch_physics phys ON phys.combokey = rls.combokey 
JOIN data_schema.sch_satandact satact ON satact.combokey = rls.combokey 
JOIN data_schema.sch_schoolcharacteristics chr ON chr.combokey = rls.combokey 
JOIN data_schema.saipe_ussd17 saipe ON saipe.leaid = rls.leaid
WHERE chr.hs_only = TRUE
group by rls.leaid
order by leaid;
"""

In [None]:
import os
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Connection settings. Each value can be overridden through the standard
# libpq environment variable, so credentials do not have to be edited into
# the notebook; the fallbacks reproduce the original hard-coded settings.
# NOTE(review): the fallback password still lives in source -- prefer
# setting PGPASSWORD and removing the default.
db_params = {
    "database": os.environ.get("PGDATABASE", "postgres"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "pwd123"),
    "host": os.environ.get("PGHOST", "postgres-db"),
    "port": os.environ.get("PGPORT", "5432"),
}
# Percent-encode user and password so characters such as '@', ':' or '/'
# cannot corrupt the URL.
connection_string = (
    f"postgresql://{quote_plus(db_params['user'])}:{quote_plus(db_params['password'])}"
    f"@{db_params['host']}:{db_params['port']}/{db_params['database']}"
)
engine = create_engine(connection_string)

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.io as pio
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from kneed import KneeLocator
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [None]:
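# Optional alternative to the database query below: uncomment to load the
# aggregated table from a local CSV file instead.
# df = pd.read_csv('LEA_agg_data.csv')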

In [None]:
# Run the aggregation query through the SQLAlchemy engine and load the
# result into a DataFrame.
df = pd.read_sql(query, engine)

In [None]:
# Show the column names of the loaded table.
df.columns

In [None]:
# Summary statistics of the loaded table.
df.describe()

In [None]:
# Identifier, population and total columns that are never treated as
# feature columns. 'total_enrollment' is only created in a later cell; it is
# listed here so that the later cells skip it as well.
exclude_cols = [
    'leaid',
    'lea_name',
    'lea_state',
    'totalpopulation',
    'population5_17',
    'population5_17inpoverty',
    'total_enrollment',
]

# Floor every remaining column at zero.
columns_to_modify = df.columns.difference(exclude_cols)
clipped_counts = df[columns_to_modify].clip(lower=0)
df[columns_to_modify] = clipped_counts

In [None]:
# Total headcount per LEA = male + female enrollment.
enrollment_sum = df['total_m_enr'] + df['total_f_enr']
df['total_enrollment'] = enrollment_sum

# Express every non-excluded column (including total_m_enr / total_f_enr
# themselves) as a share of total enrollment. NaN results, e.g. 0/0 for an
# LEA with no enrollment, are replaced by 0.
columns_to_modify = df.columns.difference(exclude_cols)
enrollment_shares = df[columns_to_modify].div(enrollment_sum, axis=0)
df[columns_to_modify] = enrollment_shares.fillna(0)

In [None]:
# Preview the LEAs whose male + female enrollment is 10 or fewer; these are
# the rows removed by the filter in the next cell.
df[enrollment_sum <= 10][['total_enrollment','leaid',
                         'lea_state','totalpopulation']]

In [None]:
# Keep only LEAs with more than 10 enrolled students and renumber the rows
# from 0 so positional and label indexing agree afterwards.
df = df[enrollment_sum > 10].reset_index(drop=True)

In [None]:
# Share of the 5-17 population that is in poverty. Despite the "percent" in
# the name this is a fraction; it is multiplied by 100 where it is displayed.
df['5_17_poverty_percent'] = df['population5_17inpoverty']/df['population5_17']

In [None]:
# Show the columns that are not in exclude_cols. '5_17_poverty_percent' is
# not in exclude_cols, so it is part of this set.
df.columns.difference(exclude_cols)

In [None]:
# Preview the first rows of the prepared table.
df.head()

# PCA

In [None]:
# Per-LEA values used to build hover labels in the plots below.
# The three identifier columns are taken as plain arrays ...
ids, lea_names, states = (
    df[column].values for column in ('leaid', 'lea_name', 'lea_state')
)
# ... while the population figures stay as pandas Series.
pop5_17, pov5_17 = df['population5_17'], df['5_17_poverty_percent']

In [None]:
ids = df['leaid'].values

# Step 1: Subset the DataFrame
# Feature columns are all columns not listed in exclude_cols.
feature_cols = df.columns.difference(exclude_cols)
subset_df = df[feature_cols]
# PCA only uses LEAs with more than 15 enrolled students. The filtered rows
# are kept so that hover labels come from the same rows as the plotted points.
pca_rows = df[df['total_enrollment'] > 15]
for_pca_use = pca_rows[feature_cols]

# Step 2: Standardize the data
scaler = StandardScaler()
# All rows of df; reused by the clustering / LDA cells further down.
standardized_data = scaler.fit_transform(subset_df)
# PCA subset only. This refits `scaler` on the filtered rows.
pca_data = scaler.fit_transform(for_pca_use)

# Step 3: Compute covariance matrix, eigenvectors, and eigenvalues for PCA
cov_matrix = np.cov(pca_data, rowvar=False)
# A covariance matrix is symmetric, so use eigh: it returns real eigenvalues
# and eigenvectors, whereas the general-purpose eig may return complex output
# because of round-off. (The sign of each eigenvector is arbitrary.)
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Sort eigenvectors by eigenvalue size (descending order)
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvectors = eigenvectors[:, sorted_indices]
eigenvalues = eigenvalues[sorted_indices]

# Step 4: Project data onto the top 3 principal components
projected_data = np.dot(pca_data, eigenvectors[:, :3])

# Step 5: Create an interactive 3D plot using Plotly
# Hover label per point: LEA id, state, LEA name, 5-17 population and 5-17
# poverty share, read from the same filtered rows that were projected.
pca_hover_text = [
    f"LEA ID: {i}, {state}<br>LEA Name: {lea}<br>5_17 Pop: {int(pop)}<br>5_17 Pov: {100*pov:.2f}%"
    for i, lea, state, pop, pov in zip(
        pca_rows['leaid'],
        pca_rows['lea_name'],
        pca_rows['lea_state'],
        pca_rows['population5_17'],
        pca_rows['5_17_poverty_percent'],
    )
]

trace = go.Scatter3d(
    x=projected_data[:, 0],
    y=projected_data[:, 1],
    z=projected_data[:, 2],
    mode='markers',
    marker=dict(size=5, color='blue', opacity=0.8),
    text=pca_hover_text,
    hoverinfo="text+x+y+z"
)

# Axis ranges: the data range of each component widened by 10% on both
# sides so that the extreme points are not drawn on the edge of the scene.
axis_ranges = []
for component in range(3):
    low = projected_data[:, component].min()
    high = projected_data[:, component].max()
    extension = 0.1 * (high - low)
    axis_ranges.append([low - extension, high + extension])
PC1_range, PC2_range, PC3_range = axis_ranges

layout = go.Layout(
    title="Data Projected on Top 3 Principal Components",
    scene=dict(
        xaxis=dict(
            title="Principal Component 1",
            range=PC1_range
        ),
        yaxis=dict(
            title="Principal Component 2",
            range=PC2_range
        ),
        zaxis=dict(
            title="Principal Component 3",
            range=PC3_range
        )
    )
)

fig = go.Figure(data=[trace], layout=layout)

pio.show(fig)

In [None]:
# The three LEAs with the largest absolute score on the first principal
# component. projected_data only has rows for LEAs with total_enrollment > 15,
# so its row positions must be looked up in that same filtered frame rather
# than in the full df.
pca_rows = df[df['total_enrollment'] > 15]
extreme_PC1 = pca_rows.iloc[np.argsort(np.abs(projected_data[:, 0]))[-3:]]
extreme_PC1.T

In [None]:
# First two principal directions: the first two columns of the sorted
# eigenvector matrix.
pc1, pc2 = eigenvectors[:, :2].T

In [None]:
# Print each feature's entry in the first principal direction, scaled by 100.
feature_names = df.columns.difference(exclude_cols)
print(f"{'Column Name'.ljust(20)}: PC1 Weight")
for position, weight in enumerate(pc1):
    print(f"{feature_names[position].ljust(20)}: {100*weight:.2f}%")

In [None]:
# Print each feature's entry in the second principal direction, scaled by 100.
feature_names = df.columns.difference(exclude_cols)
print(f"{'Column Name'.ljust(20)}: PC2 Weight")
for position, weight in enumerate(pc2):
    print(f"{feature_names[position].ljust(20)}: {100*weight:.2f}%")

In [None]:
# Fit K-Means for k = 1..10 on the standardized features and record the
# inertia of every fit.
inertia = []
k_range = range(1, 11)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(standardized_data)
    inertia.append(kmeans.inertia_)

# Plot the elbow curve
elbow_fig, elbow_ax = plt.subplots(figsize=(8, 6))
elbow_ax.plot(k_range, inertia, 'bo-')
elbow_ax.set_xlabel('Number of clusters (k)')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method for Optimal k')
plt.show()

In [None]:
# Locate the elbow of the decreasing, convex inertia curve.
knee = KneeLocator(
    x=k_range,
    y=inertia,
    curve="convex",
    direction="decreasing",
)

# Elbow point
optimal_k = knee.elbow

print(f"The optimal number of clusters (k) is: {optimal_k}")

In [None]:
# Refit K-Means with the elbow-selected k and store each LEA's cluster label.
# NOTE(review): optimal_k comes from knee.elbow; presumably it can be None
# when no elbow is detected -- confirm before relying on it.
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(standardized_data)

In [None]:
enr_cols = []
# Distinct cluster labels together with the number of LEAs carrying each.
unique_clusters, cluster_sizes = np.unique(df['cluster'], return_counts=True)
print(f"{'Cluster'.ljust(10)}: LEAs in Dataset")
for cluster, count in zip(unique_clusters, cluster_sizes):
    print(f"{str(cluster).ljust(10)}: {count}")

In [None]:
def lda(X, y):
    """Compute linear-discriminant directions for labelled data.

    Builds the within-class scatter matrix ``Sw`` and the between-class
    scatter matrix ``Sb`` and solves the eigenproblem of ``pinv(Sw) @ Sb``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Class label of every row of ``X``.

    Returns
    -------
    eigval : ndarray of shape (n_features,)
        Real parts of the eigenvalues, sorted in descending order.
    eigvec : ndarray of shape (n_features, n_features)
        Real parts of the eigenvectors; column ``k`` belongs to ``eigval[k]``.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    overall_mean = X.mean(axis=0)
    n_features = X.shape[1]

    Sw = np.zeros((n_features, n_features))
    Sb = np.zeros((n_features, n_features))
    for cl in np.unique(y):
        members = X[y == cl]
        class_mean = members.mean(axis=0)
        centered = members - class_mean
        Sw += centered.T @ centered
        offset = class_mean - overall_mean
        Sb += len(members) * np.outer(offset, offset)

    # The pseudo-inverse equals the inverse when Sw is invertible, and still
    # works when features are collinear (singular Sw), where inv() would fail.
    eigval, eigvec = np.linalg.eig(np.linalg.pinv(Sw) @ Sb)
    # eig on a non-symmetric matrix can return complex output with round-off
    # sized imaginary parts; keep the real parts of both results and sort on
    # them so values and vectors stay consistent.
    eigval = np.real(eigval)
    eigvec = np.real(eigvec)
    idx = np.argsort(eigval)[::-1]
    return eigval[idx], eigvec[:, idx]

In [None]:
# Project the standardized features onto the discriminant directions found
# for the K-Means cluster labels.
X = standardized_data
y = df['cluster']
eigval, eigvec = lda(X, y)
X_lda = X @ eigvec

# A 3D scatter needs three coordinates: zero-pad on the right when the
# projection has fewer than three columns.
missing_dims = 3 - X_lda.shape[1]
if missing_dims > 0:
    X_lda = np.pad(X_lda, ((0, 0), (0, missing_dims)), mode='constant')

# Hover label per point: LEA id, state, LEA name, 5-17 population and
# 5-17 poverty share.
hover_text = [
    f"LEA ID: {i}, {state}<br>LEA Name: {lea}<br>5_17 Pop: {int(pop)}<br>5_17 Pov: {100*pov:.2f}%"
    for i, lea, state, pop, pov in zip(ids, lea_names, states, pop5_17, pov5_17)
]

# Interactive 3D scatter of the first three coordinates, coloured by cluster.
trace = go.Scatter3d(
    x=X_lda[:, 0],
    y=X_lda[:, 1],
    z=X_lda[:, 2],
    mode='markers',
    marker={"size": 5, "color": y, "opacity": 0.8},
    text=hover_text,
    hoverinfo="text+x+y+z",
)

layout = go.Layout(
    title="LDA Projection on Top 3 Discriminant Components",
    scene={
        "xaxis_title": "LDA Component 1",
        "yaxis_title": "LDA Component 2",
        "zaxis_title": "LDA Component 3",
    },
)

fig = go.Figure(data=[trace], layout=layout)

pio.show(fig)

In [None]:
# The three rows of df with the largest absolute first LDA coordinate,
# transposed so that each LEA is shown as a column.
extreme_LDA = df.iloc[np.argsort(np.abs(X_lda[:, 0]))[-3:]]
extreme_LDA.T

In [None]:
# The first two discriminant directions are the first two columns of eigvec.
eig1, eig2 = eigvec[:, 0], eigvec[:, 1]
# From here on the cluster label is no longer treated as a feature column.
exclude_cols.append('cluster')

In [None]:
# Print each feature's entry in the first linear-discriminant direction,
# scaled by 100. These are LDA weights, so the header says LD1, not PC1.
feature_names = df.columns.difference(exclude_cols)
print(f"{'Column Name'.ljust(20)}: LD1 Weight")
for position, weight in enumerate(eig1):
    print(f"{feature_names[position].ljust(20)}: {100*weight:.2f}%")

In [None]:
# Print each feature's entry in the second linear-discriminant direction,
# scaled by 100. The header names the component actually printed (LD2).
feature_names = df.columns.difference(exclude_cols)
print(f"{'Column Name'.ljust(20)}: LD2 Weight")
for position, weight in enumerate(eig2):
    print(f"{feature_names[position].ljust(20)}: {100*weight:.2f}%")

## Covariance

In [None]:
# Label the standardized feature matrix with its column names.
standardized_df = pd.DataFrame(
    standardized_data,
    columns=df.columns.difference(exclude_cols),
)
# Pearson correlation between features. Using .cov() on StandardScaler
# output is only approximately a correlation matrix: the scaler divides by
# the population standard deviation while .cov() normalises by n - 1, so the
# result is off by a factor n / (n - 1). .corr() gives the exact values.
correlation_matrix = standardized_df.corr()

In [None]:
# Heatmap of the feature-by-feature matrix computed in the previous cell,
# on a blue-white-red colour scale and without cell annotations.
corr_fig, corr_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=False,
    fmt=".2f",
    cmap="bwr",
    cbar=True,
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Covariance of the unstandardized feature columns, drawn as a heatmap.
feature_frame = df[df.columns.difference(exclude_cols)]
covariance_matrix = feature_frame.cov()

cov_fig, cov_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    covariance_matrix,
    annot=False,
    fmt=".2f",
    cmap="bwr",
    cbar=True,
    ax=cov_ax,
)
cov_ax.set_title('Covariance Matrix Heatmap')
plt.show()

## Multiple Regression

In [None]:
# Regression target, and the candidate predictors: every column that is
# neither in exclude_cols nor the target itself.
dependent_var = '5_17_poverty_percent'
independent_vars = df.columns.difference(exclude_cols + [dependent_var])

In [None]:
# Predictors removed from the model before fitting.
# NOTE(review): the name suggests these had high p-values in an earlier fit;
# the selection step itself is not shown here -- confirm.
high_p_vals = ['alg2_f_enr','enr_lep_f','calc_f_enr','total_m_enr',
               'alg1_m_0910_passed','alg1_m_1112_passed','enr_idea_f',
               'advmath_m_enr','enr_504_m','geo_f_enr','advpl_f_noexam',
               'satact_f','chem_m_enr', 'alg1_f_1112_passed']
independent_vars = independent_vars.difference(high_p_vals)
# Display the predictors that remain.
independent_vars

In [None]:
# Ordinary least squares of the dependent variable on the remaining
# predictors, with an added constant (intercept) column.
Y = df[dependent_var]
X = sm.add_constant(df[independent_vars])
model = sm.OLS(Y, X).fit()
model.summary()

In [None]:
# Variance inflation factor of every column of the design matrix X
# (the constant column included).
vif_data = pd.DataFrame({
    "Variable": X.columns,
    "VIF": [vif(X.values, column_index) for column_index in range(X.shape[1])],
})
vif_data