# Education

In [1]:
#DB CONNECTION

import getpass
from urllib.parse import quote

import psycopg2
from psycopg2 import sql
from psycopg2.extensions import adapt, register_adapter, AsIs
from sqlalchemy import create_engine,text

In [2]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize
warnings.filterwarnings("ignore", category=UserWarning)


In [None]:
# Database name and login role; the password is prompted for interactively so
# it is never stored in the notebook.
database, user = 'f24t03', 'jsmm8'
password = getpass.getpass("Type password and hit enter: ")



Type password and hit enter: ········


In [None]:
# Connect to PostgreSQL and load the whole 'education_levels' table into `df`.
try:
    conn = psycopg2.connect(database=database,
                            user=user,
                            host='pgsql',
                            password=password)
    print("I am able to connect to the database")
except Exception as e:
    print("I am unable to connect to the database:", e)
    raise

# Select every row and column of the education_levels table
query = "SELECT * FROM education_levels;"

# Read the query result into a DataFrame
try:
    df = pd.read_sql(query, conn)
    print("Data extracted successfully!")
except Exception as e:
    print("Error extracting data:", e)
    # Re-raise so the cell stops here; otherwise `df.shape` below (and every
    # later cell) would hit a NameError or silently reuse a stale `df`.
    raise
finally:
    # Close the connection whether or not the read succeeded
    conn.close()


# Show (rows, columns) of the loaded table
df.shape

I am able to connect to the database
Data extracted successfully!


(24105, 15)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,year,estimate_total,no_schooling,prek_8th,some_high_school_no_diploma,high_school_graduate,some_college_no_degree,associates_degree,bachelors_degree,advanced_degree
count,24105.0,24105.0,24105.0,24105.0,24105.0,24105.0,24105.0,24105.0,24105.0,24105.0
mean,2020.065339,864.493134,7.571417,17.970836,56.778511,264.263555,189.485086,69.586725,159.311305,99.5257
std,1.405003,467.82077,15.966596,28.783626,57.817699,167.29018,127.060137,61.655935,155.359748,121.159126
min,2018.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2019.0,542.0,0.0,0.0,16.0,148.0,100.0,26.0,52.0,22.0
50%,2020.0,765.0,0.0,8.0,43.0,240.0,161.0,54.0,110.0,56.0
75%,2021.0,1082.0,10.0,24.0,79.0,351.0,250.0,95.0,218.0,132.0
max,2022.0,7112.0,267.0,503.0,918.0,2014.0,1513.0,833.0,2305.0,1364.0


In [6]:
# Column names, dtypes, and non-null counts of the raw table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24105 entries, 0 to 24104
Data columns (total 15 columns):
geo_id                         24105 non-null object
block_group                    24105 non-null object
census_tract                   24105 non-null object
county                         24105 non-null object
state                          24105 non-null object
year                           24105 non-null int64
estimate_total                 24105 non-null int64
no_schooling                   24105 non-null int64
prek_8th                       24105 non-null int64
some_high_school_no_diploma    24105 non-null int64
high_school_graduate           24105 non-null int64
some_college_no_degree         24105 non-null int64
associates_degree              24105 non-null int64
bachelors_degree               24105 non-null int64
advanced_degree                24105 non-null int64
dtypes: int64(10), object(5)
memory usage: 2.8+ MB


In [7]:
# Columns that the education metrics computed below do not use; geo_id and year
# stay as the row identifier.
columns_to_drop = [
    'block_group',
    'census_tract',
    'county',
    'state',
    'estimate_total'
]

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [8]:
# Re-inspect columns and dtypes to confirm the drop took effect
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24105 entries, 0 to 24104
Data columns (total 10 columns):
geo_id                         24105 non-null object
year                           24105 non-null int64
no_schooling                   24105 non-null int64
prek_8th                       24105 non-null int64
some_high_school_no_diploma    24105 non-null int64
high_school_graduate           24105 non-null int64
some_college_no_degree         24105 non-null int64
associates_degree              24105 non-null int64
bachelors_degree               24105 non-null int64
advanced_degree                24105 non-null int64
dtypes: int64(9), object(1)
memory usage: 1.8+ MB


In [9]:
# Approach: rank-based Gini. Sort the values ascending, weight each one by its
# 1-based rank, and plug the rank-weighted sum into the closed-form expression
# below to obtain a single inequality measure.

def gini_coefficient(values):
    """Return the Gini coefficient of `values`, or None when they sum to zero."""
    data = np.array(values)
    if np.sum(data) == 0:
        # Undefined when there is nothing to distribute
        return None
    ordered = np.sort(data)
    count = len(data)
    total = np.cumsum(ordered)[-1]
    ranks = np.arange(1, count + 1)
    numerator = 2 * np.sum(ranks * ordered) - (count + 1) * total
    return numerator / (count * total)


# Count columns, one per attainment level, as named in the source table
education_columns = [
    'no_schooling',
    'prek_8th',
    'some_high_school_no_diploma',
    'high_school_graduate',
    'some_college_no_degree',
    'associates_degree',
    'bachelors_degree',
    'advanced_degree',
]

# One Gini value per row, computed across that row's eight level counts
df['gini_education'] = df.loc[:, education_columns].apply(gini_coefficient, axis=1)


In [10]:
# Distribution of the per-row Gini values (rows where the Gini is missing are not counted)
df['gini_education'].describe()

count    24034.000000
mean         0.529545
std          0.081300
min          0.178275
25%          0.474883
50%          0.525110
75%          0.581108
max          0.875000
Name: gini_education, dtype: float64

In [11]:
# Ordinal weights for the attainment levels: 1 for the lowest level up to 8 for
# the highest, so a larger score means a more highly educated population.
weights = dict(
    no_schooling=1,
    prek_8th=2,
    some_high_school_no_diploma=3,
    high_school_graduate=4,
    some_college_no_degree=5,
    associates_degree=6,
    bachelors_degree=7,
    advanced_degree=8,
)

def weighted_average_education(row):
    """Return the count-weighted mean attainment level of one row.

    `row` maps level names (keys of `weights`) to counts. Returns None when
    the counts sum to zero, i.e. there is no population to average over.
    """
    population = row.sum()
    if population == 0:
        return None
    score_total = 0
    for level in row.index:
        score_total += row[level] * weights[level]
    return score_total / population

# Score every row, then keep two decimal places
df['weighted_education_score'] = (
    df[education_columns]
    .apply(weighted_average_education, axis=1)
    .round(2)
)


In [12]:
# Share of each row's counted population holding a bachelor's or advanced
# degree, used to identify highly educated block groups.
def proportion_higher_education(row):
    """Return (bachelors + advanced) / total for one row, or None if total is zero."""
    degree_holders = row.loc[['bachelors_degree', 'advanced_degree']].sum()
    population = row.sum()
    return None if population == 0 else degree_holders / population
# Compute the proportion for every row and round it to two decimals
df['higher_education_proportion'] = (
    df[education_columns]
    .apply(proportion_higher_education, axis=1)
    .round(2)
)

In [13]:


# Discretize weighted_education_score into 5 equal-width bins with ordered labels
df['education_score_category'] = pd.cut(
    df['weighted_education_score'],
    bins=5,
    labels=['Very Low', 'Low', 'Medium', 'High', 'Very High']
)

# One panel per year, in chronological order
unique_years = sorted(df['year'].unique())
num_years = len(unique_years)

# squeeze=False keeps `axes` indexable even when there is only a single year
fig, axes = plt.subplots(num_years, 1, figsize=(10, 6 * num_years),
                         sharex=True, squeeze=False)
axes = axes[:, 0]

for ax, year in zip(axes, unique_years):
    # Filter once per year instead of once per use
    year_df = df[df['year'] == year]

    # Median values for the current year
    median_gini = year_df['gini_education'].median()
    median_higher_education = year_df['higher_education_proportion'].median()

    # Scatter plot for the current year, coloured by education score category
    sns.scatterplot(
        data=year_df,
        x='higher_education_proportion',
        y='gini_education',
        hue='education_score_category',
        palette='tab20',
        ax=ax,
        alpha=0.5
    )

    # Pin the origin at (0, 0) BEFORE anything is positioned from the axis
    # limits: axhspan's xmin/xmax are axes fractions, so computing them against
    # the pre-adjustment limits would shift the shading off the median line.
    ax.set_xlim(left=0)
    ax.set_ylim(bottom=0)
    x_right = ax.get_xlim()[1]
    y_bottom = ax.get_ylim()[0]

    # Median lines
    ax.axhline(y=median_gini, color='red', linestyle='--', linewidth=1.5, label=f'Median Gini = {median_gini:.2f}')
    ax.axvline(x=median_higher_education, color='blue', linestyle='--', linewidth=1.5, label=f'Median Higher Ed = {median_higher_education:.2f}')

    # Annotate the median values near the bottom-right corner
    ax.text(x_right - 0.15, y_bottom + 0.15,
            f'Median Gini: {median_gini:.2f}',
            color='red', fontsize=10,
            verticalalignment='bottom', horizontalalignment='right')

    ax.text(x_right - 0.12, y_bottom + 0.12,
            f'Median Higher Ed: {median_higher_education:.2f}',
            color='blue', fontsize=10,
            verticalalignment='bottom', horizontalalignment='right')

    # Shade the quadrant where Gini < median and Higher Education > median.
    # With the left limit at 0, median / x_right is the median's axes fraction.
    ax.axhspan(0, median_gini, xmin=median_higher_education / x_right,
               xmax=1, color='green', alpha=0.3)

    # Title and labels
    ax.set_title(f'Gini Index vs Higher Education Proportion ({year})')
    ax.set_ylabel('Gini Index')
    ax.grid(False)

# Only the bottom panel carries the shared x-label
axes[-1].set_xlabel('Higher Education Proportion')

plt.tight_layout()
plt.show()


<Figure size 1000x3000 with 5 Axes>

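* The median Gini index is the same from 2020 to 2022, but the median higher-education proportion increases slightly.
* The green shaded areas mark block groups with a below-median Gini index and an above-median higher-education proportion; this is where we find high education equality.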

In [14]:
# Preview the first rows, now including the derived metric columns
df.head()

Unnamed: 0,geo_id,year,no_schooling,prek_8th,some_high_school_no_diploma,high_school_graduate,some_college_no_degree,associates_degree,bachelors_degree,advanced_degree,gini_education,weighted_education_score,higher_education_proportion,education_score_category
0,1500000US290019501001,2018,0,71,158,363,145,48,173,88,0.416348,4.78,0.25,Medium
1,1500000US290019501002,2018,0,58,35,268,131,57,135,162,0.416371,5.36,0.35,Medium
2,1500000US290019501003,2018,4,27,68,207,146,56,50,15,0.481021,4.58,0.11,Low
3,1500000US290019502001,2018,22,21,69,270,95,37,31,17,0.508897,4.27,0.09,Low
4,1500000US290019502002,2018,3,0,27,287,156,12,50,57,0.619088,4.88,0.18,Medium


In [15]:


# Extract the per-level counts
edu_subset_data = df[education_columns]

# Scale each row to unit L2 norm. The original index is reused so the join
# below aligns rows by label even if `df` no longer has a default RangeIndex.
normalized_edu_data = pd.DataFrame(
    normalize(edu_subset_data, norm='l2', axis=1),
    columns=education_columns,
    index=df.index,
)

normalized_edu_data = normalized_edu_data.round(2)

# Replace the raw count columns with their normalized versions
df_normalized = df.drop(columns=education_columns).join(normalized_edu_data)

df_normalized['gini_education'] = df_normalized['gini_education'].round(2)

# Preview the normalized frame
df_normalized.head()


Unnamed: 0,geo_id,year,gini_education,weighted_education_score,higher_education_proportion,education_score_category,no_schooling,prek_8th,some_high_school_no_diploma,high_school_graduate,some_college_no_degree,associates_degree,bachelors_degree,advanced_degree
0,1500000US290019501001,2018,0.42,4.78,0.25,Medium,0.0,0.15,0.33,0.77,0.31,0.1,0.37,0.19
1,1500000US290019501002,2018,0.42,5.36,0.35,Medium,0.0,0.15,0.09,0.71,0.35,0.15,0.36,0.43
2,1500000US290019501003,2018,0.48,4.58,0.11,Low,0.01,0.1,0.25,0.75,0.53,0.2,0.18,0.05
3,1500000US290019502001,2018,0.51,4.27,0.09,Low,0.07,0.07,0.23,0.9,0.32,0.12,0.1,0.06
4,1500000US290019502002,2018,0.62,4.88,0.18,Medium,0.01,0.0,0.08,0.85,0.46,0.04,0.15,0.17


In [16]:
# Count missing values per column
df_normalized.isnull().sum()

geo_id                          0
year                            0
gini_education                 71
weighted_education_score       71
higher_education_proportion    71
education_score_category       71
no_schooling                    0
prek_8th                        0
some_high_school_no_diploma     0
high_school_graduate            0
some_college_no_degree          0
associates_degree               0
bachelors_degree                0
advanced_degree                 0
dtype: int64

In [17]:
# Rows with no population have no metrics; mark them with the sentinel -1.
# Assigning the result back (rather than fillna(inplace=True) on a column
# selection) avoids chained-assignment pitfalls.
metric_columns = ['gini_education', 'weighted_education_score', 'higher_education_proportion']
df_normalized[metric_columns] = df_normalized[metric_columns].fillna(-1)

# Label the missing categories 'None' BEFORE converting to str: astype(str)
# turns NaN into the literal text 'nan', after which fillna() has nothing left
# to fill and the 'None' label would never be applied.
category = df_normalized['education_score_category'].astype(object)
df_normalized['education_score_category'] = category.where(category.notna(), 'None').astype(str)



In [18]:
# Re-count missing values per column to confirm none remain
df_normalized.isnull().sum()

geo_id                         0
year                           0
gini_education                 0
weighted_education_score       0
higher_education_proportion    0
education_score_category       0
no_schooling                   0
prek_8th                       0
some_high_school_no_diploma    0
high_school_graduate           0
some_college_no_degree         0
associates_degree              0
bachelors_degree               0
advanced_degree                0
dtype: int64

In [19]:
# Column dtypes and non-null counts of the processed frame
df_normalized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24105 entries, 0 to 24104
Data columns (total 14 columns):
geo_id                         24105 non-null object
year                           24105 non-null int64
gini_education                 24105 non-null float64
weighted_education_score       24105 non-null float64
higher_education_proportion    24105 non-null float64
education_score_category       24105 non-null object
no_schooling                   24105 non-null float64
prek_8th                       24105 non-null float64
some_high_school_no_diploma    24105 non-null float64
high_school_graduate           24105 non-null float64
some_college_no_degree         24105 non-null float64
associates_degree              24105 non-null float64
bachelors_degree               24105 non-null float64
advanced_degree                24105 non-null float64
dtypes: float64(11), int64(1), object(2)
memory usage: 2.6+ MB


In [None]:
#Create Table education_data_processed

mypasswd = getpass.getpass("Enter your database password: ")

# Database connection details
database = "f24t03"
user = "jsmm8"
password = mypasswd
host = "pgsql"

# SQLAlchemy engine used by DataFrame.to_sql for the inserts. The password is
# percent-encoded so characters such as '@', '/' or ':' cannot corrupt the URL.
encoded_password = quote(password, safe='')
engine = create_engine(f'postgresql+psycopg2://{user}:{encoded_password}@{host}/{database}')

# Pre-bind so the finally block is safe even when connect() itself fails
conn = None
cursor = None

# Connect and create/drop the table, then insert data in chunks
try:
    # Connect to the database
    conn = psycopg2.connect(database=database, user=user, host=host, password=password)
    cursor = conn.cursor()
    print("Connected to the database.")

    # Drop the education_data_processed table if it exists
    cursor.execute("DROP TABLE IF EXISTS education_data_processed;")
    print("Dropped the education_data_processed table if it existed.")

    # Create the education_data_processed table
    create_table_query = """
CREATE TABLE education_data_processed (
    geo_id TEXT NOT NULL,
    year INT NOT NULL,
    gini_education FLOAT8 NOT NULL,
    weighted_education_score FLOAT8 NOT NULL,
    higher_education_proportion FLOAT8 NOT NULL,
    education_score_category TEXT NOT NULL,
    no_schooling FLOAT8 NOT NULL,
    prek_8th FLOAT8 NOT NULL,
    some_high_school_no_diploma FLOAT8 NOT NULL,
    high_school_graduate FLOAT8 NOT NULL,
    some_college_no_degree FLOAT8 NOT NULL,
    associates_degree FLOAT8 NOT NULL,
    bachelors_degree FLOAT8 NOT NULL,
    advanced_degree FLOAT8 NOT NULL,
    PRIMARY KEY (geo_id, year)
);
    """
    cursor.execute(create_table_query)
    # Commit the DROP + CREATE so the engine's separate connections see the table
    conn.commit()
    print("Created the education_data_processed table.")

    # Insert data from df_normalized in chunks
    chunk_size = 500
    for start in range(0, len(df_normalized), chunk_size):
        chunk = df_normalized.iloc[start:start + chunk_size]
        chunk.to_sql('education_data_processed', engine, if_exists='append', index=False)
        print(f"Uploaded a chunk of {len(chunk)} records to the education_data_processed table.")

    print("Data uploaded successfully.")

    # Query the inserted data to confirm
    query = "SELECT * FROM education_data_processed;"
    df_normalized = pd.read_sql(query, conn)
    print("Data extracted successfully! DataFrame shape:", df_normalized.shape)

except Exception as e:
    print("An error occurred:", e)
    # Re-raise: later cells depend on this table, so a failed upload must stop
    # the notebook instead of being reduced to a printed message.
    raise

finally:
    # Close whatever was actually opened
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()


Enter your database password: ········
Connected to the database.
Dropped the education_data_processed table if it existed.
Created the education_data_processed table.
Uploaded a chunk of 500 records to the education_data_processed table.
Uploaded a chunk of 500 records to the education_data_processed table.
Uploaded a chunk of 500 records to the education_data_processed table.
Uploaded a chunk of 500 records to the education_data_processed table.
Uploaded a chunk of 500 records to the education_data_processed table.
Uploaded a chunk of 500 records to the education_data_processed table.
Uploaded a chunk of 500 records to the education_data_processed table.
Uploaded a chunk of 500 records to the education_data_processed table.
Uploaded a chunk of 500 records to the education_data_processed table.
Uploaded a chunk of 500 records to the education_data_processed table.
Uploaded a chunk of 500 records to the education_data_processed table.
Uploaded a chunk of 500 records to the education_da

In [21]:
# Preview the first rows of df_normalized to validate the processed data
df_normalized.head()

Unnamed: 0,geo_id,year,gini_education,weighted_education_score,higher_education_proportion,education_score_category,no_schooling,prek_8th,some_high_school_no_diploma,high_school_graduate,some_college_no_degree,associates_degree,bachelors_degree,advanced_degree
0,1500000US290019501001,2018,0.42,4.78,0.25,Medium,0.0,0.15,0.33,0.77,0.31,0.1,0.37,0.19
1,1500000US290019501002,2018,0.42,5.36,0.35,Medium,0.0,0.15,0.09,0.71,0.35,0.15,0.36,0.43
2,1500000US290019501003,2018,0.48,4.58,0.11,Low,0.01,0.1,0.25,0.75,0.53,0.2,0.18,0.05
3,1500000US290019502001,2018,0.51,4.27,0.09,Low,0.07,0.07,0.23,0.9,0.32,0.12,0.1,0.06
4,1500000US290019502002,2018,0.62,4.88,0.18,Medium,0.01,0.0,0.08,0.85,0.46,0.04,0.15,0.17


In [22]:

# Give the listed roles full access to the new table
grant_privileges_query = "GRANT ALL PRIVILEGES ON TABLE education_data_processed TO jsmm8, remcmf, sgdky;"

# engine.begin() commits when the block exits cleanly. A plain engine.connect()
# relies on implicit autocommit, which SQLAlchemy 2.0 removed, so the GRANT
# would be rolled back there.
with engine.begin() as connection:
    connection.execute(text(grant_privileges_query))
    print("Privileges granted successfully.")

Privileges granted successfully.


**Create a Demographics Master table that combines the Race, Income, and Education attributes we created earlier.**

In [None]:
# Connect and pull the joined race / education / income features into `df`
try:
    conn = psycopg2.connect(database=database,
                            user=user,
                            host='pgsql',
                            password=password)
    print("I am able to connect to the database")
except Exception as e:
    print("I am unable to connect to the database:", e)
    raise

# Join the three processed tables on (geo_id, year), restricted to 2020-2022
query = "SELECT  r.geo_id, r.year, r.total_population, r.shannon_index, r.simpsons_index, r.richness, r.simpson_dominance, r.berger_parker_dominance, r.simpson_evenness, r.block_group, r.census_tract, r.county, r.state, r.group_classification, r.hispanic_or_latino, r.asian, r.black, r.native, r.other, r.pac_isl, r.two_or_more_races, r.white,  e.gini_education, e.weighted_education_score, e.higher_education_proportion, e.education_score_category, e.no_schooling, e.prek_8th, e.some_high_school_no_diploma, e.high_school_graduate, e.some_college_no_degree, e.associates_degree, e.bachelors_degree, e.advanced_degree,  i.median_income, i.gini_coefficient, i.est_less_than_10k, i.est_10to15k, i.est_15to20k, i.est_20to25k, i.est_25to30k, i.est_30to35k, i.est_35to40k, i.est_40to45k, i.est_45to50k, i.est_50to60k, i.est_60to75k, i.est_75to100k, i.est_100to125k, i.est_125to150k, i.est_150to200k, i.est_more_than_200k FROM race_data_processed r JOIN education_data_processed e ON r.geo_id = e.geo_id AND r.year = e.year JOIN income_data_processed i ON r.geo_id = i.geo_id AND r.year = i.year WHERE r.year IN (2020, 2021, 2022) AND e.year IN (2020, 2021, 2022) AND i.year IN (2020, 2021, 2022);"

# Read the query result into a DataFrame
try:
    df = pd.read_sql(query, conn)
    print("Data extracted successfully!")
except Exception as e:
    print("Error extracting data:", e)
    # Re-raise so the cell stops here; otherwise `df` would still hold the
    # education-only frame from earlier cells and be uploaded as the master table.
    raise
finally:
    # Close the connection whether or not the read succeeded
    conn.close()


# Show (rows, columns) of the joined frame
df.shape

I am able to connect to the database
Data extracted successfully!


(15093, 52)

In [24]:
# Preview the first rows of the joined race/education/income frame
df.head()

Unnamed: 0,geo_id,year,total_population,shannon_index,simpsons_index,richness,simpson_dominance,berger_parker_dominance,simpson_evenness,block_group,...,est_35to40k,est_40to45k,est_45to50k,est_50to60k,est_60to75k,est_75to100k,est_100to125k,est_125to150k,est_150to200k,est_more_than_200k
0,1500000US290019501001,2020,1284,0.058,0.031,0.5,0.969,0.984,0.041,Block Group 1,...,0.14,0.19,0.11,0.7,0.24,0.39,0.31,0.14,0.05,0.03
1,1500000US290019501002,2020,1403,0.093,0.058,0.375,0.942,0.97,0.088,Block Group 2,...,0.04,0.16,0.09,0.25,0.46,0.4,0.26,0.12,0.24,0.02
2,1500000US290019501003,2020,1007,0.284,0.266,0.625,0.734,0.843,0.333,Block Group 3,...,0.1,0.09,0.31,0.43,0.38,0.38,0.09,0.16,0.15,0.09
3,1500000US290019502001,2020,891,0.201,0.14,0.5,0.86,0.926,0.187,Block Group 1,...,0.2,0.07,0.12,0.13,0.23,0.55,0.24,0.0,0.0,0.03
4,1500000US290019502002,2020,991,0.215,0.166,0.5,0.834,0.91,0.221,Block Group 2,...,0.14,0.18,0.04,0.41,0.35,0.38,0.25,0.17,0.33,0.11


In [25]:
# Column dtypes and non-null counts of the joined frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15093 entries, 0 to 15092
Data columns (total 52 columns):
geo_id                         15093 non-null object
year                           15093 non-null int64
total_population               15093 non-null int64
shannon_index                  15093 non-null float64
simpsons_index                 15093 non-null float64
richness                       15093 non-null float64
simpson_dominance              15093 non-null float64
berger_parker_dominance        15093 non-null float64
simpson_evenness               15093 non-null float64
block_group                    15093 non-null object
census_tract                   15093 non-null object
county                         15093 non-null object
state                          15093 non-null object
group_classification           15093 non-null object
hispanic_or_latino             15093 non-null float64
asian                          15093 non-null float64
black                          15093 

In [None]:
host='pgsql'
# SQLAlchemy engine used by DataFrame.to_sql for the inserts. The password is
# percent-encoded so characters such as '@', '/' or ':' cannot corrupt the URL.
encoded_password = quote(password, safe='')
engine = create_engine(f'postgresql+psycopg2://{user}:{encoded_password}@{host}/{database}')

# Pre-bind so the finally block is safe even when connect() itself fails
conn = None
cursor = None

# Connect and create/drop the table, then insert data in chunks
try:
    # Connect to the database
    conn = psycopg2.connect(database=database, user=user, host=host, password=password)
    cursor = conn.cursor()
    print("Connected to the database.")

    # Drop the demographics_master table if it exists
    cursor.execute("DROP TABLE IF EXISTS demographics_master;")
    print("Dropped the demographics_master table if it existed.")

    # Create the demographics_master table
    create_table_query = """
CREATE TABLE demographics_master (
    geo_id TEXT NOT NULL,
    year INT NOT NULL,
    total_population INT NOT NULL,
    shannon_index FLOAT8 NOT NULL,
    simpsons_index FLOAT8 NOT NULL,
    richness FLOAT8 NOT NULL,
    simpson_dominance FLOAT8 NOT NULL,
    berger_parker_dominance FLOAT8 NOT NULL,
    simpson_evenness FLOAT8 NOT NULL,
    block_group TEXT NOT NULL,
    census_tract TEXT NOT NULL,
    county TEXT NOT NULL,
    state TEXT NOT NULL,
    group_classification TEXT NOT NULL,
    hispanic_or_latino FLOAT8 NOT NULL,
    asian FLOAT8 NOT NULL,
    black FLOAT8 NOT NULL,
    native FLOAT8 NOT NULL,
    other FLOAT8 NOT NULL,
    pac_isl FLOAT8 NOT NULL,
    two_or_more_races FLOAT8 NOT NULL,
    white FLOAT8 NOT NULL,
    gini_education FLOAT8 NOT NULL,
    weighted_education_score FLOAT8 NOT NULL,
    higher_education_proportion FLOAT8 NOT NULL,
    education_score_category TEXT NOT NULL,
    no_schooling FLOAT8 NOT NULL,
    prek_8th FLOAT8 NOT NULL,
    some_high_school_no_diploma FLOAT8 NOT NULL,
    high_school_graduate FLOAT8 NOT NULL,
    some_college_no_degree FLOAT8 NOT NULL,
    associates_degree FLOAT8 NOT NULL,
    bachelors_degree FLOAT8 NOT NULL,
    advanced_degree FLOAT8 NOT NULL,
    median_income FLOAT8 NOT NULL,
    gini_coefficient FLOAT8 NOT NULL,
    est_less_than_10k FLOAT8 NOT NULL,
    est_10to15k FLOAT8 NOT NULL,
    est_15to20k FLOAT8 NOT NULL,
    est_20to25k FLOAT8 NOT NULL,
    est_25to30k FLOAT8 NOT NULL,
    est_30to35k FLOAT8 NOT NULL,
    est_35to40k FLOAT8 NOT NULL,
    est_40to45k FLOAT8 NOT NULL,
    est_45to50k FLOAT8 NOT NULL,
    est_50to60k FLOAT8 NOT NULL,
    est_60to75k FLOAT8 NOT NULL,
    est_75to100k FLOAT8 NOT NULL,
    est_100to125k FLOAT8 NOT NULL,
    est_125to150k FLOAT8 NOT NULL,
    est_150to200k FLOAT8 NOT NULL,
    est_more_than_200k FLOAT8 NOT NULL,
    PRIMARY KEY (geo_id, year)
);
    """
    cursor.execute(create_table_query)
    # Commit the DROP + CREATE so the engine's separate connections see the table
    conn.commit()
    print("Created the demographics_master table.")

    # Insert data from df in chunks
    chunk_size = 500
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size]
        chunk.to_sql('demographics_master', engine, if_exists='append', index=False)
        print(f"Uploaded a chunk of {len(chunk)} records to the demographics_master table.")

    print("Data uploaded successfully.")

    # Query the inserted data to confirm
    query = "SELECT * FROM demographics_master;"
    df = pd.read_sql(query, conn)
    print("Data extracted successfully! DataFrame shape:", df.shape)

except Exception as e:
    print("An error occurred:", e)
    # Re-raise: the GRANT cell that follows needs this table, so a failed
    # upload must stop the notebook instead of being reduced to a printed message.
    raise

finally:
    # Close whatever was actually opened
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()


Connected to the database.
Dropped the demographics_master table if it existed.
Created the demographics_master table.
Uploaded a chunk of 500 records to the demographics_master table.
Uploaded a chunk of 500 records to the demographics_master table.
Uploaded a chunk of 500 records to the demographics_master table.
Uploaded a chunk of 500 records to the demographics_master table.
Uploaded a chunk of 500 records to the demographics_master table.
Uploaded a chunk of 500 records to the demographics_master table.
Uploaded a chunk of 500 records to the demographics_master table.
Uploaded a chunk of 500 records to the demographics_master table.
Uploaded a chunk of 500 records to the demographics_master table.
Uploaded a chunk of 500 records to the demographics_master table.
Uploaded a chunk of 500 records to the demographics_master table.
Uploaded a chunk of 500 records to the demographics_master table.
Uploaded a chunk of 500 records to the demographics_master table.
Uploaded a chunk of 500

In [27]:
        
# Give the listed roles full access to the new table
grant_privileges_query = "GRANT ALL PRIVILEGES ON TABLE demographics_master TO jsmm8, remcmf, sgdky;"

# engine.begin() commits when the block exits cleanly. A plain engine.connect()
# relies on implicit autocommit, which SQLAlchemy 2.0 removed, so the GRANT
# would be rolled back there.
with engine.begin() as connection:
    connection.execute(text(grant_privileges_query))
    print("Privileges granted successfully.")

Privileges granted successfully.
