In [28]:
import numpy as np
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import warnings

In [29]:
def filter_string(var):
    var = var.replace(" ", "_")
    var = var.lower()
    return var

def sql_to_dataframe(db_tup):
    return pd.DataFrame(data=db_tup[1], columns=db_tup[0])

def create_national_table(df):
    with sqlite3.connect("safe_to_delete.db") as conn:
        df.to_sql("national_data", con = conn, if_exists= "replace", index= False)

def create_state_table(df):
    with sqlite3.connect("safe_to_delete.db") as conn:
        df.to_sql("state_data", con = conn, if_exists= "replace", index= False)

def run_query(q):
    with sqlite3.connect("safe_to_delete.db") as conn:# create connection
        cur = conn.cursor() # create cursor object

        cur.execute(q) # create execution

        results = cur.fetchall()
        colnames = [n[0] for n in cur.description]

    return (colnames, results)

def run_commit_query(q):
    with sqlite3.connect("safe_to_delete.db") as conn:# create connection
        cur = conn.cursor() # create cursor object
        cur.execute(q) # create execution
        conn.commit()

def clean_df(df):
    def clean_column(var):
        col = df[var]
        col = col.astype(str)
        col = col.str.replace("$", "")
        col = col.str.replace("(", "")
        col = col.str.replace(")", "")
        col = col.str.replace(",", "")
        return col

    df.columns = [filter_string(word) for word in df.columns] # replaces spaces with underscores and makes every letter lowercase
    df = df.loc[:, ['occupation_title_(click_on_the_occupation_title_to_view_its_profile)', 'employment', 'mean_hourly_wage', 'annual_mean_wage', ]] # constrain the df to these columns
    df.columns = ['occupation_title', 'employment', 'mean_hourly_wage', 'annual_mean_wage', ]
    df = df.drop(df.index[0]) # remove the first row

    df = df.reset_index(drop=True)

    df["mean_hourly_wage"] = clean_column("mean_hourly_wage").astype(float)
    df["annual_mean_wage"] = clean_column("annual_mean_wage").astype(float)
    df["employment"] = clean_column("employment").astype(float)

    return df

def make_plot(df, title, xlabel, ylabel):
    df.plot.scatter(x=df.columns[0], y=df.columns[1])
    plt.title(title)
    plt.xlabel(xlabel, fontsize=8)
    plt.ylabel(ylabel, fontsize=8)
    plt.xticks(rotation=45,  ha="right", fontsize=8)
    plt.subplots_adjust(bottom=0.5)

In [30]:
# create and clean national and state data
national_df = pd.read_html("may_2023_national_occupational_employment_and_wage_estimates_(4_16_2024_8_54_09_am).html")[0]
state_df = pd.read_html("arizona_may_2023_oews_state_occupational_employment_and_wage_estimates_(4_19_2024_12_21_13_pm).html")[0]
national_df = clean_df(national_df)
state_df = clean_df(state_df)

# create national and state tables within an sqlite database
create_national_table(national_df)
create_state_table(state_df)

# delete specific rows in the occupation_title containing the Characters "Occupations"
run_commit_query("DELETE FROM national_data WHERE occupation_title LIKE '%Occupations%'")
run_commit_query("DELETE FROM state_data WHERE occupation_title LIKE '%Occupations%'")

# assign a dataframe from sql constrainted with the follow query
# returns distinct rows where mean_hourly_wage is higher than 10 USD, ordered from Highest employed -> Lowest employed.

#national_df = sql_to_dataframe(run_query("SELECT DISTINCT occupation_title, employment FROM national_data WHERE mean_hourly_wage > 10 ORDER BY employment DESC;"))


national_df = sql_to_dataframe(run_query("SELECT occupation_title, SUM(employment) as total_employment FROM national_data WHERE mean_hourly_wage > 10 GROUP BY occupation_title ORDER BY total_employment DESC;"))


state_df = sql_to_dataframe(run_query("SELECT DISTINCT occupation_title, employment FROM state_data WHERE mean_hourly_wage > 10 ORDER BY employment DESC LIMIT 20;"))

  col = col.str.replace("$", "")
  col = col.str.replace("(", "")
  col = col.str.replace(")", "")
  col = col.str.replace("$", "")
  col = col.str.replace("(", "")
  col = col.str.replace(")", "")


In [31]:
def create_shared_df(national_df, state_df):
    combined_df = pd.DataFrame()
    combined_df["occupation_title"] = national_df["occupation_title"]
    combined_df["national_employment"] = national_df["total_employment"]
    combined_df["state_employment"] = national_df["total_employment"]
    combined_df = combined_df.reset_index(drop=True)
    return combined_df

In [32]:
national_shared_occupations = []
for job in state_df["occupation_title"].values:
    if job in national_df["occupation_title"].values:
        national_shared_occupations.append(job)
national_shared_occupations = pd.Series(national_shared_occupations)

In [33]:
national_df = national_df[national_df["occupation_title"].isin(national_shared_occupations)]
national_df = national_df.reset_index(drop=True)
national_df = national_df.sort_values(by="occupation_title")

In [34]:
national_df

Unnamed: 0,occupation_title,total_employment
17,"Bookkeeping, Accounting, and Auditing Clerks",1501910.0
0,Cashiers,6617870.0
19,Construction Laborers,1019090.0
18,"Cooks, Restaurant",1412350.0
8,Customer Service Representatives,2858710.0
3,Fast Food and Counter Workers,3676580.0
15,First-Line Supervisors of Office and Administr...,1504570.0
4,General and Operations Managers,3507810.0
12,Heavy and Tractor-Trailer Truck Drivers,2044400.0
1,Home Health and Personal Care Aides,3689350.0


In [35]:
state_df = state_df.sort_values(by="occupation_title")

In [36]:
state_df

Unnamed: 0,occupation_title,employment
19,"Bookkeeping, Accounting, and Auditing Clerks",26540.0
4,Cashiers,71190.0
17,Construction Laborers,28780.0
18,"Cooks, Restaurant",28400.0
1,Customer Service Representatives,95930.0
0,Fast Food and Counter Workers,102570.0
16,First-Line Supervisors of Office and Administr...,31650.0
2,General and Operations Managers,94010.0
12,Heavy and Tractor-Trailer Truck Drivers,37210.0
6,Home Health and Personal Care Aides,64690.0


In [37]:
state_df["occupation_title"].values == national_df["occupation_title"].values

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [38]:
combined_df = pd.DataFrame()

In [39]:
combined_df["occupation_title"] = national_df["occupation_title"]
combined_df["national_employment"] = national_df["total_employment"]
combined_df["state_employment"] = national_df["total_employment"]

In [40]:
national_df

Unnamed: 0,occupation_title,total_employment
17,"Bookkeeping, Accounting, and Auditing Clerks",1501910.0
0,Cashiers,6617870.0
19,Construction Laborers,1019090.0
18,"Cooks, Restaurant",1412350.0
8,Customer Service Representatives,2858710.0
3,Fast Food and Counter Workers,3676580.0
15,First-Line Supervisors of Office and Administr...,1504570.0
4,General and Operations Managers,3507810.0
12,Heavy and Tractor-Trailer Truck Drivers,2044400.0
1,Home Health and Personal Care Aides,3689350.0


In [41]:
combined_df

Unnamed: 0,occupation_title,national_employment,state_employment
17,"Bookkeeping, Accounting, and Auditing Clerks",1501910.0,1501910.0
0,Cashiers,6617870.0,6617870.0
19,Construction Laborers,1019090.0,1019090.0
18,"Cooks, Restaurant",1412350.0,1412350.0
8,Customer Service Representatives,2858710.0,2858710.0
3,Fast Food and Counter Workers,3676580.0,3676580.0
15,First-Line Supervisors of Office and Administr...,1504570.0,1504570.0
4,General and Operations Managers,3507810.0,3507810.0
12,Heavy and Tractor-Trailer Truck Drivers,2044400.0,2044400.0
1,Home Health and Personal Care Aides,3689350.0,3689350.0


In [42]:
combined_df["occupation_title"]

17         Bookkeeping, Accounting, and Auditing Clerks
0                                              Cashiers
19                                Construction Laborers
18                                    Cooks, Restaurant
8                      Customer Service Representatives
3                         Fast Food and Counter Workers
15    First-Line Supervisors of Office and Administr...
4                       General and Operations Managers
12              Heavy and Tractor-Trailer Truck Drivers
1                   Home Health and Personal Care Aides
11    Janitors and Cleaners, Except Maids and Housek...
6     Laborers and Freight, Stock, and Material Move...
16              Maintenance and Repair Workers, General
9                                Office Clerks, General
5                                     Registered Nurses
2                                   Retail Salespersons
13    Secretaries and Administrative Assistants, Exc...
14                                  Software Dev