## Import Libraries

In [25]:

import itertools
import json
import os
from pprint import pprint

import numpy as np
import pandas as pd
import requests
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sqlalchemy.orm import Session


## Connect to PostgreSQL DB

In [96]:

# Connection settings come from the environment so real credentials never have
# to be saved inside the notebook. The fallbacks are the original
# local-development values, so an unconfigured machine behaves as before.
pg_user = os.environ.get('PG_USER', 'postgres')
pg_password = os.environ.get('PG_PASSWORD', 'password')
db_name = os.environ.get('PG_DB_NAME', 'Enterprises')

# NOTE: the values are interpolated verbatim into the URL, so a password that
# contains URL-reserved characters (e.g. '@' or '/') must be percent-encoded.
connection_string = f"{pg_user}:{pg_password}@localhost:5432/{db_name}"
engine = create_engine(f'postgresql://{connection_string}')



#### Get table Names

In [97]:

# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API lists the tables on both old and new versions.
inspect(engine).get_table_names()


['NAICS', 'BUSINESSES', 'STATES']

#### Create dataframes from SQL Query

In [262]:

# Load each source table in full into its own DataFrame. The table names are
# double-quoted in the SQL because they are upper-case identifiers.
bus_df, naics_df, states_df = (
    pd.read_sql_query(f'select * from "{table}"', con=engine)
    for table in ('BUSINESSES', 'NAICS', 'STATES')
)


#### Refine Business Dataframe to only rows with employment values (where 'Employment Range Flag' is null)

In [263]:

# Keep only the rows whose 'EMPLOYMENT RANGE FLAG' is null, then take an
# independent copy so later column assignments do not touch bus_df.
bus_df1 = bus_df.loc[bus_df['EMPLOYMENT RANGE FLAG'].isnull()]
bus_df2 = bus_df1.copy()


#### Refine Business Dataframe to only rows with relevant Enterprise Employment range categories

In [264]:

# Normalise the size labels to stripped strings so the membership test below
# compares exact values.
bus_df2['ENTERPRISE EMPLOYMENT SIZE 2'] = (
    bus_df2['ENTERPRISE EMPLOYMENT SIZE 2']
    .astype(str)
    .str.strip()
)

# Size bands of interest. Listed for reference only: the filter below works by
# exclusion, so any label not named in bad_data is kept.
good_data = ['0--4','5--9', '10--19', '20--99','100--499','500+']
# Labels to drop from the analysis.
bad_data = ['Total',' <20','<500' ,'<20','<5']

is_excluded_size = bus_df2['ENTERPRISE EMPLOYMENT SIZE 2'].isin(bad_data)
bus_df2 = bus_df2.loc[~is_excluded_size]


#### Refine Business Dataframe to only rows with relevant Industry names

In [265]:

# Industry descriptions that should not appear in the analysis.
remove_data = ['Total', 'X']

is_removed_industry = bus_df2['NAICS DESCRIPTION'].isin(remove_data)
bus_df2 = bus_df2.loc[~is_removed_industry]
        

#### Create column to classify Enterprise Employment ranges into Small & Large Business categories

In [266]:

def classifier(row):
    """Return the business-size label for one row.

    Reads row["ENTERPRISE EMPLOYMENT SIZE 2"] and returns "Small Business" for
    the bands below 500 employees, "Large Business" for '500+', and "N/A" for
    any other value.
    """
    size_band = row["ENTERPRISE EMPLOYMENT SIZE 2"]
    small_bands = ['0--4', '20--99', '10--19', '5--9','100--499']

    if size_band in small_bands:
        return "Small Business"
    if size_band == '500+':
        return "Large Business"
    return "N/A"
    
# Evaluate classifier once per row and store the label as a new column.
bus_df2["BUSINESS_CLASSIFICATION"] = bus_df2.apply(classifier, axis="columns")


#### Create 'BC Code' column (S/L prefix) used to build the concatenated Sector column for Visualization

In [267]:

def bc_code(row):
    """Return the one-letter business-size code for one row.

    Reads row["ENTERPRISE EMPLOYMENT SIZE 2"] and returns "S" for the bands
    below 500 employees, "L" for '500+', and "N/A" for any other value.
    """
    size_band = row["ENTERPRISE EMPLOYMENT SIZE 2"]
    small_bands = ['0--4', '20--99', '10--19', '5--9','100--499']

    if size_band in small_bands:
        return "S"
    if size_band == '500+':
        return "L"
    return "N/A"
    
# Evaluate bc_code once per row and store the code as a new column.
bus_df2["BC_CODE"] = bus_df2.apply(bc_code, axis="columns")


#### Create column for color grouping to be used in Bubble Visualization

In [268]:

def color_grouping(row):
    """Return the colour-group value for one row.

    Reads row["ENTERPRISE EMPLOYMENT SIZE 2"] and returns 1 for the bands below
    500 employees (including '<20'), 2 for '500+', and the string "N/A" for any
    other value.
    """
    size_band = row["ENTERPRISE EMPLOYMENT SIZE 2"]
    group_one_bands = ['0--4', '20--99', '<20', '10--19', '5--9','100--499']

    if size_band in group_one_bands:
        return 1
    if size_band == '500+':
        return 2
    return "N/A"

# Evaluate color_grouping once per row and store the group as a new column.
bus_df2["COLOR_GROUP"] = bus_df2.apply(color_grouping, axis="columns")


#### Create column for Sector grouping to be used in Bubble Visualization

In [269]:
# Sector key = size code followed by the NAICS code (e.g. "S" + "23" -> "S23").
bus_df2['SECTOR'] = bus_df2["BC_CODE"].add(bus_df2['NAICS CODE'])

#### Convert Payroll column from thousands of dollars to dollars

In [270]:
# The source column is expressed in $1,000 units; scale it to whole dollars.
bus_df2['ANNUAL PAYROLL'] = bus_df2['ANNUAL PAYROLL ($1,000)'].mul(1000)

#### Refine Business & States Dataframes to only include relevant columns

In [271]:

# Columns carried forward from the cleaned business frame, in this order.
business_df = bus_df2.loc[:, [
    'STATE DESCRIPTION',
    'NAICS CODE',
    'ENTERPRISE EMPLOYMENT SIZE 2',
    'SECTOR',
    'BUSINESS_CLASSIFICATION',
    'NUMBER OF FIRMS',
    'NUMBER OF ESTABLISHMENTS',
    'EMPLOYMENT',
    'ANNUAL PAYROLL',
    'YEAR',
    'COLOR_GROUP',
]].copy()

# Rename the state-name column so it matches the business frame's join key,
# then keep only the name and its code.
states_df = states_df.rename(columns={'STATE':'STATE DESCRIPTION'})
states_df = states_df.loc[:, ["STATE DESCRIPTION","STATE CODE"]]


#### Create joins to establish consolidated dataframe

In [272]:

# Left joins keep every business row, adding the NAICS columns first and the
# state code second.
business_df1 = (
    business_df
    .merge(naics_df, on='NAICS CODE', how='left')
    .merge(states_df, on='STATE DESCRIPTION', how='left')
)


#### Reorder Dataframe

In [273]:

# Select the columns in their final presentation order.
business_df1 = business_df1.loc[:, [
    'YEAR',
    'STATE DESCRIPTION',
    'STATE CODE',
    'NAICS CODE',
    'NAME',
    'ENTERPRISE EMPLOYMENT SIZE 2',
    'BUSINESS_CLASSIFICATION',
    'SECTOR',
    'NUMBER OF FIRMS',
    'NUMBER OF ESTABLISHMENTS',
    'EMPLOYMENT',
    'ANNUAL PAYROLL',
    'COLOR_GROUP',
]].copy()
business_df1.head(5)


Unnamed: 0,YEAR,STATE DESCRIPTION,STATE CODE,NAICS CODE,NAME,ENTERPRISE EMPLOYMENT SIZE 2,BUSINESS_CLASSIFICATION,SECTOR,NUMBER OF FIRMS,NUMBER OF ESTABLISHMENTS,EMPLOYMENT,ANNUAL PAYROLL,COLOR_GROUP
0,2008,Montana,MT,23,Construction,0--4,Small Business,S23,5905,5913,26822,1046694000,1
1,2008,Alabama,AL,11,"Agriculture, Forestry, Fishing and Hunting",0--4,Small Business,S11,858,864,6077,189340000,1
2,2008,Alabama,AL,21,"Mining, Quarrying, and Oil and Gas Extraction",0--4,Small Business,S21,162,169,2928,149140000,1
3,2008,Alabama,AL,23,Construction,0--4,Small Business,S23,9668,9696,93263,3430552000,1
4,2008,Alabama,AL,31-33,Manufacturing,0--4,Small Business,S31-33,3977,4065,94975,3360886000,1


#### Table with all relevant data

In [274]:

# Replace the space-separated source headers with underscore-separated names;
# 'NAME' becomes 'INDUSTRY'. Columns not listed here keep their names.
column_renames = {
    'NAME': 'INDUSTRY',
    'STATE DESCRIPTION': 'STATE_DESCRIPTION',
    'STATE CODE': 'STATE_CODE',
    'NAICS CODE': 'NAICS_CODE',
    'ENTERPRISE EMPLOYMENT SIZE 2': 'ENTERPRISE_EMPLOYMENT_SIZE',
    'NUMBER OF FIRMS': 'NUMBER_OF_FIRMS',
    'NUMBER OF ESTABLISHMENTS': 'NUMBER_OF_ESTABLISHMENTS',
    'ANNUAL PAYROLL': 'ANNUAL_PAYROLL',
}
table_df = business_df1.rename(columns=column_renames)
table_df.head(5)


Unnamed: 0,YEAR,STATE_DESCRIPTION,STATE_CODE,NAICS_CODE,INDUSTRY,ENTERPRISE_EMPLOYMENT_SIZE,BUSINESS_CLASSIFICATION,SECTOR,NUMBER_OF_FIRMS,NUMBER_OF_ESTABLISHMENTS,EMPLOYMENT,ANNUAL_PAYROLL,COLOR_GROUP
0,2008,Montana,MT,23,Construction,0--4,Small Business,S23,5905,5913,26822,1046694000,1
1,2008,Alabama,AL,11,"Agriculture, Forestry, Fishing and Hunting",0--4,Small Business,S11,858,864,6077,189340000,1
2,2008,Alabama,AL,21,"Mining, Quarrying, and Oil and Gas Extraction",0--4,Small Business,S21,162,169,2928,149140000,1
3,2008,Alabama,AL,23,Construction,0--4,Small Business,S23,9668,9696,93263,3430552000,1
4,2008,Alabama,AL,31-33,Manufacturing,0--4,Small Business,S31-33,3977,4065,94975,3360886000,1


#### Create data frame for Bubble Visualization

In [275]:

# Only the fields the bubble chart needs, copied so later edits stay local.
bub_df = table_df.loc[:, [
    'SECTOR',
    'INDUSTRY',
    'BUSINESS_CLASSIFICATION',
    'EMPLOYMENT',
    'ANNUAL_PAYROLL',
    'YEAR',
    'COLOR_GROUP',
]].copy()
bub_df.head()


Unnamed: 0,SECTOR,INDUSTRY,BUSINESS_CLASSIFICATION,EMPLOYMENT,ANNUAL_PAYROLL,YEAR,COLOR_GROUP
0,S23,Construction,Small Business,26822,1046694000,2008,1
1,S11,"Agriculture, Forestry, Fishing and Hunting",Small Business,6077,189340000,2008,1
2,S21,"Mining, Quarrying, and Oil and Gas Extraction",Small Business,2928,149140000,2008,1
3,S23,Construction,Small Business,93263,3430552000,2008,1
4,S31-33,Manufacturing,Small Business,94975,3360886000,2008,1


In [276]:
# Total payroll and employment for each sector / industry / classification /
# year / colour-group combination.
bubg_df = bub_df.groupby(["SECTOR", "INDUSTRY", 'BUSINESS_CLASSIFICATION', 'YEAR', 'COLOR_GROUP'])
payroll_sum = bubg_df['ANNUAL_PAYROLL'].sum().to_frame()
employment_sum = bubg_df['EMPLOYMENT'].sum().to_frame()

# Put the two totals side by side; both frames share the same group index.
bubble_df = pd.concat([payroll_sum, employment_sum], axis=1)

# Order the summary by year, ascending ('YEAR' is still an index level here).
bubble_df = bubble_df.sort_values(by='YEAR', ascending=True)

# Turn the group keys back into ordinary columns and preview the result.
bubble_df = bubble_df.reset_index()
bubble_df.head()

Unnamed: 0,SECTOR,INDUSTRY,BUSINESS_CLASSIFICATION,YEAR,COLOR_GROUP,ANNUAL_PAYROLL,EMPLOYMENT
0,L11,"Agriculture, Forestry, Fishing and Hunting",Large Business,2008,2,762376000,22397
1,S62,Health Care and Social Assistance,Small Business,2008,1,614642774000,16421150
2,S61,Educational Services,Small Business,2008,1,80075492000,2815597
3,L42,Wholesale Trade,Large Business,2008,2,165901247000,2470501
4,S56,Administrative and Support and Waste Managemen...,Small Business,2008,1,231193982000,7528779


#### Create columns to show Average Salary per Employee

In [277]:
# Average salary per employee = summed payroll divided by summed employment.
bubble_df['AVG_SALARY'] = bubble_df['ANNUAL_PAYROLL'].div(bubble_df['EMPLOYMENT'])

# Display version: whole dollars with thousands separators, e.g. "$34,039".
dollar_format = "${:,.0f}".format
bubble_df['AVG_SALARY_F'] = bubble_df['AVG_SALARY'].astype(float).map(dollar_format)

bubble_df.head()

Unnamed: 0,SECTOR,INDUSTRY,BUSINESS_CLASSIFICATION,YEAR,COLOR_GROUP,ANNUAL_PAYROLL,EMPLOYMENT,AVG_SALARY,AVG_SALARY_F
0,L11,"Agriculture, Forestry, Fishing and Hunting",Large Business,2008,2,762376000,22397,34039.201679,"$34,039"
1,S62,Health Care and Social Assistance,Small Business,2008,1,614642774000,16421150,37429.946989,"$37,430"
2,S61,Educational Services,Small Business,2008,1,80075492000,2815597,28439.969214,"$28,440"
3,L42,Wholesale Trade,Large Business,2008,2,165901247000,2470501,67152.875874,"$67,153"
4,S56,Administrative and Support and Waste Managemen...,Small Business,2008,1,231193982000,7528779,30708.031409,"$30,708"


#### Write Dataframes to PostgreSQL DB

In [282]:
# Persist both result frames, replacing any existing table of the same name
# and leaving the DataFrame index out of the written columns.
bubble_df.to_sql(
    "BUBBLE_SUMMARY",
    engine,
    index=False,
    if_exists='replace',
)
table_df.to_sql(
    "DATA",
    engine,
    index=False,
    if_exists='replace',
)

#### Export DataFrames to CSV for temporary usage

In [None]:

# Export the detail table to ./static/data/table_df.csv. The folder is created
# first so the export also works when ./static/data does not exist yet.
# (The original passed an undefined name, `output_file`, to to_csv, which
# raised NameError; the path built here is what gets written.)
file_path = os.path.join(".","static","data","table_df.csv")
os.makedirs(os.path.dirname(file_path), exist_ok=True)
table_df.to_csv(file_path, index=False, header=True)

# Export the bubble-chart summary to ./static/data/bubble_df.csv.
file_path = os.path.join(".","static","data","bubble_df.csv")
bubble_df.to_csv(file_path, index=False, header=True)

