## Import Libraries

In [1]:

import itertools
import json
import os
from pprint import pprint

import numpy as np
import pandas as pd
import requests
from sqlalchemy import create_engine, inspect
from sqlalchemy.orm import Session


## Connect to PostgreSQL DB

In [2]:

# Connection settings are read from the environment when present so that real
# credentials never have to be saved in the notebook. The fallbacks are the
# original local-development values.
pg_user = os.environ.get('PG_USER', 'postgres')
pg_password = os.environ.get('PG_PASSWORD', 'password')
db_name = os.environ.get('PG_DATABASE', 'Enterprises')

# Host and port are fixed to localhost:5432.
connection_string = f"{pg_user}:{pg_password}@localhost:5432/{db_name}"
engine = create_engine(f'postgresql://{connection_string}')



#### Get table Names

In [3]:

# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API is the supported way to list the table names.
inspect(engine).get_table_names()


['BUSINESSES', 'NAICS', 'STATES', 'DATA']

#### Create dataframes from SQL Query

In [4]:

# Pull each source table in full into its own dataframe
# (table names are quoted because they are upper-case in the query).
bus_df, naics_df, states_df = (
    pd.read_sql_query(f'select * from "{table_name}"', con=engine)
    for table_name in ("BUSINESSES", "NAICS", "STATES")
)


#### Refine Business Dataframe to only rows with employment values (where 'Employment Range Flag' is null)

In [5]:

# Keep only the rows whose 'EMPLOYMENT RANGE FLAG' is null, then work on an
# independent copy so later column assignments do not touch bus_df1.
bus_df1 = bus_df.loc[bus_df['EMPLOYMENT RANGE FLAG'].isna()]
bus_df2 = bus_df1.copy()


#### Refine Business Dataframe to only rows with relevant Enterprise Employment range categories

In [6]:

# Normalise the size labels to stripped strings so the membership test is exact.
bus_df2['ENTERPRISE EMPLOYMENT SIZE 2'] = (
    bus_df2['ENTERPRISE EMPLOYMENT SIZE 2']
    .astype(str)
    .str.strip()
)

# good_data lists the buckets that are kept for reference; only bad_data
# drives the filter below.
good_data = ['0--4','5--9', '10--19', '20--99','100--499','500+']
bad_data = ['Total',' <20','<500' ,'<20','<5']

# Drop every row whose size label is one of the bad_data labels.
bus_df2 = bus_df2[~bus_df2['ENTERPRISE EMPLOYMENT SIZE 2'].isin(bad_data)]


#### Refine Business Dataframe to only rows with relevant Industry names

In [7]:

# Drop rows whose 'NAICS DESCRIPTION' is one of the placeholder values below.
remove_data = ['Total', 'X']
bus_df2 = bus_df2[~bus_df2['NAICS DESCRIPTION'].isin(remove_data)]
        

#### Create column to classify Enterprise Employment ranges into Small & Large Business categories

In [8]:

def classifier(row):
    """Return the business-size label for one row.

    Reads row["ENTERPRISE EMPLOYMENT SIZE 2"]: the five buckets below '500+'
    map to "Small Business", '500+' maps to "Large Business", and any other
    value maps to "N/A".
    """
    size_bucket = row["ENTERPRISE EMPLOYMENT SIZE 2"]
    if size_bucket in ('0--4', '20--99', '10--19', '5--9', '100--499'):
        return "Small Business"
    if size_bucket == '500+':
        return "Large Business"
    return "N/A"
    
# Evaluate classifier once per row and store the label as a new column.
bus_df2 = bus_df2.assign(BUSINESS_CLASSIFICATION=bus_df2.apply(classifier, axis=1))


#### Create 'BC Code' for Concatenated Column for Visualization

In [9]:

def bc_code(row):
    """Return the one-letter business-size code for one row.

    Reads row["ENTERPRISE EMPLOYMENT SIZE 2"]: the five buckets below '500+'
    give "S", '500+' gives "L", and any other value gives "N/A".
    """
    size_bucket = row["ENTERPRISE EMPLOYMENT SIZE 2"]
    if size_bucket in ('0--4', '20--99', '10--19', '5--9', '100--499'):
        return "S"
    if size_bucket == '500+':
        return "L"
    return "N/A"
    
# Evaluate bc_code once per row and store the code as a new column.
bus_df2 = bus_df2.assign(BC_CODE=bus_df2.apply(bc_code, axis=1))


#### Create column for color grouping to be used in Bubble Visualization

In [10]:

def color_grouping(row):
    """Return the colour-group value for one row.

    Reads row["ENTERPRISE EMPLOYMENT SIZE 2"]: the buckets below '500+'
    (including '<20') give 1, '500+' gives 2, and any other value gives the
    string "N/A".
    """
    size_bucket = row["ENTERPRISE EMPLOYMENT SIZE 2"]
    if size_bucket in ('0--4', '20--99', '<20', '10--19', '5--9', '100--499'):
        return 1
    if size_bucket == '500+':
        return 2
    return "N/A"

# Evaluate color_grouping once per row and store the result as a new column.
bus_df2 = bus_df2.assign(COLOR_GROUP=bus_df2.apply(color_grouping, axis=1))


#### Create column for Sector grouping to be used in Bubble Visualization

In [11]:
# SECTOR is the BC_CODE value followed by the NAICS code.
bus_df2['SECTOR'] = bus_df2["BC_CODE"].add(bus_df2['NAICS CODE'])

#### Convert Payroll column to real value from rounded value

In [12]:
# The source column is expressed in thousands of dollars; scale it to dollars.
bus_df2['ANNUAL PAYROLL'] = bus_df2['ANNUAL PAYROLL ($1,000)'].mul(1000)

#### Refine Business & States Dataframes to only include relevant columns

In [13]:

# Keep only the columns needed downstream, as an independent copy.
business_df = bus_df2.loc[:, [
    'STATE DESCRIPTION',
    'NAICS CODE',
    'ENTERPRISE EMPLOYMENT SIZE 2',
    'SECTOR',
    'BUSINESS_CLASSIFICATION',
    'NUMBER OF FIRMS',
    'NUMBER OF ESTABLISHMENTS',
    'EMPLOYMENT',
    'ANNUAL PAYROLL',
    'YEAR',
    'COLOR_GROUP',
]].copy()

# Rename STATE so it matches the business frame's column, then keep the
# state name and code only.
states_df = (
    states_df
    .rename(columns={'STATE':'STATE DESCRIPTION'})
    [["STATE DESCRIPTION","STATE CODE"]]
)


#### Create joins to establish consolidated dataframe

In [14]:

# Left-join the NAICS table on 'NAICS CODE', then the state table on
# 'STATE DESCRIPTION'; every business row is kept.
business_df1 = (
    business_df
    .merge(naics_df, on='NAICS CODE', how='left')
    .merge(states_df, on='STATE DESCRIPTION', how='left')
)


#### Reorder Dataframe

In [15]:

# Put the columns into presentation order (as an independent copy).
business_df1 = business_df1.loc[:, [
    'YEAR',
    'STATE DESCRIPTION',
    'STATE CODE',
    'NAICS CODE',
    'NAME',
    'ENTERPRISE EMPLOYMENT SIZE 2',
    'BUSINESS_CLASSIFICATION',
    'SECTOR',
    'NUMBER OF FIRMS',
    'NUMBER OF ESTABLISHMENTS',
    'EMPLOYMENT',
    'ANNUAL PAYROLL',
    'COLOR_GROUP',
    'INDEX',
]].copy()
business_df1.head(5)


Unnamed: 0,YEAR,STATE DESCRIPTION,STATE CODE,NAICS CODE,NAME,ENTERPRISE EMPLOYMENT SIZE 2,BUSINESS_CLASSIFICATION,SECTOR,NUMBER OF FIRMS,NUMBER OF ESTABLISHMENTS,EMPLOYMENT,ANNUAL PAYROLL,COLOR_GROUP,INDEX
0,2008,Montana,MT,23,Construction,0--4,Small Business,S23,5905,5913,26822,1046694000,1,4
1,2008,Alabama,AL,11,"Agriculture, Forestry, Fishing and Hunting",0--4,Small Business,S11,858,864,6077,189340000,1,1
2,2008,Alabama,AL,21,"Mining, Quarrying, and Oil and Gas Extraction",0--4,Small Business,S21,162,169,2928,149140000,1,2
3,2008,Alabama,AL,23,Construction,0--4,Small Business,S23,9668,9696,93263,3430552000,1,4
4,2008,Alabama,AL,31-33,Manufacturing,0--4,Small Business,S31-33,3977,4065,94975,3360886000,1,5


#### Table with all relevant data

In [16]:

# Rename to snake_case column names; columns not listed keep their names.
table_df = business_df1.rename(columns={
    'INDEX': 'INDUSTRY_INDEX',
    'NAME': 'INDUSTRY',
    'STATE DESCRIPTION': 'STATE_DESCRIPTION',
    'STATE CODE': 'STATE_CODE',
    'NAICS CODE': 'NAICS_CODE',
    'ENTERPRISE EMPLOYMENT SIZE 2': 'ENTERPRISE_EMPLOYMENT_SIZE',
    'NUMBER OF FIRMS': 'NUMBER_OF_FIRMS',
    'NUMBER OF ESTABLISHMENTS': 'NUMBER_OF_ESTABLISHMENTS',
    'ANNUAL PAYROLL': 'ANNUAL_PAYROLL',
})
# Output file name and path
#file_path = os.path.join(".","static","data","table_df.csv")
#table_df.to_csv(file_path, index=False, header=True)


In [18]:
    # Work on the columns needed for the per-industry summary.
    pmobs_df = table_df.loc[:, ['YEAR', 'NAICS_CODE', 'INDUSTRY', "NUMBER_OF_FIRMS", "BUSINESS_CLASSIFICATION", 'EMPLOYMENT', 'ANNUAL_PAYROLL']].copy()

    # Drop rows whose INDUSTRY is one of the placeholder values.
    remove_data = ['Industries not classified', 'X']
    pmobs_df = pmobs_df[~pmobs_df["INDUSTRY"].isin(remove_data)]

    # Total payroll, employment and firm count per year / industry / class.
    pmobg_df = pmobs_df.groupby(['YEAR', 'NAICS_CODE', 'INDUSTRY', "BUSINESS_CLASSIFICATION"])
    ppayroll_sum = pmobg_df['ANNUAL_PAYROLL'].sum().to_frame()
    pemployment_sum = pmobg_df['EMPLOYMENT'].sum().to_frame()
    pfirms_sum = pmobg_df['NUMBER_OF_FIRMS'].sum().to_frame()

    # Combine the totals side by side, order by YEAR ascending, and turn the
    # group keys back into ordinary columns.
    pmob_df = (
        pd.concat([ppayroll_sum, pemployment_sum, pfirms_sum], axis=1)
        .sort_values(by='YEAR', ascending=True)
        .reset_index()
    )

    # Base-2 logarithm of the firm count.
    pmob_df['FIRMS_log'] = np.log2(pmob_df['NUMBER_OF_FIRMS'])

    # One dict per summary row; print the first as a sample.
    dictionary = pmob_df.to_dict(orient='records')

    pprint(dictionary[0])

{'ANNUAL_PAYROLL': 762376000,
 'BUSINESS_CLASSIFICATION': 'Large Business',
 'EMPLOYMENT': 22397,
 'FIRMS_log': 7.409390936137702,
 'INDUSTRY': 'Agriculture, Forestry, Fishing and Hunting',
 'NAICS_CODE': '11',
 'NUMBER_OF_FIRMS': 170,
 'YEAR': 2008}


In [None]:
    # Sample record kept for reference only; a bare key/value listing is not
    # valid Python, so it is commented out to keep the notebook runnable.
    # "ANNUAL_PAYROLL": 1046694000.0,
    # "BUSINESS_CLASSIFICATION": "Small Business",
    # "COLOR_GROUP": 1,
    # "EMPLOYMENT": 26822,
    # "ENTERPRISE_EMPLOYMENT_SIZE": "0--4",
    # "INDUSTRY": "Construction",
    # "INDUSTRY_INDEX": 4,
    # "NAICS_CODE": "23",
    # "NUMBER_OF_ESTABLISHMENTS": "5913",
    # "NUMBER_OF_FIRMS": "5905",
    # "SECTOR": "S23",
    # "STATE_CODE": "MT",
    # "STATE_DESCRIPTION": "Montana",
    # "YEAR": 2008,
    # "id": 1

#### Create data frame for Bubble Visualization

In [None]:

# Columns needed for the bubble chart, as an independent copy.
bub_df = table_df.loc[:, [
    'INDUSTRY_INDEX',
    'SECTOR',
    'INDUSTRY',
    'BUSINESS_CLASSIFICATION',
    'EMPLOYMENT',
    'ANNUAL_PAYROLL',
    'YEAR',
    'COLOR_GROUP',
]].copy()
bub_df.head()


In [None]:
# Total payroll and employment per industry / sector / class / year / colour group.
bubg_df = bub_df.groupby(["INDUSTRY_INDEX","SECTOR", "INDUSTRY", 'BUSINESS_CLASSIFICATION', 'YEAR', 'COLOR_GROUP'])
payroll_sum = bubg_df['ANNUAL_PAYROLL'].sum().to_frame()
employment_sum = bubg_df['EMPLOYMENT'].sum().to_frame()

# Combine the two totals side by side, order by YEAR ascending, and turn the
# group keys back into ordinary columns.
bubble_df = (
    pd.concat([payroll_sum, employment_sum], axis=1)
    .sort_values(by='YEAR', ascending=True)
    .reset_index()
)

# Preview the summary
bubble_df.head()

#### Create columns to show Average Salary per Employee

In [None]:
# Average salary = total payroll divided by total employment for each row.
bubble_df['AVG_SALARY'] = bubble_df['ANNUAL_PAYROLL'].div(bubble_df['EMPLOYMENT'])

# Display version: whole dollars with a '$' prefix and thousands separators.
bubble_df['AVG_SALARY_F'] = bubble_df['AVG_SALARY'].astype(float).apply("${:,.0f}".format)

bubble_df.head()

In [None]:

# Columns needed for the state-level summary, as an independent copy.
mapp_df = table_df.loc[:, [
    "STATE_DESCRIPTION",
    "STATE_CODE",
    "BUSINESS_CLASSIFICATION",
    "YEAR",
    'EMPLOYMENT',
    'ANNUAL_PAYROLL',
]].copy()
mapp_df.head()


In [None]:
# Total payroll and employment per state / business class / year.
mapg_df = mapp_df.groupby(["STATE_DESCRIPTION","STATE_CODE","BUSINESS_CLASSIFICATION", "YEAR"])
mpayroll_sum = mapg_df['ANNUAL_PAYROLL'].sum().to_frame()
memployment_sum = mapg_df['EMPLOYMENT'].sum().to_frame()

# Combine the two totals side by side, order by YEAR ascending, and turn the
# group keys back into ordinary columns.
map_df = (
    pd.concat([mpayroll_sum, memployment_sum], axis=1)
    .sort_values(by='YEAR', ascending=True)
    .reset_index()
)

# Preview the summary
map_df.head()


#### Write Dataframes to PostgreSQL DB

In [None]:
# bubble_df.to_sql(name="BUBBLE_SUMMARY", con=engine, if_exists='replace', index=False)
# table_df.to_sql(name="DATA", con=engine, if_exists='replace', index=False)

#### Export DataFrames to CSV for temporary use

In [None]:

# Write each frame to ./static/data/<name>.csv. The directory is created first
# because DataFrame.to_csv does not create missing parent folders.
output_dir = os.path.join(".", "static", "data")
os.makedirs(output_dir, exist_ok=True)

for csv_name, frame in (
    ("table_df.csv", table_df),
    ("bubble_df.csv", bubble_df),
    ("map_df.csv", map_df),
):
    # Output file name and path
    file_path = os.path.join(output_dir, csv_name)
    frame.to_csv(file_path, index=False, header=True)



In [None]:
# Convert the bubble summary into a list with one dict per row.
test = bubble_df.to_dict(orient='records')
# Preview only the first few records; echoing the whole list would flood the
# notebook output.
test[:5]


#### Transformation for Dual Line Chart

In [None]:
# Columns needed for the dual line chart, as an independent copy.
businesses_df = table_df.loc[:, [
    "YEAR",
    "STATE_DESCRIPTION",
    "STATE_CODE",
    'NAICS_CODE',
    'INDUSTRY',
    "BUSINESS_CLASSIFICATION",
    'EMPLOYMENT',
    'ANNUAL_PAYROLL',
]].copy()

In [None]:
# Each list names the classifications to exclude from one of the two frames.
remove_datas = ['Large Business', 'X']
remove_datab = ['Small Business', 'X']

# s_businesses_df: everything not in remove_datas.
# b_businesses_df: everything not in remove_datab.
s_businesses_df = businesses_df[~businesses_df["BUSINESS_CLASSIFICATION"].isin(remove_datas)]
b_businesses_df = businesses_df[~businesses_df["BUSINESS_CLASSIFICATION"].isin(remove_datab)]


In [None]:
#### Small Business Data Frame Transformations
# Total payroll and employment per year / state / industry / class.
sg_businesses_df = s_businesses_df.groupby(["YEAR", "STATE_DESCRIPTION","STATE_CODE", 'NAICS_CODE', 'INDUSTRY', "BUSINESS_CLASSIFICATION"])
s_payroll_sum = sg_businesses_df['ANNUAL_PAYROLL'].sum().to_frame()
s_employment_sum = sg_businesses_df['EMPLOYMENT'].sum().to_frame()

# Combine the two totals side by side, order by YEAR ascending, and turn the
# group keys back into ordinary columns.
s_business_df = (
    pd.concat([s_payroll_sum, s_employment_sum], axis=1)
    .sort_values(by='YEAR', ascending=True)
    .reset_index()
)

# Average salary, then suffix the value columns with _S so they stay distinct
# after the later merge.
s_business_df['AVG_SALARY_S'] = s_business_df['ANNUAL_PAYROLL'].div(s_business_df['EMPLOYMENT'])
s_business_df = s_business_df.rename(columns={
    'ANNUAL_PAYROLL': 'ANNUAL_PAYROLL_S',
    'EMPLOYMENT': 'EMPLOYMENT_S',
    "BUSINESS_CLASSIFICATION": "BUSINESS_CLASSIFICATION_S",
})
s_business_df.head()

In [None]:
#### Large Business Data Frame Transformations
# Total payroll and employment per year / state / industry / class.
bg_businesses_df = b_businesses_df.groupby(["YEAR", "STATE_DESCRIPTION","STATE_CODE", 'NAICS_CODE', 'INDUSTRY', "BUSINESS_CLASSIFICATION"])
b_payroll_sum = bg_businesses_df['ANNUAL_PAYROLL'].sum().to_frame()
b_employment_sum = bg_businesses_df['EMPLOYMENT'].sum().to_frame()

# Combine the two totals side by side, order by YEAR ascending, and turn the
# group keys back into ordinary columns.
b_business_df = (
    pd.concat([b_payroll_sum, b_employment_sum], axis=1)
    .sort_values(by='YEAR', ascending=True)
    .reset_index()
)

# Average salary, then suffix the value columns with _B so they stay distinct
# after the later merge.
b_business_df['AVG_SALARY_B'] = b_business_df['ANNUAL_PAYROLL'].div(b_business_df['EMPLOYMENT'])
b_business_df = b_business_df.rename(columns={
    'ANNUAL_PAYROLL': 'ANNUAL_PAYROLL_B',
    'EMPLOYMENT': 'EMPLOYMENT_B',
    "BUSINESS_CLASSIFICATION": "BUSINESS_CLASSIFICATION_B",
})
b_business_df.head()

In [None]:
# Left-join the large-business figures onto the small-business rows using the
# shared year / state / industry keys, then drop the rows that found no
# large-business match (BUSINESS_CLASSIFICATION_B is missing).
merge = (
    s_business_df
    .merge(
        b_business_df,
        how='left',
        on=["YEAR", "STATE_DESCRIPTION","STATE_CODE", 'NAICS_CODE', 'INDUSTRY'],
    )
    .dropna(subset=["BUSINESS_CLASSIFICATION_B"])
)

In [None]:
# Output file name and path
file_path = os.path.join(".","static","data","linechart_df.csv")
# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists before writing.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
merge.to_csv(file_path, index=False, header=True)