In [1]:
# Import libraries
import io
import json
import os
import re
import time
import warnings
import zipfile
from datetime import datetime
from math import ceil
from urllib.parse import urljoin

# Silence library warnings for the whole notebook
warnings.filterwarnings('ignore')

import google.cloud.storage
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from bs4 import BeautifulSoup

In [2]:
# Load the notebook settings from the JSON file in the working directory
config_file_path = 'config.json'
with open(config_file_path, 'r') as config_file:
    config = json.loads(config_file.read())

# Azure Storage connection string used by upload_to_azure
CONNECTION_STRING = config['AZURE_CONNECTION_STRING']

In [3]:
def download_file(url, timeout=60):
    """Download ``url`` and return its body as an in-memory binary stream.

    Args:
        url: Address of the file to fetch with an HTTP GET.
        timeout: Seconds to wait on the server before giving up; forwarded
            to ``requests.get``. Without it a stalled server would block the
            notebook indefinitely.

    Returns:
        io.BytesIO: The complete response body, positioned at offset 0.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors; otherwise an error page would be handed to
    # the CSV/XLSX/ZIP parsers as if it were the requested file.
    response.raise_for_status()
    return io.BytesIO(response.content)

In [4]:
def upload_to_azure(data, blob_name, container_name):
    """Upload an in-memory buffer to Azure Blob Storage, replacing any existing blob.

    Args:
        data: Buffer exposing ``getvalue()`` (callers in this notebook pass
            ``io.StringIO`` objects holding CSV text).
        blob_name: Name of the blob to create or overwrite.
        container_name: Name of the container that holds the blob.
    """
    # A new service client is built on every call; the ``with`` block closes
    # its connections once the upload finishes instead of leaving them open.
    with BlobServiceClient.from_connection_string(CONNECTION_STRING) as blob_service_client:
        blob_client = blob_service_client.get_blob_client(
            container=container_name, blob=blob_name
        )
        # overwrite=True so re-running the notebook refreshes existing blobs
        blob_client.upload_blob(data.getvalue(), overwrite=True)

Loan data: 

Go to https://data.sba.gov/dataset/ppp-foia 

This page lists 13 CSV files, all of which need to be uploaded. 

Example link of file: https://data.sba.gov/dataset/8aa276e2-6cab-4f86-aca4-a7dde42adf24/resource/738e639c-1fbf-4e16-beb0-a223831011e8/download/public_150k_plus_230930.csv 

In [5]:
# Function to process PPP loan data
def process_ppp_loan_data():
    """Copy every CSV linked from the SBA PPP FOIA dataset page into Azure.

    The dataset page is scraped for anchors whose ``href`` ends in ``.csv``.
    Each file is downloaded, parsed with pandas (to report its row count),
    written back out as CSV and uploaded to the ``pppdata`` container under
    its original file name.

    Raises:
        requests.HTTPError: If the dataset page itself cannot be fetched.
    """
    # Azure container name
    container_name = "pppdata"

    # Get the dataset page that lists the CSV resources
    ppp_url = "https://data.sba.gov/dataset/ppp-foia"
    response = requests.get(ppp_url)
    # Stop on an HTTP error instead of scraping an error page, finding no
    # links and still reporting success.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect the CSV links. Each href is resolved against the page URL so a
    # relative link is still downloadable; dict.fromkeys drops repeated links
    # while keeping page order, so no file is transferred twice.
    csv_urls = list(dict.fromkeys(
        urljoin(ppp_url, link['href'])
        for link in soup.find_all('a', href=True)
        if link['href'].endswith('.csv')
    ))

    for file_url in csv_urls:
        file_name = file_url.split('/')[-1]

        # Download the file
        print(f"Processing:\t{file_name}")
        file_content = download_file(file_url)
        print(f"{file_name} downloaded successfully")

        # Read the CSV file
        # NOTE(review): pandas infers column types here, so the uploaded copy
        # may not be byte-identical to the source file - confirm this is intended.
        df = pd.read_csv(file_content)
        print(f"CSV file {file_name} read successfully")
        print(f"Number of rows in {file_name}:\t{len(df)}")

        # Upload to Azure
        output = io.StringIO()
        df.to_csv(output, index=False)
        output.seek(0)
        print(f"Uploading {file_name} to Azure")
        upload_to_azure(output, file_name, container_name)
        print(f"{file_name} uploaded to Azure successfully \n")
    print("All files processed successfully")
            


NAICS codes: 

Get the most recent year's “NAICS Descriptions” XLSX file from: https://www.census.gov/naics/?48967  
In this case, it is “2022 NAICS Descriptions” and the link to the file is: https://www.census.gov/naics/2022NAICS/2022_NAICS_Descriptions.xlsx 

In [7]:
def process_naics_data():
    """Fetch the 2022 NAICS Descriptions workbook and store it in Azure as CSV.

    The XLSX file is downloaded from census.gov, loaded with pandas, written
    back out as CSV text and uploaded to the ``naicsdata`` container under
    the workbook's name with a ``.csv`` extension.
    """
    target_container = "naicsdata"
    source_url = "https://www.census.gov/naics/2022NAICS/2022_NAICS_Descriptions.xlsx"

    # The last path segment of the URL is the workbook's file name
    xlsx_name = source_url.rsplit('/', 1)[-1]

    # Fetch the workbook
    print(f"Processing:\t{xlsx_name}")
    workbook_bytes = download_file(source_url)
    print(f"{xlsx_name} downloaded successfully")

    # Load it and re-serialise as CSV text held in memory
    naics_df = pd.read_excel(workbook_bytes)
    print(f"File {xlsx_name} read successfully")
    print(f"Converting {xlsx_name} to CSV")
    csv_buffer = io.StringIO()
    naics_df.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)
    csv_name = xlsx_name.replace(".xlsx", ".csv")
    print(f"{csv_name} converted to CSV successfully")
    print(f"Number of rows in NAICS Descriptions:\t{len(naics_df)}")

    # Push the CSV to blob storage
    print(f"Uploading {csv_name} to Azure")
    upload_to_azure(csv_buffer, csv_name, target_container)
    print(f"{csv_name} uploaded to Azure successfully\n")
    print("All files processed successfully")


GDP data: 

Go to this website and download a Zip file: https://apps.bea.gov/regional/zip/CAGDP1.zip 

From this Zip file, we only need the CSV whose file name contains “CAGDP1__ALL_AREAS”.

In [8]:
# Function to process GDP data
def process_gdp_data():
    """Upload the all-areas GDP table from the BEA CAGDP1 archive to Azure.

    Downloads ``CAGDP1.zip`` from bea.gov and, for every archive member whose
    name contains ``CAGDP1__ALL_AREAS``, reads it as a latin-1 encoded CSV,
    writes it back out as CSV text and uploads it to the ``gdpdata``
    container under the member's name.
    """
    target_container = "gdpdata"
    archive_url = "https://apps.bea.gov/regional/zip/CAGDP1.zip"
    wanted_marker = "CAGDP1__ALL_AREAS"

    archive_bytes = download_file(archive_url)

    with zipfile.ZipFile(archive_bytes) as archive:
        # Keep only the members covering all areas; everything else is skipped
        wanted_members = [name for name in archive.namelist() if wanted_marker in name]

        for member in wanted_members:
            print(f"Processing:\t{member}")
            with archive.open(member) as member_file:
                print(f"{member} downloaded successfully")
                # Decode the member explicitly as latin-1 while parsing
                gdp_df = pd.read_csv(member_file, encoding='latin-1')
            print(f"CSV file {member} read successfully")
            print(f"Number of rows in {member}: {len(gdp_df)}")

            # Re-serialise in memory and push to blob storage
            csv_buffer = io.StringIO()
            gdp_df.to_csv(csv_buffer, index=False)
            csv_buffer.seek(0)
            print(f"Uploading {member} to Azure")
            upload_to_azure(csv_buffer, member, target_container)
            print(f"{member} uploaded to Azure successfully\n")
    print("All files processed successfully")

# Main 

In [9]:
if __name__ == "__main__":
    # Run the three ingestion steps in sequence: the PPP loan CSVs, the NAICS
    # descriptions workbook, then the GDP archive. Nothing is caught here, so
    # an exception raised by one step prevents the later steps from running.
    process_ppp_loan_data()
    process_naics_data()
    process_gdp_data()

Processing:	2022_NAICS_Descriptions.xlsx
2022_NAICS_Descriptions.xlsx downloaded successfully
File 2022_NAICS_Descriptions.xlsx read successfully
Converting 2022_NAICS_Descriptions.xlsx to CSV
2022_NAICS_Descriptions.csv converted to CSV successfully
Number of rows in NAICS Descriptions:	2125
Uploading 2022_NAICS_Descriptions.csv to Azure
2022_NAICS_Descriptions.csv uploaded to Azure successfully

All files processed successfully
