In [93]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from sqlalchemy import create_engine
import pandas as pd
import json
import requests
import io

In [94]:
# Load runtime settings from the JSON config file (path is relative to the
# current working directory).
config_file_path = 'config.json'
# JSON text is UTF-8; pin the encoding so parsing does not depend on the
# platform's locale default.
with open(config_file_path, 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Azure Storage connection string consumed by the blob helper functions.
# Read from the config file so the secret is not hardcoded in the notebook;
# a missing key raises KeyError here, before any Azure call is attempted.
CONNECTION_STRING = config['AZURE_CONNECTION_STRING']

In [95]:
def upload_to_azure(data, blob_name, container_name):
    """
    Upload a payload to Azure Blob Storage, replacing any existing blob of the same name.

    data: payload to upload; either an in-memory buffer exposing ``getvalue()``
        (such as the ``io.BytesIO`` returned by ``download_file``) or the raw
        bytes/str content itself
    blob_name: name of the destination blob
    container_name: name of the container in the storage of the resource group
    """
    # Buffers are unwrapped to their full contents; anything else is passed
    # through unchanged so callers are not forced to wrap raw bytes in a buffer.
    payload = data.getvalue() if hasattr(data, "getvalue") else data

    blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
    # overwrite=True so re-running the upload replaces the blob.
    blob_client.upload_blob(payload, overwrite=True)

In [96]:
def get_azure_blob(container_name):
    """
    List the blobs stored in a container.

    container_name: name of the container to enumerate
    returns: the result of ``list_blobs()`` on that container's client,
        which callers iterate to get one entry per blob
    """
    service = BlobServiceClient.from_connection_string(CONNECTION_STRING)
    return service.get_container_client(container_name).list_blobs()

In [97]:
def get_azure_blob_data(container_name, blob):
    """
    Download the complete content of a single blob.

    container_name: name of the container holding the blob
    blob: blob entry whose ``name`` attribute identifies the blob to fetch
    returns: the blob's content as produced by ``download_blob().readall()``
    """
    service = BlobServiceClient.from_connection_string(CONNECTION_STRING)
    container = service.get_container_client(container_name)
    downloader = container.get_blob_client(blob.name).download_blob()
    return downloader.readall()

In [98]:
def download_file(url, timeout=30):
    """
    Download a URL and return its body as an in-memory binary buffer.

    url: address of the file to download
    timeout: seconds to wait on the server before giving up (default 30);
        without it a stalled connection would block the notebook indefinitely
    returns: io.BytesIO wrapping the downloaded bytes
    raises: requests.HTTPError when the server answers with a 4xx/5xx status,
        so an error page is never mistaken for file content
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of returning the error body as data.
    response.raise_for_status()
    return io.BytesIO(response.content)

In [99]:
def transform_ppp_loan_data(container_name='pppdata', name_filter='public_150k_plus'):
    """
    Load the first matching PPP loan CSV blob into a DataFrame.

    Despite the name, no transformation is applied yet: the CSV is returned
    exactly as parsed by pandas.

    container_name: container to search (default 'pppdata')
    name_filter: substring a blob name must contain to be selected
        (default 'public_150k_plus')
    returns: DataFrame parsed from the first blob whose name contains
        name_filter, or None when no blob in the container matches
    """
    blob_list = get_azure_blob(container_name)
    print("Found blobs in the container")
    for blob in blob_list:
        # Skip blobs that are not part of the requested extract.
        if name_filter not in blob.name:
            continue
        print(f"Downloading {blob.name}")
        blob_data = get_azure_blob_data(container_name, blob)
        print(f"Downloaded {blob.name} successfully")
        print(f"Reading {blob.name}")
        # Only the first match is loaded; any later matching blobs are ignored.
        return pd.read_csv(io.BytesIO(blob_data))

    # Make the miss visible rather than silently falling off the end, so a
    # later failure on the None result is easy to trace back to its cause.
    print(f"No blob containing '{name_filter}' found in container '{container_name}'")
    return None


In [100]:
def transform_naics_data():
    """
    Placeholder for transforming the NAICS data.

    Not implemented yet: takes no arguments and always returns None.
    """

In [101]:
def transform_gdp_data():
    """
    Placeholder for transforming the GDP data.

    Not implemented yet: takes no arguments and always returns None.
    """

In [103]:
# Load the PPP loan extract into a DataFrame. The loader returns None when no
# matching blob exists, in which case the preview below raises AttributeError.
df = transform_ppp_loan_data()
# Bare last expression so the notebook renders the first rows as the cell output.
df.head()

Found blobs in the container
Downloading public_150k_plus_230930.csv
Downloaded public_150k_plus_230930.csv successfully
Reading public_150k_plus_230930.csv


Unnamed: 0,LoanNumber,DateApproved,SBAOfficeCode,ProcessingMethod,BorrowerName,BorrowerAddress,BorrowerCity,BorrowerState,BorrowerZip,LoanStatusDate,...,BusinessType,OriginatingLenderLocationID,OriginatingLender,OriginatingLenderCity,OriginatingLenderState,Gender,Veteran,NonProfit,ForgivenessAmount,ForgivenessDate
0,9547507704,05/01/2020,464,PPP,"SUMTER COATINGS, INC.",2410 Highway 15 South,Sumter,,29150-9662,12/18/2020,...,Corporation,19248,Synovus Bank,COLUMBUS,GA,Unanswered,Unanswered,,773553.37,11/20/2020
1,9777677704,05/01/2020,464,PPP,"PLEASANT PLACES, INC.",7684 Southrail Road,North Charleston,,29420-9000,09/28/2021,...,Sole Proprietorship,19248,Synovus Bank,COLUMBUS,GA,Male Owned,Non-Veteran,,746336.24,08/12/2021
2,5791407702,05/01/2020,1013,PPP,BOYER CHILDREN'S CLINIC,1850 BOYER AVE E,SEATTLE,,98112-2922,03/17/2021,...,Non-Profit Organization,9551,"Bank of America, National Association",CHARLOTTE,NC,Unanswered,Unanswered,Y,696677.49,02/10/2021
3,6223567700,05/01/2020,920,PPP,KIRTLEY CONSTRUCTION INC,1661 MARTIN RANCH RD,SAN BERNARDINO,,92407-1740,10/16/2021,...,Corporation,9551,"Bank of America, National Association",CHARLOTTE,NC,Male Owned,Non-Veteran,,395264.11,09/10/2021
4,9662437702,05/01/2020,101,PPP,AERO BOX LLC,,,,,08/17/2021,...,,57328,The Huntington National Bank,COLUMBUS,OH,Unanswered,Unanswered,,370819.35,04/08/2021


In [102]:
if __name__ == "__main__":
    # Entry point: only the GDP transform is invoked, and it is currently a
    # stub that returns None, so this call prints and displays nothing.
    # NOTE(review): the output recorded under this cell shows a PPP download,
    # so it appears to predate this code; re-run the cell to refresh it.
    transform_gdp_data()
    #transform_naics_data()
    #transform_gdp_data()

Found blobs in the container
Downloading public_150k_plus_230930.csv
Downloaded public_150k_plus_230930.csv successfully
Reading public_150k_plus_230930.csv
