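# Purpose

Pull the SBA Paycheck Protection Program (PPP) loan data that the team shares on Google Drive and load it into a single pandas DataFrame for analysis.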

# Imports

In [1]:
import autoreload
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import logging
logger = logging.getLogger()

import os
os.chdir("..") # go up to root

import datetime as dt
import pytz

from io import BytesIO

import pickle
from googleapiclient.discovery import build

# SBA Paycheck Protection Program Data



## Load Latest Cleaned File

Other members of the volunteer data scientist team are doing data cleaning of their own and pushing updates into a shared Google Drive, so let's get that first!

In [63]:
# Load the saved Google Drive credentials from the local token file.
# NOTE: pickle.load can execute arbitrary code while unpickling, so this file
# must only ever come from a trusted source.
with open('secure_keys/token.pickle', mode='rb') as token:
    creds = pickle.load(token)

# Client for version 3 of the Google Drive API, authorized with those credentials
service = build(serviceName='drive', version='v3', credentials=creds)

In [82]:
# Look up the Drive folder named 'All Data by State' (it holds the raw PPP data).
# Restricting the search to folders keeps a same-named non-folder file from
# being picked up as the first match.
results = service.files().list(
    q="name = 'All Data by State' and mimeType = 'application/vnd.google-apps.folder'"
).execute()

# The 'files' key is missing or empty when nothing matches, so fail with a clear
# message instead of an unexplained "list index out of range" from [0].
matching_folders = results.get('files', [])
if not matching_folders:
    raise IndexError(
        "No Google Drive folder named 'All Data by State' is visible to these credentials"
    )

data_folder_info = matching_folders[0]
data_folder_info

{'kind': 'drive#file',
 'id': '1uuO0075wElCtZVqk9HaUWPCxizohc0L9',
 'name': 'All Data by State',
 'mimeType': 'application/vnd.google-apps.folder'}

In [73]:
# List every folder whose parent is the 'All Data by State' folder found above
results = (
    service.files()
    .list(
        q=(
            f"'{data_folder_info['id']}' in parents "
            "and mimeType = 'application/vnd.google-apps.folder'"
        )
    )
    .execute()
)

# Fall back to an empty list when the response carries no 'files' key
data_subfolders = results.get('files', [])
data_subfolders

[{'kind': 'drive#file',
  'id': '1g92YGs6BD40S89CqRZhlL2XXPQE-JIR8',
  'name': 'Wyoming',
  'mimeType': 'application/vnd.google-apps.folder'},
 {'kind': 'drive#file',
  'id': '19OOYSCuzi2jGJXUBMb8EDtfUxjRW8NyN',
  'name': 'Vermont',
  'mimeType': 'application/vnd.google-apps.folder'},
 {'kind': 'drive#file',
  'id': '1LCRqy05NV8nwaOg-YJOnt0ueow0WFJNq',
  'name': 'Virginia',
  'mimeType': 'application/vnd.google-apps.folder'},
 {'kind': 'drive#file',
  'id': '1XVK0rzn9DteU5Jg5rMootujPKzyfP5bS',
  'name': 'Wisconsin',
  'mimeType': 'application/vnd.google-apps.folder'},
 {'kind': 'drive#file',
  'id': '1NJNG2ewGUrela4JCNZOeq-rCs5WNX-RT',
  'name': 'Washington',
  'mimeType': 'application/vnd.google-apps.folder'},
 {'kind': 'drive#file',
  'id': '1rUua0BiZFioslEY-LJJMXglGIdFMzGly',
  'name': 'Virgin Islands',
  'mimeType': 'application/vnd.google-apps.folder'},
 {'kind': 'drive#file',
  'id': '1NeT6s5LznLXYTlyY1Z-jLuTGXFzQVLOk',
  'name': 'Utah',
  'mimeType': 'application/vnd.google-apps

In [76]:
# Collect the Drive ID of each child folder; these IDs are what the CSV search
# uses as candidate parent folders
data_subfolder_ids = [folder_info['id'] for folder_info in data_subfolders]

data_subfolder_ids

['1g92YGs6BD40S89CqRZhlL2XXPQE-JIR8',
 '19OOYSCuzi2jGJXUBMb8EDtfUxjRW8NyN',
 '1LCRqy05NV8nwaOg-YJOnt0ueow0WFJNq',
 '1XVK0rzn9DteU5Jg5rMootujPKzyfP5bS',
 '1NJNG2ewGUrela4JCNZOeq-rCs5WNX-RT',
 '1rUua0BiZFioslEY-LJJMXglGIdFMzGly',
 '1NeT6s5LznLXYTlyY1Z-jLuTGXFzQVLOk',
 '1y9S9t8zVdazAjiqQQJMvC6vJcpqn940e',
 '1ujNT5X0T8HyXWIgSgmyOACqLjb7lIRfz',
 '1qlt355n6bPlpXgb0e65PVwliRiPQX_Z1',
 '1Gea62RaeWCFVRFv0SdNtTE1URaU1YCPN',
 '1xbGhYS-kfBp2sePhquDU8fy7Wy8Vm8Kn',
 '1lMJJjNh_MKXRWpkyt4YDsyJfPBu06hNM',
 '1tYblyYoqg-ggsHvrjTvJ2ntFiMr0pY4M',
 '11iQjkDi6HT2YengU3beDzgbuHZk9LQhj',
 '12vf4_vOxS3HScnP5Ba9GMoiGECLuKD7A',
 '17gkIBQe2eAcCkRKeJ0FSxvQGVFsEmpqU',
 '1s2nsIS6j7Vt2pdiTmFk7O5EI0ybRosqM',
 '1TSry8Gw8yy9JhWTX2W8sqRiYUd3MjdM-',
 '1r1NnJlUJmDVhX0ecdxfizr9BwbZH0rI5',
 '1e2hVMMlyWHs1podHUsA2sbyrK8TGv-B4',
 '121M-DDixDLNXy43bu-dKNpIHR8pEDKas',
 '1UR8CofrtH8EFMPSf7chIlSKjkCxWSacY',
 '1waA32WjdtBktdDUi4ECcbDHae2zSGeog',
 '1VfG6425yIcUKXhJqG-DuooheYmGd_l_-',
 '1CioK5rO3ytUg82ViytlegaNpGy-8UqyW',
 '1q3ktB6faZ

In [86]:
# Build a Drive search query that matches CSV files sitting directly inside any
# of the subfolders:
#   ('<id1>' in parents or '<id2>' in parents ...) and mimeType = 'text/csv'
if not data_subfolder_ids:
    # An empty list would otherwise produce the malformed query "( in parents)..."
    raise ValueError("data_subfolder_ids is empty; there are no subfolders to search for CSV files")

parent_clauses = [f"'{folder_id}' in parents" for folder_id in data_subfolder_ids]

# Note the space before "and": the original glued it straight onto the closing parenthesis
query = f"({' or '.join(parent_clauses)}) and mimeType = 'text/csv'"

In [None]:
# List every CSV file in the data subfolders. Drive returns results one page at a
# time, so keep following list_next() until it reports that no pages remain (None).
data_file_ids = []
request = service.files().list(q=query)
while request is not None:
    response = request.execute()
    data_file_ids.extend(response.get('files', []))
    request = service.files().list_next(request, response)

if not data_file_ids:
    raise ValueError("No CSV files matched the Drive query; nothing to load")

# Download each CSV and parse it on its own, then stack the resulting frames.
# Parsing per file (instead of stripping headers and gluing the raw bytes together):
#   * lets pandas align columns by header name, so the files no longer have to
#     share an identical column layout;
#   * cannot fuse the last row of one file with the first row of the next when a
#     file lacks a trailing newline;
#   * avoids repeatedly copying one ever-growing bytes object.
# Note that it would be more efficient long-term to ZIP all CSVs into one file
# and then pull that down alone.
state_frames = []
for file_info in data_file_ids:
    csv_bytes = service.files().get_media(fileId=file_info['id']).execute()
    # Read the UTF-8 bytes directly; the old utf8 -> latin-1 round trip raised on
    # any character outside Latin-1.
    state_frames.append(
        pd.read_csv(BytesIO(csv_bytes), encoding='utf8', low_memory=False)
    )

df = pd.concat(state_frames, ignore_index=True, sort=False)

df.info()

### All Together Now!

In [2]:
%%time
# Same Drive pull as the step-by-step cells above, using the project helper
# pull_ppp_data (its implementation is not shown in this notebook).
# NOTE(review): this import sits mid-notebook rather than in the imports cell, and
# presumably only resolves because of the earlier os.chdir("..") to the repo root — confirm.
from src.data.make_dataset import pull_ppp_data

# NOTE(review): local_copy is an absolute, machine-specific path; presumably it names a
# local CSV copy of the raw data — verify against pull_ppp_data before changing it.
df = pull_ppp_data(local_copy='/home/jovyan/work/data/raw/unprocessed_ppp_data.csv')
df.info()

100%|██████████| 58/58 [02:23<00:00,  2.47s/it]
  call = lambda f, *a, **k: f(*a, **k)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4885388 entries, 0 to 4885387
Data columns (total 16 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   LoanRange      object
 1   BusinessName   object
 2   Address        object
 3   City           object
 4   State          object
 5   Zip            object
 6   NAICSCode      object
 7   BusinessType   object
 8   RaceEthnicity  object
 9   Gender         object
 10  Veteran        object
 11  NonProfit      object
 12  JobsRetained   object
 13  DateApproved   object
 14  Lender         object
 15  CD             object
dtypes: object(16)
memory usage: 596.4+ MB
CPU times: user 2min 55s, sys: 55.3 s, total: 3min 50s
Wall time: 8min 14s


In [3]:
# memory_usage='deep' makes pandas measure the actual contents of object (string)
# columns instead of the shallow estimate that the default df.info() reports
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4885388 entries, 0 to 4885387
Data columns (total 16 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   LoanRange      object
 1   BusinessName   object
 2   Address        object
 3   City           object
 4   State          object
 5   Zip            object
 6   NAICSCode      object
 7   BusinessType   object
 8   RaceEthnicity  object
 9   Gender         object
 10  Veteran        object
 11  NonProfit      object
 12  JobsRetained   object
 13  DateApproved   object
 14  Lender         object
 15  CD             object
dtypes: object(16)
memory usage: 3.9 GB


In [4]:
# Sanity check on State: CA has 87,689 rows, but the most frequent "states" in the
# output are numeric codes (they look like NAICS codes — TODO confirm), so some rows
# appear to be column-shifted
df['State'].value_counts()

722511.0    135496
531210.0    110875
541110.0    107600
621210.0     88677
CA           87689
             ...  
311224           1
485113           1
311225           1
335220           1
314991.0         1
Name: State, Length: 2170, dtype: int64