<a href="https://colab.research.google.com/github/jhatfi/colab/blob/main/get_folder_ids.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook finds the Google Drive IDs of the PB data files and produces a spreadsheet of them.

In [None]:
# it appears that this install is no longer needed
# !pip install google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client


In [None]:
from google.colab import auth
from googleapiclient.discovery import build

In [None]:
# Sign the Colab user in to Google; presumably the Drive client built in the
# next cell picks up these credentials by default — TODO confirm.
auth.authenticate_user()

In [None]:
# Client for the Google Drive API, version 3; every files().list() call in this
# notebook goes through it.
service = build('drive', 'v3')

In [None]:
def list_folders(folder_id, service):
    """Recursively list every subfolder beneath a Google Drive folder.

    The result is ordered depth-first: each direct child is immediately
    followed by all of its own descendants. The starting folder itself is
    not part of the result.

    Args:
        folder_id: Drive ID of the folder whose descendants are wanted.
        service: Drive API client exposing ``files().list(...).execute()``
            (for example the object returned by ``build('drive', 'v3')``).

    Returns:
        A list of ``{'id': ..., 'name': ...}`` dicts, one per subfolder
        found at any depth.
    """
    # Match only non-trashed folders that are direct children of folder_id.
    query = "mimeType='application/vnd.google-apps.folder' and trashed = false and parents in '"+folder_id+"'"
    folders = []
    page_token = None
    while True:
        # files().list() is paginated: a single response may hold only part of
        # the children, so keep requesting until no nextPageToken comes back.
        results = service.files().list(
            q=query,
            fields="nextPageToken, files(id, name)",
            pageToken=page_token,
        ).execute()
        for item in results.get('files', []):
            folders.append({'id': item['id'], 'name': item['name']})
            # Descend right away so a child's subtree follows the child itself.
            folders.extend(list_folders(item['id'], service))
        page_token = results.get('nextPageToken')
        if not page_token:
            break
    return folders

In [None]:
# Base folder: FactGrid "15RC0mYEaXAk1xYaP8nDMq_82KbkZtNMs"
# (I found that folder ID using the web interface)
# list_folders returns only the descendants of the given folder, never the
# folder itself.
folders = list_folders('15RC0mYEaXAk1xYaP8nDMq_82KbkZtNMs', service)
# Second tree, walked separately so that its folders can be excluded from the
# CSV search later in the notebook.
deprecated_folder = list_folders('1WIlpJLUZivSDI7UG_xv2_hwXg7kn3NbB', service)


In [None]:
# Sanity check: number of subfolders found beneath the base folder.
len(folders)


138

In [None]:
# modify this extraction date to match the most recent
EXTRACTION_DATE='2024-09-15'

# omitting deprecated CSV folders
# NOTE(review): deprecated_folder holds only the descendants of the deprecated
# root, so the root's own ID is not filtered out here. If that root sits under
# the base folder it is still searched — confirm that this is intended.
deprecated_ids = [entry['id'] for entry in deprecated_folder]
parent_ids = [entry['id'] for entry in folders if entry['id'] not in deprecated_ids]

# One "parents in '<id>'" term per remaining folder, OR-ed together.
parents_clause = ' or '.join(f"parents in '{pid}'" for pid in parent_ids)

# CSV files whose name carries the extraction date, not trashed, located
# directly inside any of the remaining folders.
query = (
    f"name contains '{EXTRACTION_DATE}'"
    " and mimeType='text/csv' and trashed = false"
    f" and ({parents_clause})"
)
query

"name contains '2024-09-15' and mimeType='text/csv' and trashed = false and (parents in '1BHaaOlt0DOvEclzXLePXQxP7TE-u8IMt' or parents in '1ov6yTdnM4OtwiRR13NYbVjZnA9Kci7pn' or parents in '1FRZSgOHLFQuazg2KhlUrXKa3nUG4HgL4' or parents in '10EENaYaT3llQMNTv8tCkpBxmi83cUZAB' or parents in '11E-XbIWF6IF2PavqoEcKlAQhCmNGmst-' or parents in '1pV-Y35hTKSjwPKjtSIXh7lbyHhmjNEDX' or parents in '1wfT2qKHhNuQVougqg3EPUWhDhgcEJKTW' or parents in '1mkXz52FgJfLkA48LP2b-Dv3O5nYcMbv3' or parents in '1lJnt6NsGqSxZNfkRz86e_MkDDkzSuuC8' or parents in '1IDmqANdeS-RAYOmwhw657irWsUKnU_YB' or parents in '1hADE1K1c8sUBJErZpG9_vJrUKtPxDBOF' or parents in '12WbvJ36xGt2YiNzjKjZR7vLNtEy3Bjmw' or parents in '1D0r5nSjYGLV7CEo2FveV6QlEpQtsp8By' or parents in '1eWJXN5qDEJi8nG0Gnvg1gRnKYkKijAyh' or parents in '1EOdFKc7NRetEc4Kuh3tL6T8p0RD4Cicv' or parents in '1FK-M5xAyYLhKqA2lvZeA5DCHqezVlkC0' or parents in '1OfM_k-wEFGYOVx7BUJgsNixmxbSN8o9q' or parents in '12cOeaJTRywzbgZwOLCOVnpdMUSXUruc6' or parents in '1D7QwFbqKs1

In [None]:
# files().list() is paginated, and with a long query a response may carry only
# part of the matches (or none) together with a nextPageToken. Follow the
# token until it is absent so that every matching CSV is collected.
items = []
page_token = None
while True:
    results = service.files().list(
        q=query,
        fields="nextPageToken, files(id, name, mimeType)",
        pageToken=page_token,
    ).execute()
    items.extend(results.get('files', []))
    page_token = results.get('nextPageToken')
    if not page_token:
        break
items

[{'mimeType': 'text/csv',
  'id': '1a9Fbj_88-MOOR28AdtsE_sbaGbsQsRC1',
  'name': 'BETA - SUBJECT - 2024-09-15.CSV'}]

In [None]:
# Number of CSV files returned by the request above.
len(items)

1

In [None]:
import pandas as pd

In [None]:
# One row per Drive file; the columns are the keys requested via `fields`
# in the files().list() call (id, name, mimeType).
df = pd.DataFrame(items)

In [None]:
# Inspect the raw Drive listing before any classification columns are added.
df

Unnamed: 0,mimeType,id,name
0,text/csv,1a9Fbj_88-MOOR28AdtsE_sbaGbsQsRC1,BETA - SUBJECT - 2024-09-15.CSV


In [None]:
def extract_bib(s):
  """Return the bibliography name found in ``s``, or None if there is none.

  The recognised names are 'BETA', 'BITECA' and 'BITAGAP', matched as plain
  substrings. Should several occur, the one listed last in that sequence is
  returned.
  """
  # Scanning in reverse and stopping at the first hit gives the same answer as
  # scanning forward and remembering the last hit.
  candidates = ('BITAGAP', 'BITECA', 'BETA')
  return next((name for name in candidates if name in s), None)

def extract_table(s):
  """Return the three-letter table code for the table named in ``s``.

  Each table is recognised either by its long name (e.g. 'ANALYTIC') or by
  its short code (e.g. 'ANA'), matched as plain substrings. Should several
  tables match, the one listed last wins. Returns None when nothing matches.
  """
  table_names = (
      ('ANALYTIC', 'ANA'), ('BIBLIOGRAPHY', 'BIB'),
      ('BIOGRAPHY', 'BIO'), ('COPIES', 'COP'),
      ('GEOGRAPHY', 'GEO'), ('INSTITUTIONS', 'INS'),
      ('LIBRARY', 'LIB'), ('MS_ED', 'MAN'),
      ('SUBJECT', 'SUB'), ('UNIFORM_TITLE', 'UNI'),
  )
  # Walk the list backwards so the first hit is the last-listed matching table.
  for names in reversed(table_names):
    if any(label in s for label in names):
      return names[-1]
  return None



In [None]:
def classify_pb_csv(name):
  """Classify a PB CSV file name as a ``(type, bib, tab)`` triple.

  * names containing 'Data Dictionary' -> ('datadict', 'ALL', <table code>)
  * names containing 'DATACLIPS'       -> ('dataclip', <bibliography>, 'ALL')
  * anything else                      -> ('table', <bibliography>, <table code>)

  The bibliography and table code come from ``extract_bib`` and
  ``extract_table``; either may be None when the name is not recognised.
  """
  # Data dictionaries are checked first; they are not tied to one bibliography.
  if 'Data Dictionary' in name:
    return ('datadict', 'ALL', extract_table(name))

  bib = extract_bib(name)
  # A dataclips file is not tied to one table.
  if 'DATACLIPS' in name:
    return ('dataclip', bib, 'ALL')

  return ('table', bib, extract_table(name))



In [None]:
# Spot-check: an ordinary table export.
classify_pb_csv('BETA - ANALYTIC - 2022-12-28.CSV')

('table', 'BETA', 'ANA')

In [None]:
# Spot-check: a dataclips file (table component reported as 'ALL').
classify_pb_csv('BETA - DATACLIPS - 2022-12-28.CSV')

('dataclip', 'BETA', 'ALL')

In [None]:
# Spot-check: a data dictionary file (bibliography component reported as 'ALL').
classify_pb_csv('Data Dictionary - BIO - 2022-12-28.csv')

('datadict', 'ALL', 'BIO')

In [None]:
def classify_pb_csv_row(row):
  """Return a copy of ``row`` extended with 'type', 'bib' and 'tab' entries.

  Intended for ``DataFrame.apply(..., axis=1)``: the values are derived from
  ``row['name']`` via ``classify_pb_csv``.

  Args:
    row: mapping-like record (e.g. a pandas Series) that has a 'name' entry
      and supports ``.copy()``.

  Returns:
    A new record with the original entries plus 'type', 'bib' and 'tab'.
  """
  t, bib, tab = classify_pb_csv(row['name'])
  # pandas does not support functions that mutate the object passed in by
  # apply(), so work on a copy rather than on the caller's row.
  row = row.copy()
  row['type'] = t
  row['bib'] = bib
  row['tab'] = tab
  return row



In [None]:
# Classify each file row by row; adds the 'type', 'bib' and 'tab' columns.
df = df.apply(classify_pb_csv_row, axis=1)

In [None]:
# Check the new columns: a None in 'bib' or 'tab' means the file name was not
# recognised by extract_bib / extract_table.
df

Unnamed: 0,mimeType,id,name,type,bib,tab
0,text/csv,1a9Fbj_88-MOOR28AdtsE_sbaGbsQsRC1,BETA - SUBJECT - 2024-09-15.CSV,table,BETA,SUB


In [None]:
# Keep only the reporting columns, classification first, and order the rows
# by those same columns.
cols = ['type', 'bib', 'tab', 'id', 'name']
df = df[cols].sort_values(by=cols)

In [None]:
# Final table: columns reordered and rows sorted.
df


Unnamed: 0,type,bib,tab,id,name
0,table,BETA,SUB,1a9Fbj_88-MOOR28AdtsE_sbaGbsQsRC1,BETA - SUBJECT - 2024-09-15.CSV


In [None]:
# Emit the table as CSV text without the index column; presumably this is the
# text that gets pasted into the spreadsheet — confirm.
print(df.to_csv(None, index=False))

type,bib,tab,id,name
table,BETA,SUB,1a9Fbj_88-MOOR28AdtsE_sbaGbsQsRC1,BETA - SUBJECT - 2024-09-15.CSV

