<a href="https://colab.research.google.com/github/jhatfi/colab/blob/main/qs_file_splitter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Script to break up QuickStatements files into manageable chunks for importing into FactGrid
import io
import os
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from googleapiclient.http import MediaFileUpload

# Authenticate the current Colab user before building the Drive client
# (presumably so the client below picks up that user's credentials — confirm).
auth.authenticate_user()
print('Authenticated')

# Module-level Drive v3 API client; the helper functions in this notebook
# read it as the global `service`.
service = build('drive', 'v3')

Authenticated


In [None]:
# Google Drive folder IDs, keyed by bibliography name ('beta', 'biteca', 'bitagap').
# Empty strings are presumably placeholders for folders not configured yet — TODO confirm.

# Folders searched for the full QuickStatements (.qs) file to split
QS_FILES_SOURCE = dict(
    beta='1efJDT_HJoIsrRBw1bySuIke6n3xaSTyt',
    biteca='',
    bitagap='',
)

# Folders that receive the split files once they have been written
SPLIT_FILES_STAGING = dict(
    beta='187KTNwJ2LZXf5d8WAFO1qnP6g8Yqderw',
    biteca='',
    bitagap='',
)


In [None]:
def find_file_id(folder_id, table):
    """Locate the QuickStatements file for ``table`` inside a Drive folder.

    Lists the non-folder children of ``folder_id`` through the module-level
    Drive ``service`` and returns the first file whose name ends with ``.qs``
    and contains ``table``.

    Args:
        folder_id: Drive ID of the folder to search.
        table: Substring that must occur in the file name (e.g. ``'subject'``).

    Returns:
        A ``(file_name, file_id)`` tuple for the first match, or ``None`` when
        no file in the folder matches.
    """
    query = f"'{folder_id}' in parents and not mimeType='application/vnd.google-apps.folder'"
    page_token = None
    while True:
        # files().list() is paginated: keep following nextPageToken so a match
        # that is not on the first page of a large folder is still found.
        results = service.files().list(
            q=query,
            fields="nextPageToken, files(id, name)",
            pageToken=page_token,
        ).execute()

        for item in results.get('files', []):
            name = item['name']
            if name.endswith('.qs') and table in name:
                return name, item['id']

        page_token = results.get('nextPageToken')
        if not page_token:
            # Every page was scanned without a match.
            return None

In [None]:
def download_file(file_name, file_id):
    """Download a Drive file into the current working directory.

    The content is fetched through the module-level Drive ``service`` into an
    in-memory buffer and only written to disk once the download has finished.

    Args:
        file_name: Name to save the file under; joined onto ``os.getcwd()``.
        file_id: Drive ID of the file to fetch. A falsy value (``''``,
            ``None``, empty list) means there is nothing to download.

    Returns:
        The full path of the saved file, or ``None`` when ``file_id`` is
        falsy and nothing was downloaded.
    """
    if not file_id:
        # Stop here: without a download there is no buffer to save, and
        # falling through would fail on the undefined buffer.
        print(f'No file found for {file_name}')
        return None

    print(f'Downloading file: {file_name} with id: {file_id}')
    request = service.files().get_media(fileId=file_id)
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))

    # Save the downloaded bytes next to the notebook (current working directory)
    file_path = os.path.join(os.getcwd(), file_name)
    with open(file_path, 'wb') as f:
        f.write(buffer.getvalue())

    print(f'File downloaded to: {file_path}')
    return file_path

In [None]:
def move_file(destination_id, file_name):
    """Upload a local file into a Drive folder.

    The file is created directly inside ``destination_id`` by passing the
    folder as ``parents`` in the create request. This replaces the previous
    upload-to-root / look-up-parents / re-parent sequence, so a failure can no
    longer leave a stray copy in the Drive root and only one API call is made.

    Args:
        destination_id: Drive ID of the folder that should contain the file.
        file_name: Path of the local file to upload; its basename becomes the
            Drive file name.

    Returns:
        The Drive ID of the newly created file.
    """
    file_metadata = {
        'name': os.path.basename(file_name),
        'parents': [destination_id],
    }
    media = MediaFileUpload(file_name)  # Adjust mimetype if needed
    created = service.files().create(body=file_metadata,
                                     media_body=media,
                                     fields='id').execute()
    file_id = created['id']

    print(f'File {file_name} copied successfully! File ID: {file_id}')
    return file_id

In [None]:
def find_unique_items(unique_items, update_file):
    """Write every statement row belonging to ``unique_items`` and stage the file.

    Reads the module-level ``original_lines`` and keeps, in their original
    order, the rows that start with ``'Q'`` and whose first tab-separated
    field is in ``unique_items``. The rows are written to ``update_file``,
    which is then handed to ``move_file`` with the staging folder configured
    for the module-level ``bib``.

    Args:
        unique_items: Collection of item identifiers to keep.
        update_file: Path of the split file to create.
    """
    print(update_file)

    # Keep all rows of the selected items, wherever they occur in the source
    selected_rows = [
        row
        for row in original_lines
        if row.startswith('Q') and row.split('\t')[0] in unique_items
    ]

    # Write the split file
    with open(update_file, 'w') as out:
        out.writelines(selected_rows)
        print(f'File {update_file} created successfully!')

    # Send the split file to the staging folder
    move_file(SPLIT_FILES_STAGING[bib], update_file)

In [None]:
# Table to be split
table = 'subject' #['uniform_title','analytic', 'biography', 'library', 'copies', 'ms_ed', 'institutions', 'geography', 'bibliography', 'subject']
bib = 'beta' #['beta', 'bitagap', 'biteca']

# Maximum number of distinct Q items written to one split file
CHUNK_SIZE = 250


def _split_file_name(number):
    """Return the name of the split file with the given sequence number."""
    return f'split_{bib}_{table}_qs_{number}.qs'


# Find and download the file to be split; fail with a clear message when the
# source folder has no matching .qs file instead of an unpacking TypeError.
source_file = find_file_id(QS_FILES_SOURCE[bib], table)
if source_file is None:
    raise FileNotFoundError(
        f"No .qs file containing '{table}' was found in the '{bib}' source folder")
file_name, file_id = source_file
download_file(file_name, file_id)

# Read the file that was actually downloaded (its Drive name is not
# guaranteed to be exactly '<bib>_<table>.qs').
# `original_lines` must stay a module-level name: find_unique_items reads it.
with open(file_name, 'r') as file:
    original_lines = file.readlines()

# Collect items in batches of CHUNK_SIZE. find_unique_items writes *every* row
# of the batched items, including rows that appear later in the file, so an
# item that has already been exported must never be added to a later batch.
exported_items = set()
unique_items = set()
file_number = 0

for line in original_lines:
    if not line.startswith('Q'):
        continue
    item = line.split('\t')[0]
    if item in exported_items:
        # All rows of this item are already in an earlier split file
        continue
    unique_items.add(item)
    if len(unique_items) == CHUNK_SIZE:
        find_unique_items(unique_items, _split_file_name(file_number))
        file_number += 1
        exported_items.update(unique_items)
        unique_items.clear()

# Flush the remaining items, if any. Skipping an empty remainder avoids
# writing and uploading an empty split file.
if unique_items:
    find_unique_items(unique_items, _split_file_name(file_number))


print('File splits completed successfully')


Downloading file: beta_subject.qs with id: 1U7TRy1rg7wF-cWEONUZGgbN0dwsaegmx
Download 100%.
File downloaded to: /content/beta_subject.qs
split_beta_subject_qs_0.qs
File split_beta_subject_qs_0.qs created successfully!
File split_beta_subject_qs_0.qs copied successfully! File ID: 1u55LSXqfIV0qvakug8_dfrgHVcOG42Dm
Last item Q43189 is in the set
split_beta_subject_qs_1.qs
File split_beta_subject_qs_1.qs created successfully!
File split_beta_subject_qs_1.qs copied successfully! File ID: 1Tp3v-uHSPL9P2mkyRFRB8HLbh92R99x_
File splits compeleted successfully
