In [1]:
import pandas as pd
import requests
import json
from pathlib import Path
from datetime import datetime

In [2]:
tokens = pd.read_csv('tokens.csv', index_col=0)

In [3]:
env = 'qa'
# env = 'qa-b2b'

In [4]:
DOMAIN = tokens.loc[env]['domain']

headers = {
    "Accept": "application/json",
    "Content-Type": "application/json",
    "Authorization": "Token " + tokens.loc[env]['token']
}

# Profiles

## Create profile

In [5]:
sample_profiles = pd.read_csv('sample_profiles.csv').to_dict('records')
sample_profiles[0]

{'name': 'Sample array_file1',
 'ethnicity': 'White or European',
 'sex': 'Male',
 'birth_year': 1984,
 'is_v2': True}

In [6]:
url = f"{DOMAIN}/service/profiles/profile/"

for profile in sample_profiles:
    payload = profile

    response = requests.post(url, json=payload, headers=headers)
    
json.loads(response.text)

{'id': '454109a2-197e-4bfe-af7a-f1b617b4073b',
 'user_id': '7ad00679-a136-4b7c-975a-80c099e258f5',
 'client_user_id': None,
 'name': 'Sample for_deletion',
 'ethnicity': 'East Asian',
 'secondary_ethnicity': None,
 'sex': 'Female',
 'birth_day': None,
 'birth_month': None,
 'birth_year': 1990,
 'has_data_missing': False,
 'dna_sample_id': None,
 'is_v2': True,
 'created_at': 1666796975.155713,
 'email': '',
 'height': None,
 'weight': None,
 'created_by_migration': False,
 'questionnaire_invites': []}

In [7]:
# """Used for creating a profile before running the whole notebook"""
# url = f"{DOMAIN}/service/profiles/profile/"

# payload = {
#     'name': 'Sample regular_file',
#     'ethnicity': 'Ashkenazi Jewish',
#     'sex': 'Male',
#     'birth_year': 1984,
#     'is_v2': True
# }

# response = requests.post(url, json=payload, headers=headers)
    
# profile_id = json.loads(response.text)['id']
# filename = "genome_sample1.zip"
# # filename = "genome_sample1.txt"

## List profiles in account

In [8]:
url = f"{DOMAIN}/service/profiles/profile/"
response = requests.get(url, headers=headers)

profiles = json.loads(response.text)
profiles[0]

{'id': '4d8b2526-fe5f-4403-a884-8c8f121727db',
 'user_id': '7ad00679-a136-4b7c-975a-80c099e258f5',
 'client_user_id': None,
 'name': 'Sample regular_file',
 'ethnicity': 'Ashkenazi Jewish',
 'secondary_ethnicity': None,
 'sex': 'Male',
 'birth_day': None,
 'birth_month': None,
 'birth_year': 1984,
 'has_data_missing': False,
 'dna_sample_id': None,
 'is_v2': True,
 'created_at': 1666794271.976844,
 'email': '',
 'height': None,
 'weight': None,
 'created_by_migration': False,
 'questionnaire_invites': []}

In [9]:
profiles_df = pd.json_normalize(profiles)
profiles_df

Unnamed: 0,id,user_id,client_user_id,name,ethnicity,secondary_ethnicity,sex,birth_day,birth_month,birth_year,has_data_missing,dna_sample_id,is_v2,created_at,email,height,weight,created_by_migration,questionnaire_invites
0,4d8b2526-fe5f-4403-a884-8c8f121727db,7ad00679-a136-4b7c-975a-80c099e258f5,,Sample regular_file,Ashkenazi Jewish,,Male,,,1984,False,,True,1666794000.0,,,,False,[]
1,3a48d34f-e9fa-4fdd-bbbc-47f933b37a7a,7ad00679-a136-4b7c-975a-80c099e258f5,,Sample array_file1,White or European,,Male,,,1984,False,,True,1666797000.0,,,,False,[]
2,6ff18ffd-aba5-44d4-a429-6739c10cf154,7ad00679-a136-4b7c-975a-80c099e258f5,,Sample array_file2,Ashkenazi Jewish,,Female,,,1981,False,,True,1666797000.0,,,,False,[]
3,5cd2279b-7fba-4995-a340-4b5fa72dfd0b,7ad00679-a136-4b7c-975a-80c099e258f5,,Sample LPGS,"Latino (e.g. Mexican, Peruvian, Colombian)",,Male,,,1974,False,,True,1666797000.0,,,,False,[]
4,454109a2-197e-4bfe-af7a-f1b617b4073b,7ad00679-a136-4b7c-975a-80c099e258f5,,Sample for_deletion,East Asian,,Female,,,1990,False,,True,1666797000.0,,,,False,[]


## Delete profile

In [10]:
profile_id = profiles_df.loc[profiles_df['name'] == 'Sample for_deletion', 'id'].item()

url = f"{DOMAIN}/service/profiles/profile/{profile_id}"

response = requests.delete(url, headers=headers)
response

<Response [204]>

In [11]:
# """DELETE ALL PROFILES except"""
# for index, profile in profiles_df.iterrows():
#     profile_id = profile['id']
#     if profile_id == '4d8b2526-fe5f-4403-a884-8c8f121727db':
#         continue
#     url = f"{DOMAIN}/service/profiles/profile/{profile_id}"
#     response = requests.delete(url, headers=headers)

# Genome Files
## Upload genotype file

In [12]:
profile_id = profiles_df.loc[profiles_df['name'] == 'Sample array_file1', 'id'].item()
filename = "genome_sample1.txt"

print(f'array file size: {Path(filename).stat().st_size/1024/1024:.2f} MB')

array file size: 14.68 MB


### 1. Create a genome file using the profile id

In [13]:
regular_file_start_dt = datetime.now()

In [14]:
url = f"{DOMAIN}/service/genome-files/file/"
payload = {"profile_id": profile_id}

response = requests.post(url, json=payload, headers=headers)

file_id = response.json()["id"]
file_id

'c20bfe20-6c81-4e33-8c4d-1b7e770570f4'

### 2. Create a pre-signed URL using the genome file id

In [15]:
url = f"{DOMAIN}/service/genome-files/upload-url/"

payload = {
    "filename": filename,
    "genome_file": file_id
}

response = requests.post(url, json=payload, headers=headers)
upload_id = response.json()["id"]
upload_url = response.json()["upload_url"]
print(upload_id)
print(upload_url)

2eb6d457-0765-4c7a-a874-d90909e470d9
https://sd-platform-staging-userdatas3bucket-1vuokazil28t9.s3.amazonaws.com/7ad00679-a136-4b7c-975a-80c099e258f5/genome-files/c20bfe20-6c81-4e33-8c4d-1b7e770570f4/original_c20bfe20_221026150946.txt?AWSAccessKeyId=ASIA24LREHUCVBLWWAMP&Signature=znYss2IiBilLMmEaPgr7Qfnyrm4%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEBcaCXVzLWVhc3QtMSJGMEQCICvFfBqI7U2W6XgmgyT925cwW8V6KxqNeEy16tqXUiD%2BAiBEkSFo7FLOxEST5XvClzPK59XplKF85TGDKUy7W%2FwqFSrOAwjw%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAMaDDc0ODA5ODQzNjM1NyIMsBHtsGI3r48stkLQKqIDTNECu9NMncs7F%2B1pQhv%2B%2BpNhqJUaajscA%2FBscXil5a%2BMZPi6Tm76FSA4WKJceBAPfAPm9iM2SM8URSs47On5zDAlxF6rhmnyjTHkfkeU5hTZNzS%2F0Uzs6Zmj4qwRDLCaD5SsKcM313ThgbPuoFfT6r4Q1AH0bdn9AtJxlNu2s9wsxyoduSx6IPk2iJJZ%2FveVcQ%2Bl6VNINRkQzbTbnRrhby7h5SEZZ1P6WJKt5lVsAYxx%2B9glnmxN%2BtgJW8EjLthVnTj0C1fUjw%2BPIxUetrgTfKEeYifKPlq%2BMyOAw%2B8a4%2B8nygoENJ0ECOeYw%2BoygEktF6i30G74d6M32y2Nq42daU%2BcWnCiRbB0nd7A14gJpj5M1Nu%2Bx4UQOAwXHMJpeOyPzpli%2BlbTdLVe%2BAxZr7fVDcg5uDmt1

### 3. Upload the file to the pre-signed URL

In [16]:
def upload_file(filename, url):
    """Upload a file to the presigned url"""
    with open(filename, "rb") as f:
        response = requests.put(url, data=f)
    response.raise_for_status()

In [17]:
upload_file(filename, upload_url)

### 4. Notify the service the upload is complete

In [18]:
url = f"{DOMAIN}/service/genome-files/upload-url/{upload_id}/"

response = requests.delete(url, headers=headers)
response

<Response [204]>

### 5. Start a scan job to process the file

In [19]:
url = f"{DOMAIN}/service/genome-files/scan-job/"

payload = {"genome_file": file_id}

response = requests.post(url, json=payload, headers=headers)
scan_id = response.json()["id"]

In [20]:
print(f"Started job {scan_id} for profile {profile_id}.")

Started job f1b225a2-7188-4a39-b8e7-e9f8d38d1936 for profile 3a48d34f-e9fa-4fdd-bbbc-47f933b37a7a.


In [21]:
regular_file_end_dt = datetime.now()
print(f"Entire upload process took: {str(regular_file_end_dt - regular_file_start_dt)}")

Entire upload process took: 0:00:16.319587


## Upload raw reads files (fastq)

In [22]:
profile_id = profiles_df.loc[profiles_df['name'] == 'Sample LPGS', 'id'].item()

LPGS_folder = r'C:\Users\leo_h\Downloads\LPGS\\'
file_fwd = 'LPGS_test_1.fq.gz'
file_rev = 'LPGS_test_2.fq.gz'

forward_reads = Path(LPGS_folder + file_fwd)
reverse_reads = Path(LPGS_folder + file_rev)

print(f'forward_reads file size: {forward_reads.stat().st_size/1024/1024/1024:.2f} GB')
print(f'reverse_reads file size: {reverse_reads.stat().st_size/1024/1024/1024:.2f} GB')

forward_reads file size: 3.02 GB
reverse_reads file size: 2.99 GB


### 1. Create a genome file using the profile id

In [23]:
rawreads_files_start_dt = datetime.now()

In [24]:
url = f"{DOMAIN}/service/genome-files/file/"
payload = {"profile_id": profile_id}

response = requests.post(url, json=payload, headers=headers)

file_id = response.json()["id"]
file_id

'462bac9a-7b38-4be5-ab60-69b90ab9575c'

### 2. Create a pre-signed URLs using the genome file id
1st URL for forward reads file, 2nd URL for reverse reads file

In [25]:
url = f"{DOMAIN}/service/genome-files/upload-raw-reads/"

payload = {
    "genome_file": file_id,
    "filename": file_fwd,
    "filename_reverse": file_rev
}

response = requests.post(url, json=payload, headers=headers)
upload_id = response.json()["id"]
upload_urls = response.json()["upload_url"]
print(upload_id)
print(upload_urls)

e8b139b5-dbc7-4ade-89e6-3478fee3a33a
['https://sd-platform-staging-userdatas3bucket-1vuokazil28t9.s3-accelerate.amazonaws.com/7ad00679-a136-4b7c-975a-80c099e258f5/genome-files/462bac9a-7b38-4be5-ab60-69b90ab9575c/1_original_462bac9a_221026151002.fq.gz?AWSAccessKeyId=ASIA24LREHUCVBLWWAMP&Signature=acKwSUTO9SMiIVe7wSVZRdAGuDU%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEBcaCXVzLWVhc3QtMSJGMEQCICvFfBqI7U2W6XgmgyT925cwW8V6KxqNeEy16tqXUiD%2BAiBEkSFo7FLOxEST5XvClzPK59XplKF85TGDKUy7W%2FwqFSrOAwjw%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAMaDDc0ODA5ODQzNjM1NyIMsBHtsGI3r48stkLQKqIDTNECu9NMncs7F%2B1pQhv%2B%2BpNhqJUaajscA%2FBscXil5a%2BMZPi6Tm76FSA4WKJceBAPfAPm9iM2SM8URSs47On5zDAlxF6rhmnyjTHkfkeU5hTZNzS%2F0Uzs6Zmj4qwRDLCaD5SsKcM313ThgbPuoFfT6r4Q1AH0bdn9AtJxlNu2s9wsxyoduSx6IPk2iJJZ%2FveVcQ%2Bl6VNINRkQzbTbnRrhby7h5SEZZ1P6WJKt5lVsAYxx%2B9glnmxN%2BtgJW8EjLthVnTj0C1fUjw%2BPIxUetrgTfKEeYifKPlq%2BMyOAw%2B8a4%2B8nygoENJ0ECOeYw%2BoygEktF6i30G74d6M32y2Nq42daU%2BcWnCiRbB0nd7A14gJpj5M1Nu%2Bx4UQOAwXHMJpeOyPzpli%2BlbTdLVe%2

### 3. Upload the file to the pre-signed URL

In [26]:
"""Previously defined function"""
upload_file(forward_reads, upload_urls[0])
print(f"{file_fwd} upload finished")
upload_file(reverse_reads, upload_urls[1])
print(f"{file_rev} upload finished")

LPGS_test_1.fq.gz upload finished
LPGS_test_2.fq.gz upload finished


### 4. Notify the service the upload is complete

In [27]:
url = f"{DOMAIN}/service/genome-files/upload-raw-reads/{upload_id}/"

response = requests.delete(url, headers=headers)

result = response.json()
if not result["did_succeed"]:
    raise Exception("One or both files did not upload, please try again.")

gb_ = 1024 * 1024
forward_size = result["forward_reads"]["size"] / gb_
reverse_size = result["reverse_reads"]["size"] / gb_
print(f"{file_fwd}: {forward_size:.2f} GB uploaded")
print(f"{file_rev}: {reverse_size:.2f} GB uploaded")

LPGS_test_1.fq.gz: 3091.25 GB uploaded
LPGS_test_2.fq.gz: 3064.02 GB uploaded


### 5. Start a scan job to process the file

In [28]:
url = f"{DOMAIN}/service/genome-files/scan-job/"

payload = {"genome_file": file_id}

response = requests.post(url, json=payload, headers=headers)
scan_id = response.json()["id"]

In [29]:
print(f"Started job {scan_id} for profile {profile_id}.")

Started job ee1408cc-af85-406d-b7df-bd58ad50c25e for profile 5cd2279b-7fba-4995-a340-4b5fa72dfd0b.


In [30]:
rawreads_files_end_dt = datetime.now()
print(f"Entire upload process took: {str(rawreads_files_end_dt - rawreads_files_start_dt)}")

Entire upload process took: 0:32:05.368324


## List all files uploaded to account
Used in case you need to look for the file_id for a give profile_id

In [31]:
url = f"{DOMAIN}/service/genome-files/file/"

response = requests.get(url, headers=headers)
file_ids = json.loads(response.text)
file_ids[0]

{'id': 'f90cabc5-e0f5-4450-b8e7-3af95ef90a5b',
 'profile_id': '4d8b2526-fe5f-4403-a884-8c8f121727db',
 'provider': 'DIRECT_UPLOAD',
 'chipset': '61b966aa6b330f5179d497d5b6641685',
 'status': 'COMPLETED',
 'athena_status': 'INGESTED',
 'last_scan_started_at': '2022-10-26T14:25:08.495476Z',
 'last_scan_ended_at': '2022-10-26T14:49:08.713133Z',
 'file_processing_version': '2.00',
 'has_download_url': True,
 'download_url': '/service/genome-files/file/f90cabc5-e0f5-4450-b8e7-3af95ef90a5b/download/',
 'upload_provider': None,
 'is_shared': False}

In [32]:
file_ids_df = pd.json_normalize(file_ids)
file_ids_df

Unnamed: 0,id,profile_id,provider,chipset,status,athena_status,last_scan_started_at,last_scan_ended_at,file_processing_version,has_download_url,download_url,upload_provider,is_shared
0,f90cabc5-e0f5-4450-b8e7-3af95ef90a5b,4d8b2526-fe5f-4403-a884-8c8f121727db,DIRECT_UPLOAD,61b966aa6b330f5179d497d5b6641685,COMPLETED,INGESTED,2022-10-26T14:25:08.495476Z,2022-10-26T14:49:08.713133Z,2.0,True,/service/genome-files/file/f90cabc5-e0f5-4450-...,,False
1,c20bfe20-6c81-4e33-8c4d-1b7e770570f4,3a48d34f-e9fa-4fdd-bbbc-47f933b37a7a,DIRECT_UPLOAD,61b966aa6b330f5179d497d5b6641685,COMPLETED,NOT_STARTED,2022-10-26T15:09:57.450844Z,2022-10-26T15:33:13.639429Z,2.0,True,/service/genome-files/file/c20bfe20-6c81-4e33-...,,False
2,462bac9a-7b38-4be5-ab60-69b90ab9575c,5cd2279b-7fba-4995-a340-4b5fa72dfd0b,RAW_READS,,SCANNING,NOT_STARTED,2022-10-26T15:42:02.792915Z,,,True,/service/genome-files/file/462bac9a-7b38-4be5-...,,False


In [33]:
date_format = "%Y-%m-%dT%H:%M:%S.%fZ" 
for file in file_ids:
    if file['status'] == 'COMPLETED':
        start_time = datetime.strptime(file['last_scan_started_at'], date_format)
        end_time = datetime.strptime(file['last_scan_ended_at'], date_format)
        print(f"{file['provider']} type file scan took: {str(end_time - start_time)}")
    else:
        print(f"{file['provider']} type file status is {file['status']}")

DIRECT_UPLOAD type file scan took: 0:24:00.217657
DIRECT_UPLOAD type file scan took: 0:23:16.188585
RAW_READS type file status is SCANNING


- SNP array files take ~25 minutes to process
- LPGS raw reads files take ~5 hours to process