In [36]:
import os

from sqlalchemy import create_engine
import pandas as pd
from dotenv import load_dotenv, find_dotenv

DOTENV_PATH = find_dotenv()
if DOTENV_PATH:
    load_dotenv(DOTENV_PATH)

os.environ
dataservice_db_url = os.getenv("KF_DATASERVICE_DB_URL")
db_user = os.getenv("DB_USER")
db_pass = os.getenv("DB_GATE")

url = dataservice_db_url.format(DB_USER=db_user,
                                DB_GATE=db_pass)

engine = create_engine(url)

In [37]:
portal_studies_df = pd.read_table("studies_on_portal.tsv")
study_list = portal_studies_df['studies_on_portal'].to_list()
study_list_str = ','.join([f"'{item}'" for item in study_list])
study_list_str

"'SD_0TYVY1TW','SD_1P41Z782','SD_46RR9ZR6','SD_46SK55A3','SD_6FPYJQBR','SD_7NQ9151J','SD_8Y99QZJJ','SD_9PYZAHHE','SD_AQ9KVN5P','SD_B8X3C1MX','SD_BHJXBDQK','SD_DK0KRWK8','SD_DYPMEHHF','SD_DZ4GPQX6','SD_DZTB5HRR','SD_FFVQ3T38','SD_JWS3V24D','SD_NMVV8A1Y','SD_P445ACHV','SD_PET7Q6F2','SD_PREASA7S','SD_R0EPRSGS','SD_RM8AFW0R','SD_VTTSHWV4','SD_W0V965XZ','SD_YGVA0E1C','SD_YNSSAPHE','SD_Z6MWD3H0','SD_ZFGDG5YS','SD_ZXJFFMEF','SD_Z0D9N23X','SD_2CEKQ05V','SD_54G4WG4R','SD_JK4Z4T6V','SD_W6FWTD8A'"

In [38]:
query = f"""
select count(*)
from study s
join participant p on p.study_id = s.kf_id
where s.kf_id  in
({study_list_str})
and 
p.visible = true
"""

part_count_df = pd.read_sql(query,engine)

part_count_df

Unnamed: 0,count
0,29809


In [39]:
query = f"""
select count(*)
from study s
join participant p on p.study_id = s.kf_id
join biospecimen b on b.participant_id  = p.kf_id 
where s.kf_id  in
({study_list_str})
and 
p.visible = true
and
b.visible = true
"""

biospecimen_count_df = pd.read_sql(query,engine)

biospecimen_count_df

Unnamed: 0,count
0,40706


In [40]:
query = f"""
select 
    s.kf_id as study_id, 
    gf.kf_id as file_id, 
    p.visible as subject_visibility, 
    b.visible as biosample_visibility, 
    gf.visible as file_visibility
from study s
join participant p on p.study_id = s.kf_id
join biospecimen b on b.participant_id  = p.kf_id 
join biospecimen_genomic_file bgf on bgf.biospecimen_id = b.kf_id
join genomic_file gf on gf.kf_id = bgf.genomic_file_id
where s.kf_id  in
({study_list_str})
"""

file_count_df = pd.read_sql(query,engine)

file_count_df

Unnamed: 0,study_id,file_id,subject_visibility,biosample_visibility,file_visibility
0,SD_YGVA0E1C,GF_W9T03GGS,True,True,True
1,SD_YGVA0E1C,GF_Q204Z18D,True,True,True
2,SD_YGVA0E1C,GF_CJ0YDCKW,True,True,True
3,SD_YGVA0E1C,GF_RK5KKT62,True,True,True
4,SD_YGVA0E1C,GF_GF723KCT,True,True,True
...,...,...,...,...,...
1031195,SD_P445ACHV,GF_MC7VM126,True,True,False
1031196,SD_P445ACHV,GF_5CVKCC80,True,True,False
1031197,SD_P445ACHV,GF_78K2HAAP,True,True,True
1031198,SD_P445ACHV,GF_9DSG0J2P,True,True,True


In [41]:
file_count_df.to_csv('files_with_visibility.tsv',sep='\t', index=False)

In [42]:
engine.dispose()