In [1]:
import os

from sqlalchemy import create_engine
import pandas as pd
from dotenv import load_dotenv, find_dotenv

# Load variables from a .env file when find_dotenv() locates one; otherwise
# only the variables already present in the process environment are used.
DOTENV_PATH = find_dotenv()
if DOTENV_PATH:
    load_dotenv(DOTENV_PATH)

# KF_DATASERVICE_DB_URL is used as a str.format template for the connection
# URL; DB_USER and DB_GATE supply the values for its {DB_USER} and {DB_GATE}
# fields.
dataservice_db_url = os.getenv("KF_DATASERVICE_DB_URL")
db_user = os.getenv("DB_USER")
db_pass = os.getenv("DB_GATE")

# Fail early with a readable message instead of an AttributeError raised by
# calling .format on None further down.
if not dataservice_db_url:
    raise RuntimeError(
        "KF_DATASERVICE_DB_URL is not set; define it in the environment "
        "or in a .env file."
    )

# An unset credential would otherwise be formatted into the URL as the literal
# text "None". Only credentials the template actually references are required.
missing_credentials = [
    name
    for name, value in (("DB_USER", db_user), ("DB_GATE", db_pass))
    if value is None and "{" + name + "}" in dataservice_db_url
]
if missing_credentials:
    raise RuntimeError(
        "Missing environment variable(s) required by KF_DATASERVICE_DB_URL: "
        + ", ".join(missing_credentials)
    )

url = dataservice_db_url.format(DB_USER=db_user, DB_GATE=db_pass)

# The engine is shared by every query cell below.
engine = create_engine(url)

In [2]:
# Read the list of studies from a tab-separated file that must contain a
# `studies_on_portal` column.
portal_studies_df = pd.read_table("studies_on_portal.tsv")

# Clean the IDs before they are spliced into SQL text:
#   * blank rows are read as NaN and would otherwise become the literal 'nan'
#   * surrounding whitespace would stop an ID from matching
#   * repeated IDs only lengthen the IN (...) list
study_ids = (
    portal_studies_df['studies_on_portal']
    .dropna()
    .astype(str)
    .str.strip()
    .drop_duplicates()
)
study_list = [study_id for study_id in study_ids.to_list() if study_id]

# An empty list would render as `in ()`, which fails later as a SQL syntax
# error far from the real cause.
if not study_list:
    raise ValueError(
        "studies_on_portal.tsv contains no study IDs in column 'studies_on_portal'"
    )

# Comma-separated, single-quoted SQL string literals for use inside IN (...).
# Embedded single quotes are doubled (standard SQL escaping) so a stray quote
# in the file cannot terminate the literal.
study_list_str = ','.join(
    "'" + item.replace("'", "''") + "'" for item in study_list
)

In [3]:
# Count participant rows flagged visible across the selected studies.
# study_list_str supplies the quoted, comma-separated IDs for the IN list.
query = f"""
select count(*)
from study s
join participant p on p.study_id = s.kf_id
where s.kf_id  in
({study_list_str})
and 
p.visible = true
"""

# The result is a one-row frame holding the count.
part_count_df = pd.read_sql(sql=query, con=engine)

part_count_df

Unnamed: 0,count
0,29809


In [4]:
# Count biospecimen rows flagged visible whose participant is also flagged
# visible, across the selected studies.
query = f"""
select count(*)
from study s
join participant p on p.study_id = s.kf_id
join biospecimen b on b.participant_id  = p.kf_id 
where s.kf_id  in
({study_list_str})
and 
p.visible = true
and
b.visible = true
"""

# The result is a one-row frame holding the count.
biospecimen_count_df = pd.read_sql(sql=query, con=engine)

biospecimen_count_df

Unnamed: 0,count
0,40706


In [5]:
# Walk study -> participant -> biospecimen -> genomic file for the selected
# studies. No visibility filter is applied here: the participant, biospecimen
# and file `visible` flags are returned as columns instead.
# NOTE(review): presumably a file linked to several biospecimens appears once
# per link row — confirm before treating rows as distinct files.
query = f"""
select 
    s.kf_id as study_id, 
    gf.kf_id as file_id, 
    p.visible as subject_visibility, 
    b.visible as biosample_visibility, 
    gf.visible as file_visibility
from study s
join participant p on p.study_id = s.kf_id
join biospecimen b on b.participant_id  = p.kf_id 
join biospecimen_genomic_file bgf on bgf.biospecimen_id = b.kf_id
join genomic_file gf on gf.kf_id = bgf.genomic_file_id
where s.kf_id  in
({study_list_str})
"""

# Despite the name, this frame holds one row per joined record, not counts.
file_count_df = pd.read_sql(sql=query, con=engine)

file_count_df

Unnamed: 0,study_id,file_id,subject_visibility,biosample_visibility,file_visibility
0,SD_YGVA0E1C,GF_BSEXEZTP,True,True,True
1,SD_YGVA0E1C,GF_4ADX9JPB,True,True,True
2,SD_YGVA0E1C,GF_YY8GXMPK,True,True,True
3,SD_YGVA0E1C,GF_YATVYXSG,True,True,True
4,SD_YGVA0E1C,GF_82F5DH84,True,True,True
...,...,...,...,...,...
1031195,SD_DYPMEHHF,GF_QXGVJ3ZN,True,True,False
1031196,SD_DYPMEHHF,GF_REZ9XE29,False,True,False
1031197,SD_DYPMEHHF,GF_JTCA00AH,True,True,False
1031198,SD_DYPMEHHF,GF_RRSVMXJF,True,True,False


In [6]:
# Write the per-file visibility table as tab-separated text, leaving out the
# DataFrame's integer index.
file_count_df.to_csv(
    "files_with_visibility.tsv",
    sep="\t",
    index=False,
)

In [7]:
# All queries have run; dispose of the engine created in the first cell.
engine.dispose()