# Biosample Diff: Q4 vs. Q3

In [9]:
import sqlite3
import pandas as pd

# File stems (the '.db' suffix is appended at connect time) of the two
# releases being compared: DB1 is the Q3 baseline, DB2 is the Q4 release.
DB1 = 'q3'
DB2 = 'q4'

In [10]:
# Open both release databases read-only through SQLite URI filenames.
# A plain sqlite3.connect(path) silently creates a new, empty database when
# the file does not exist, so a typo or missing file would only show up later
# as a failing query. With mode=ro the connect call itself raises
# sqlite3.OperationalError for a missing file, and this diff notebook cannot
# accidentally write to either release.
db1_conn = sqlite3.connect(f'file:{DB1}.db?mode=ro', uri=True)
db2_conn = sqlite3.connect(f'file:{DB2}.db?mode=ro', uri=True)

In [11]:
# Load the complete `biosample` table from the first (DB1) database.
# The SQL text has no interpolated values, so a plain string literal is used.
query = "\nselect *\nfrom biosample\n"

db1_biosample_df = pd.read_sql_query(sql=query, con=db1_conn)

In [12]:
# Load the complete `biosample` table from the second (DB2) database.
# Written with explicit escapes so the exact SQL text (including the space
# after `*`) is visible and preserved.
query = "\nselect * \nfrom biosample\n"

db2_biosample_df = pd.read_sql_query(sql=query, con=db2_conn)

In [13]:
# Count distinct biosample `local_id` values in each release, then show the
# net change (DB2 minus DB1); a negative value means DB2 has fewer distinct IDs.
unique_count_db1, unique_count_db2 = (
    frame['local_id'].nunique()
    for frame in (db1_biosample_df, db2_biosample_df)
)
unique_count_db2 - unique_count_db1

-55977

In [14]:
# Rows of the DB1 biosample table whose `local_id` does not occur anywhere in
# the DB2 biosample table, i.e. biosamples that are absent from the later release.
missing_in_q4 = db1_biosample_df.loc[
    lambda frame: ~frame['local_id'].isin(db2_biosample_df['local_id'])
]
missing_in_q4

Unnamed: 0,id_namespace,local_id,project_id_namespace,project_local_id,persistent_id,creation_time,sample_prep_method,anatomy
0,kidsfirst:,BS_0000QNBM,kidsfirst:,SD_BHJXBDQK,,2020-11-20 22:12:45.415083,,
1,kidsfirst:,BS_000KK0N8,kidsfirst:,SD_BHJXBDQK,,2020-11-20 22:13:16.175450,,
2,kidsfirst:,BS_0014X1NM,kidsfirst:,SD_BHJXBDQK,,2020-11-20 22:24:20.797484,,
3,kidsfirst:,BS_0024M38B,kidsfirst:,SD_BHJXBDQK,,2022-01-07 18:19:09.695777,,
5,kidsfirst:,BS_0032YCBC,kidsfirst:,SD_BHJXBDQK,,2020-12-07 22:52:14.660365,,UBERON:0000955
...,...,...,...,...,...,...,...,...
96672,kidsfirst:,BS_ZZWBSQBJ,kidsfirst:,SD_BHJXBDQK,,2021-03-08 19:37:07.712360,,UBERON:0000955
96675,kidsfirst:,BS_ZZWSZ8KA,kidsfirst:,SD_BHJXBDQK,,2021-03-08 19:19:30.798283,,UBERON:0000955
96676,kidsfirst:,BS_ZZX0PF6K,kidsfirst:,SD_BHJXBDQK,,2020-11-20 22:32:46.178836,,
96678,kidsfirst:,BS_ZZY5HXJ5,kidsfirst:,SD_BHJXBDQK,,2020-11-20 22:38:50.801892,,UBERON:0000955


In [15]:
# Project (study) metadata, read from the DB2 database.
study_info_df = pd.read_sql_query("select * from project", db2_conn)

# Attach project metadata to every project that owns at least one missing
# biosample. The right-hand frame holds one row per distinct project id.
# NOTE(review): metadata comes from DB2 and the join is inner, so a project
# that is absent from DB2's `project` table would not appear here - confirm
# that this is acceptable for the comparison.
studies_losing_biosamples = pd.merge(
    study_info_df,
    pd.DataFrame({'project_local_id': missing_in_q4['project_local_id'].unique()}),
    how='inner',
    left_on='local_id',
    right_on='project_local_id',
)

studies_losing_biosamples

Unnamed: 0,id_namespace,local_id,persistent_id,creation_time,abbreviation,name,description,project_local_id
0,kidsfirst:,SD_Z6MWD3H0,,,SD_Z6MWD3H0,Kids First: Leukemia & Heart Defects in Down S...,Kids First: Genomic Analysis of Congenital Hea...,SD_Z6MWD3H0
1,kidsfirst:,SD_NMVV8A1Y,,,SD_NMVV8A1Y,Kids First: Kidney and Urinary Tract Defects,Kids First: Genetics of Structural Defects of ...,SD_NMVV8A1Y
2,kidsfirst:,SD_BHJXBDQK,,,SD_BHJXBDQK,Pediatric Brain Tumor Atlas: CBTTC,Pediatric Brain Tumor Atlas - Children's Brain...,SD_BHJXBDQK
3,kidsfirst:,SD_AQ9KVN5P,,,SD_AQ9KVN5P,Kids First: T Cell ALL,Comprehensive Genomic Profiling to Improve Pre...,SD_AQ9KVN5P
4,kidsfirst:,SD_8Y99QZJJ,,,SD_8Y99QZJJ,Pediatric Brain Tumor Atlas: PNOC,Pediatric Brain Tumor Atlas: PNOC,SD_8Y99QZJJ


In [16]:
# Partition the missing biosamples by whether the `anatomy` column is populated.
missing_in_q4_with_anatomy = missing_in_q4.loc[missing_in_q4['anatomy'].notna()]
missing_in_q4_without_anatomy = missing_in_q4.loc[missing_in_q4['anatomy'].isna()]

In [17]:
# Per-project number of missing biosamples that have an anatomy value.
# 'size' counts rows per group; the result is indexed by project_local_id.
missing_anatomy_count = (
    missing_in_q4_with_anatomy
    .groupby("project_local_id")
    .agg(bio_anatomy_count=pd.NamedAgg(column='local_id', aggfunc='size'))
)

In [18]:
# Per-project number of missing biosamples that have no anatomy value.
# 'size' counts rows per group; the result is indexed by project_local_id.
missing_without_anatomy_count = (
    missing_in_q4_without_anatomy
    .groupby("project_local_id")
    .agg(**{'bio_without_anatomy_count': ('local_id', 'size')})
)

In [19]:
# Combine the two per-project counts side by side. The outer join keeps
# projects that appear in only one of the two tables; their missing count
# comes back as NaN, which is replaced by 0 before casting back to int.
study_anatomy_breakdown = (
    pd.merge(
        missing_anatomy_count,
        missing_without_anatomy_count,
        how='outer',
        on='project_local_id',
    )
    .fillna(0)
    .astype(int)
)
study_anatomy_breakdown

Unnamed: 0_level_0,bio_anatomy_count,bio_without_anatomy_count
project_local_id,Unnamed: 1_level_1,Unnamed: 2_level_1
SD_8Y99QZJJ,14,22
SD_BHJXBDQK,38500,18262
SD_AQ9KVN5P,0,74
SD_NMVV8A1Y,0,4
SD_Z6MWD3H0,0,15
