# Quarterly Submission Overview Checks

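This notebook compares two quarterly CFDE submissions: for each project it counts subjects, biosamples, files, and biosample–disease associations in both quarters and reports the change between them.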

The general practice is to create a directory named after the quarter in the same directory as this notebook, and to put that name in the first code cell, which sets **DB1** and **DB2**. The code cells below open a SQLite database named `<name>.db` for each of the two values. **DB1** is the older quarter and **DB2** is the newer quarter.

In [47]:
import sqlite3
import pandas as pd

# Quarter labels. Each is used both as the stem of the SQLite file name
# (`<label>.db`) and as the prefix of the per-quarter count columns.
DB1 = 'q3'  # older quarter
DB2 = 'q4'  # newer quarter

In [48]:
# Open the DB1 database read-only through a SQLite URI. A plain
# sqlite3.connect(path) creates an empty database file when the path does not
# exist, which would surface later as a confusing "no such table" error;
# mode=ro makes a mistyped DB1 fail right here instead.
db1_conn = sqlite3.connect(f'file:{DB1}.db?mode=ro', uri=True)

# Count subject local_id values per project for DB1.
# The alias is double-quoted so that quarter labels which are not bare SQL
# identifiers (e.g. starting with a digit or containing '-') still produce a
# valid column name; the resulting DataFrame column name is unchanged.
query = f"""
select project_local_id, count(local_id) as "{DB1}_subject_count"
from subject
group by project_local_id
"""
db1_subject_df = pd.read_sql_query(query, db1_conn)

In [49]:
# Open the DB2 database read-only through a SQLite URI. A plain
# sqlite3.connect(path) creates an empty database file when the path does not
# exist, which would surface later as a confusing "no such table" error;
# mode=ro makes a mistyped DB2 fail right here instead.
db2_conn = sqlite3.connect(f'file:{DB2}.db?mode=ro', uri=True)

# Count subject local_id values per project for DB2.
# The alias is double-quoted so that quarter labels which are not bare SQL
# identifiers (e.g. starting with a digit or containing '-') still produce a
# valid column name; the resulting DataFrame column name is unchanged.
query = f"""
select project_local_id, count(local_id) as "{DB2}_subject_count"
from subject
group by project_local_id
"""
db2_subject_df = pd.read_sql_query(query, db2_conn)

In [50]:
# Start the comparison table: line up both quarters' subject counts by
# project and order the rows by project id.
# NOTE(review): with an inner join, a project that appears in only one of the
# two quarters is dropped from the table — confirm that is intended.
combined_quarters_df = pd.merge(
    db1_subject_df,
    db2_subject_df,
    on='project_local_id',
    how='inner',
).sort_values(by='project_local_id')

In [51]:
# Count distinct biosample local_id values per project for DB1.
# The alias is double-quoted so that quarter labels which are not bare SQL
# identifiers (e.g. starting with a digit or containing '-') still produce a
# valid column name; the resulting DataFrame column name is unchanged.
query = f"""
select project_local_id, count(distinct(local_id)) as "{DB1}_biosample_count"
from biosample
group by project_local_id
"""
db1_biosample_df = pd.read_sql_query(query, db1_conn)

In [52]:
# Count distinct biosample local_id values per project for DB2.
# The alias is double-quoted so that quarter labels which are not bare SQL
# identifiers (e.g. starting with a digit or containing '-') still produce a
# valid column name; the resulting DataFrame column name is unchanged.
query = f"""
select project_local_id, count(distinct(local_id)) as "{DB2}_biosample_count"
from biosample
group by project_local_id
"""
db2_biosample_df = pd.read_sql_query(query, db2_conn)

In [53]:
# Append both quarters' biosample counts to the comparison table, matching
# rows on project id (inner join: projects missing from either frame drop out).
# NOTE(review): this cell rebuilds combined_quarters_df from itself, so
# re-running it on its own merges the biosample columns a second time.
combined_quarters_df = (
    combined_quarters_df
    .merge(db1_biosample_df, on='project_local_id', how='inner')
    .merge(db2_biosample_df, on='project_local_id', how='inner')
)


In [54]:
# Count file local_id values per project for DB1.
# The alias is double-quoted so that quarter labels which are not bare SQL
# identifiers (e.g. starting with a digit or containing '-') still produce a
# valid column name; the resulting DataFrame column name is unchanged.
query = f"""
select project_local_id, count(local_id) as "{DB1}_file_count"
from file
group by project_local_id
"""
db1_file_df = pd.read_sql_query(query, db1_conn)

In [55]:
# Count file local_id values per project for DB2.
# The alias is double-quoted so that quarter labels which are not bare SQL
# identifiers (e.g. starting with a digit or containing '-') still produce a
# valid column name; the resulting DataFrame column name is unchanged.
query = f"""
select project_local_id, count(local_id) as "{DB2}_file_count"
from file
group by project_local_id
"""
db2_file_df = pd.read_sql_query(query, db2_conn)


In [56]:
# Append both quarters' file counts to the comparison table, matching rows on
# project id (inner join: projects missing from either frame drop out).
# NOTE(review): this cell rebuilds combined_quarters_df from itself, so
# re-running it on its own merges the file columns a second time.
combined_quarters_df = (
    combined_quarters_df
    .merge(db1_file_df, on='project_local_id', how='inner')
    .merge(db2_file_df, on='project_local_id', how='inner')
)

In [57]:
query = f"""
select project_local_id, count(biosample_local_id) as {DB1}_biosample_disease_count
from biosample_disease bd
join biosample b on b.local_id = bd.biosample_local_id
group by project_local_id
"""
db1_biosample_disease_df = pd.read_sql_query(query,db1_conn)

In [58]:
# Count biosample_disease rows per project for DB2, using the biosample table
# to attach each association to a project.
# NOTE(review): the join matches on local_id only; if a local_id can appear on
# more than one biosample row, associations would be counted more than once —
# confirm against the schema.
# The alias is double-quoted so that quarter labels which are not bare SQL
# identifiers (e.g. starting with a digit or containing '-') still produce a
# valid column name; the resulting DataFrame column name is unchanged.
query = f"""
select project_local_id, count(biosample_local_id) as "{DB2}_biosample_disease_count"
from biosample_disease bd
join biosample b on b.local_id = bd.biosample_local_id
group by project_local_id
"""
db2_biosample_disease_df = pd.read_sql_query(query, db2_conn)
db2_biosample_disease_df = pd.read_sql_query(query,db2_conn)

In [59]:
# Append both quarters' biosample-disease counts to the comparison table,
# matching rows on project id (inner join: projects missing from either frame
# drop out).
# NOTE(review): this cell rebuilds combined_quarters_df from itself, so
# re-running it on its own merges the biosample-disease columns a second time.
combined_quarters_df = (
    combined_quarters_df
    .merge(db1_biosample_disease_df, on='project_local_id', how='inner')
    .merge(db2_biosample_disease_df, on='project_local_id', how='inner')
)

In [60]:
# Add one "<metric>_count_change" column per metric: the DB2 count minus the
# DB1 count for the same project (negative means the count shrank).
# Plain column arithmetic replaces the row-wise DataFrame.apply(axis=1): it
# yields the same values without a Python-level call per row, and it still
# works when the table has no rows.
for metric in ('subject', 'biosample', 'file', 'biosample_disease'):
    combined_quarters_df[f'{metric}_count_change'] = (
        combined_quarters_df[f'{DB2}_{metric}_count']
        - combined_quarters_df[f'{DB1}_{metric}_count']
    )

combined_quarters_df

Unnamed: 0,project_local_id,q3_subject_count,q4_subject_count,q3_biosample_count,q4_biosample_count,q3_file_count,q4_file_count,q3_biosample_disease_count,q4_biosample_disease_count,subject_count_change,biosample_count_change,file_count_change,biosample_disease_count_change
0,SD_0TYVY1TW,334,334,334,334,1494,1494,334,334,0,0,0,0
1,SD_1P41Z782,137,137,259,259,259,259,259,259,0,0,0,0
2,SD_46RR9ZR6,341,341,1220,1220,2208,2208,2440,2440,0,0,0,0
3,SD_46SK55A3,2031,2031,2121,2121,10213,10213,2121,2121,0,0,0,0
4,SD_6FPYJQBR,183,183,183,183,1021,1021,183,183,0,0,0,0
5,SD_7NQ9151J,211,211,217,217,1441,1441,434,434,0,0,0,0
6,SD_8Y99QZJJ,84,77,400,365,20540,18807,400,365,-7,-35,-1733,-35
7,SD_9PYZAHHE,1414,1414,1295,1295,7227,7227,1295,1295,0,0,0,0
8,SD_AQ9KVN5P,1358,1339,4873,4799,50651,49550,9746,9598,-19,-74,-1101,-148
9,SD_B8X3C1MX,245,245,222,222,1092,1092,222,222,0,0,0,0
