# Studies
---

In [28]:
from datetime import datetime
import pandas as pd
# Widen pandas' console rendering so wide/long frames print without truncation.
# set_option takes alternating (option name, value) arguments.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

# Location of the studies CSV read by the cells below (relative to the notebook).
file = "./data/extracted/aact/studies.csv"

In [2]:
# Load the studies CSV with explicit dtypes and build a small EDA summary.

# Read only the header row first: the column names are needed to decide which
# columns to parse as dates, and parsing the whole file twice is wasted work.
header_only = pd.read_csv(file, index_col=0, nrows=0)

# Every column whose name ends in "date" or "at" is parsed as a datetime.
# NOTE(review): the "at" suffix presumably targets created_at / updated_at; it
# would also match any other column name that happens to end in "at" -- confirm.
date_cols = [col for col in header_only.columns if col.endswith(("date", "at"))]

# Explicit dtypes: nullable "string"/"boolean" extension types for text and flag
# columns, and float for counts (so missing values can be represented).
study_dtypes = {
    "nct_id": "string",
    "nlm_last_extracted_description": "string",
    "start_month_year": "string",
    "start_date_type": "string",
    "verification_month_year": "string",
    "completion_month_year": "string",
    "completion_date_type": "string",
    "primary_completion_month_year": "string",
    "primary_completion_date_type": "string",
    "target_duration": "string",
    "study_type": "string",
    "acronym": "string",
    "baseline_population": "string",
    "brief_title": "string",
    "official_title": "string",
    "overall_status": "string",
    "last_known_status": "string",
    "phase": "string",
    "enrollment": "float",
    "enrollment_type": "string",
    "source": "string",
    "limitations_and_caveats": "string",
    "number_of_arms": "float",
    "number_of_groups": "float",
    "why_stopped": "string",
    "has_expanded_access": "boolean",
    "expanded_access_type_individual": "boolean",
    "expanded_access_type_intermediate": "boolean",
    "expanded_access_type_treatment": "boolean",
    "has_dmc": "boolean",
    "is_fda_regulated_drug": "boolean",
    "is_fda_regulated_device": "boolean",
    "is_unapproved_device": "boolean",
    "is_ppsd": "boolean",
    "is_us_export": "boolean",
    "biospec_retention": "string",
    "biospec_description": "string",
    "plan_to_share_ipd": "string",
    "plan_to_share_ipd_description": "string",
    "ipd_time_frame": "string",
    "ipd_access_criteria": "string",
    "ipd_url": "string",
}

df = pd.read_csv(
    file,
    index_col=0,
    parse_dates=date_cols,
    low_memory=False,
    dtype=study_dtypes,
)
print(df.shape)
print(df.columns)

# Scalar summary figures, collected into a dict and turned into a one-row frame below.
eda_study = {}
eda_study['observations'] = df.shape[0]
eda_study['variables'] = df.shape[1]

# Per-column count of missing values.
eda_study_missing = df.isnull().sum()
# Per-column percentage of missing values. The denominator is the number of
# rows (not rows * columns), so a completely empty column reports 100%.
eda_study_pct_missing = eda_study_missing / df.shape[0] * 100

# Number of rows whose study_type is exactly 'Interventional'; missing values
# in the nullable string column are skipped by the sum.
eda_study['interventional'] = (df["study_type"] == 'Interventional').sum()

# Total memory footprint in bytes, including the index and object/string payloads.
eda_study['memory'] = df.memory_usage(index=True, deep=True).sum(axis=0)

# Columns selected as features for the study-level analysis.
eda_study_features = ["nct_id", "results_first_submitted_date", "disposition_first_submitted_date",
"start_date", "completion_date", "primary_completion_date", "target_duration", "study_type",
"baseline_population", "overall_status", "last_known_status", "phase", "enrollment", "enrollment_type", 
"number_of_arms", "number_of_groups", "is_fda_regulated_drug"]

eda_study['nfeatures'] = len(eda_study_features)
# How many of the loaded columns are not in the feature list.
eda_study["dropped"] = eda_study['variables'] - len(eda_study_features)

# How many columns ended up with each dtype.
eda_study_dtype_counts = df.dtypes.value_counts()

# index=[0] is required because every value in the dict is a scalar.
eda_study_df = pd.DataFrame(data=eda_study, index=[0])

print(eda_study_df)
print(eda_study_dtype_counts)
print(eda_study_pct_missing)




(380740, 64)
Index(['nct_id', 'nlm_last_extracted_description', 'study_first_submitted_date',
       'results_first_submitted_date', 'disposition_first_submitted_date',
       'last_update_submitted_date', 'study_first_submitted_qc_date',
       'study_first_posted_date', 'study_first_posted_date_type',
       'results_first_submitted_qc_date', 'results_first_posted_date',
       'results_first_posted_date_type', 'disposition_first_submitted_qc_date',
       'disposition_first_posted_date', 'disposition_first_posted_date_type',
       'last_update_submitted_qc_date', 'last_update_posted_date',
       'last_update_posted_date_type', 'start_month_year', 'start_date_type',
       'start_date', 'verification_month_year', 'verification_date',
       'completion_month_year', 'completion_date_type', 'completion_date',
       'primary_completion_month_year', 'primary_completion_date_type',
       'primary_completion_date', 'target_duration', 'study_type', 'acronym',
       'baseline_population

In [8]:
# Show which dtypes occur in the frame, followed by the per-column
# missing-value counts (one object per line, as two separate prints would).
print(eda_study_dtype_counts.index, eda_study_missing, sep="\n")

Index([string, datetime64[ns], boolean, object, float64], dtype='object')
nct_id                                   0
nlm_last_extracted_description            0
study_first_submitted_date               0
results_first_submitted_date        331143
disposition_first_submitted_date    372951
                                     ...  
ipd_url                             374865
plan_to_share_ipd                   234586
plan_to_share_ipd_description       346690
created_at                               0
updated_at                               0
Length: 64, dtype: int64


In [29]:
# Earliest and latest value of every datetime column, side by side.
df_dt = df.select_dtypes(include='datetime')
d2 = {
    'min': df_dt.min(),
    # .values passes a bare array, so the row labels of the resulting frame
    # come from the 'min' Series and 'max' is filled in positionally.
    'max': df_dt.max().values,
}
df2 = pd.DataFrame(d2)
print(df2)

                                                           min                        max
study_first_submitted_date          1999-09-17 00:00:00.000000 2021-06-15 00:00:00.000000
results_first_submitted_date        2008-09-25 00:00:00.000000 2021-06-02 00:00:00.000000
disposition_first_submitted_date    2008-10-09 00:00:00.000000 2021-06-14 00:00:00.000000
last_update_submitted_date          2005-06-23 00:00:00.000000 2021-06-15 00:00:00.000000
study_first_submitted_qc_date       1999-09-17 00:00:00.000000 2021-06-15 00:00:00.000000
study_first_posted_date             1999-09-20 00:00:00.000000 2021-06-16 00:00:00.000000
results_first_submitted_qc_date     2008-09-25 00:00:00.000000 2021-06-15 00:00:00.000000
results_first_posted_date           2008-09-26 00:00:00.000000 2021-06-16 00:00:00.000000
disposition_first_submitted_qc_date 2008-10-22 00:00:00.000000 2021-06-14 00:00:00.000000
disposition_first_posted_date       2009-08-10 00:00:00.000000 2021-06-16 00:00:00.000000
last_updat

In [31]:
# Summary (count / unique / top / freq) of two of the flag columns.
df.loc[:, ['is_ppsd', 'has_dmc']].describe()

Unnamed: 0,is_ppsd,has_dmc
count,9,311816
unique,1,2
top,True,False
freq,9,196221
