In [1]:
#import packages needed:

import pandas as pd
import regex as re
import numpy as np
import psycopg2
from sqlalchemy import create_engine

In [2]:
#import data files to compare and explore:

ct_covid = pd.read_csv('../data/all_covid_studies_ctgov.csv') #-covid only trials from clinicaltrials.gov downloaded 12/10/20

#Read in main clinicaltrials.gov dataset from SQL database, additional tables will be pulled in later:

# establish a database connection to SQL for complete clinicaltrials.gov dataset
engine = create_engine("postgres+psycopg2://postgres:postgres@localhost:5432/AACT")

# use the connection to run a query using pandas:
ct_all = pd.read_sql("SELECT * FROM studies;", con=engine)

ct_all.head()


Unnamed: 0,nct_id,nlm_download_date_description,study_first_submitted_date,results_first_submitted_date,disposition_first_submitted_date,last_update_submitted_date,study_first_submitted_qc_date,study_first_posted_date,study_first_posted_date_type,results_first_submitted_qc_date,...,is_us_export,biospec_retention,biospec_description,ipd_time_frame,ipd_access_criteria,ipd_url,plan_to_share_ipd,plan_to_share_ipd_description,created_at,updated_at
0,NCT04331431,ClinicalTrials.gov processed this data on Nove...,2020-03-28,,,2020-03-31,2020-03-31,2020-04-02,Actual,,...,,,,,,,,,2020-12-01 07:41:58.458186,2020-12-01 07:41:58.458186
1,NCT04645472,ClinicalTrials.gov processed this data on Nove...,2020-11-25,,,2020-11-25,2020-11-25,2020-11-27,Actual,,...,,,,,,,No,,2020-12-01 06:44:46.185147,2020-12-01 06:44:46.185147
2,NCT04646369,ClinicalTrials.gov processed this data on Nove...,2020-11-20,,,2020-11-20,2020-11-20,2020-11-27,Actual,,...,,,,These data will be released to the NDCT soon a...,"In addition to public access to the NDCT, data...",,Yes,All requests for study data will follow NIMH's...,2020-12-01 06:44:35.512935,2020-12-01 06:44:35.512935
3,NCT04646356,ClinicalTrials.gov processed this data on Nove...,2020-10-20,,,2020-11-20,2020-11-20,2020-11-27,Actual,,...,False,,,,,,No,,2020-12-01 06:44:36.897833,2020-12-01 06:44:36.897833
4,NCT04646330,ClinicalTrials.gov processed this data on Nove...,2020-11-23,,,2020-11-23,2020-11-23,2020-11-27,Actual,,...,,,,,,,,,2020-12-01 06:44:37.204421,2020-12-01 06:44:37.204421


In [3]:
#check columns for full ct data to see which to keep for analysis:

ct_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360145 entries, 0 to 360144
Data columns (total 64 columns):
 #   Column                               Non-Null Count   Dtype         
---  ------                               --------------   -----         
 0   nct_id                               360145 non-null  object        
 1   nlm_download_date_description        360145 non-null  object        
 2   study_first_submitted_date           360145 non-null  object        
 3   results_first_submitted_date         46342 non-null   object        
 4   disposition_first_submitted_date     7523 non-null    object        
 5   last_update_submitted_date           360145 non-null  object        
 6   study_first_submitted_qc_date        360145 non-null  object        
 7   study_first_posted_date              360145 non-null  object        
 8   study_first_posted_date_type         360145 non-null  object        
 9   results_first_submitted_qc_date      46342 non-null   object        
 

In [4]:
#check for duplicates:

ctduplicates = ct_all.nct_id.duplicated()
ctduplicates.value_counts()

False    360145
Name: nct_id, dtype: int64

In [5]:
#select columns to focus on for ct_all dataset, will pull in more info from other tables in SQL once this is done:

ct_all_sub = ct_all[[
 'nct_id',
 'study_first_submitted_date',      
 'results_first_submitted_date',              
 'last_update_submitted_date',   
 'start_date_type',      
 'start_date',       
 'completion_date_type',        
 'completion_date',
 'results_first_posted_date',
 'target_duration',       
 'study_type',
 'brief_title',                          
 'official_title',                    
 'overall_status',
 'phase',                                        
 'enrollment',                                  
 'enrollment_type',
 'source',
 'limitations_and_caveats',
 'why_stopped',                                
 'has_expanded_access',
 'is_fda_regulated_drug',                       
 'is_fda_regulated_device'
]] 
    
    
ct_all_sub.head(2)

Unnamed: 0,nct_id,study_first_submitted_date,results_first_submitted_date,last_update_submitted_date,start_date_type,start_date,completion_date_type,completion_date,results_first_posted_date,target_duration,...,overall_status,phase,enrollment,enrollment_type,source,limitations_and_caveats,why_stopped,has_expanded_access,is_fda_regulated_drug,is_fda_regulated_device
0,NCT04331431,2020-03-28,,2020-03-31,Actual,2008-04-01,Actual,2015-04-08,,,...,Completed,,24.0,Actual,Sohag University,,,False,False,False
1,NCT04645472,2020-11-25,,2020-11-25,Actual,2020-04-20,Anticipated,2021-04-30,,,...,Recruiting,,70.0,Anticipated,Sun Yat-Sen Memorial Hospital of Sun Yat-Sen U...,,,False,False,False


In [6]:
#add year column for start_date, ignoring the warning for now:

ct_all_sub['start_date'] = pd.to_datetime(ct_all_sub['start_date'])
ct_all_sub['start_year'] = ct_all_sub['start_date'].dt.year.astype(str).replace('\.0', '', regex=True)

ct_all_sub.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ct_all_sub['start_date'] = pd.to_datetime(ct_all_sub['start_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ct_all_sub['start_year'] = ct_all_sub['start_date'].dt.year.astype(str).replace('\.0', '', regex=True)


Unnamed: 0,nct_id,study_first_submitted_date,results_first_submitted_date,last_update_submitted_date,start_date_type,start_date,completion_date_type,completion_date,results_first_posted_date,target_duration,...,phase,enrollment,enrollment_type,source,limitations_and_caveats,why_stopped,has_expanded_access,is_fda_regulated_drug,is_fda_regulated_device,start_year
0,NCT04331431,2020-03-28,,2020-03-31,Actual,2008-04-01,Actual,2015-04-08,,,...,,24.0,Actual,Sohag University,,,False,False,False,2008
1,NCT04645472,2020-11-25,,2020-11-25,Actual,2020-04-20,Anticipated,2021-04-30,,,...,,70.0,Anticipated,Sun Yat-Sen Memorial Hospital of Sun Yat-Sen U...,,,False,False,False,2020
2,NCT04646369,2020-11-20,,2020-11-20,Actual,2020-11-05,Anticipated,2022-04-30,,,...,,100.0,Anticipated,University of Pittsburgh,,,False,False,False,2020
3,NCT04646356,2020-10-20,,2020-11-20,Actual,2020-10-20,Anticipated,2022-09-30,,,...,Phase 2,30.0,Anticipated,"St. Michael's Hospital, Toronto",,,False,False,False,2020
4,NCT04646330,2020-11-23,,2020-11-23,Anticipated,2020-11-30,Anticipated,2023-12-31,,,...,Phase 1/Phase 2,120.0,Anticipated,Akeso,,,False,False,False,2020


In [7]:
ct_all_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360145 entries, 0 to 360144
Data columns (total 24 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   nct_id                        360145 non-null  object        
 1   study_first_submitted_date    360145 non-null  object        
 2   results_first_submitted_date  46342 non-null   object        
 3   last_update_submitted_date    360145 non-null  object        
 4   start_date_type               166306 non-null  object        
 5   start_date                    355250 non-null  datetime64[ns]
 6   completion_date_type          335700 non-null  object        
 7   completion_date               342540 non-null  object        
 8   results_first_posted_date     46342 non-null   object        
 9   target_duration               6935 non-null    object        
 10  study_type                    360145 non-null  object        
 11  brief_title  

In [8]:
#pull in interventions, conditions, sponsors, countries, and calculated_values tables from SQL and check how many records in each:

interventions =  pd.read_sql("SELECT * FROM interventions;", con=engine)
conditions =  pd.read_sql("SELECT * FROM conditions;", con=engine)
sponsors =  pd.read_sql("SELECT * FROM sponsors;", con=engine)
countries =  pd.read_sql("SELECT * FROM countries;", con=engine)
calculated_values =  pd.read_sql("SELECT * FROM calculated_values;", con=engine)

print(interventions.shape)
print(conditions.shape)
print(sponsors.shape)
print(countries.shape)
print(calculated_values.shape)

(618521, 5)
(607333, 4)
(574931, 5)
(510391, 4)
(360145, 19)


In [9]:
#checking the interventions table, there are multiple entries per study so it will be difficult to join with main data:

iduplicates = interventions.nct_id.duplicated()
print(iduplicates.value_counts())
interventions = interventions[['nct_id', 'intervention_type', 'name']]
interventions = interventions.rename(columns = {'name': 'intervention'}) 
interventions.head()

False    322164
True     296357
Name: nct_id, dtype: int64


Unnamed: 0,nct_id,intervention_type,intervention
0,NCT04423627,Drug,Placebo
1,NCT04645888,Procedure,Impacted tooth surgery
2,NCT04646369,Behavioral,"""screening as usual"""
3,NCT04646369,Behavioral,Screening Wizard 2.0
4,NCT04646369,Behavioral,Screening Wizard 2.0 + SOVA


In [10]:
#checking the conditions table, there are multiple entries per study so it will be difficult to join with main data:

cduplicates = conditions.nct_id.duplicated()
print(cduplicates.value_counts())
conditions = conditions[['nct_id','downcase_name']]
conditions = conditions.rename(columns = {'downcase_name': 'condition'})
conditions.head()

False    359277
True     248056
Name: nct_id, dtype: int64


Unnamed: 0,nct_id,condition
0,NCT02892617,modic 1
1,NCT02889016,pans
2,NCT02887209,hiv
3,NCT02882607,hiv
4,NCT02882256,head injury


In [11]:
#checking the countries table, there are multiple entries per study so it will be difficult to join with main data:

cnduplicates = countries.nct_id.duplicated()
print(cnduplicates.value_counts())
countries = countries[['nct_id', 'name', 'removed']]
countries = countries.rename(columns = {'name' : 'country','removed': 'country_removed'})
countries.head()

False    323475
True     186916
Name: nct_id, dtype: int64


Unnamed: 0,nct_id,country,country_removed
0,NCT04646369,United States,
1,NCT04646356,Canada,
2,NCT04646330,China,
3,NCT04646317,Pakistan,
4,NCT04646278,"Korea, Republic of",


In [12]:
#not sure what "removed" means in the countries table - will look into

countries.country_removed.value_counts()

True    29944
Name: country_removed, dtype: int64

In [13]:
#checking the sponsors table, there are multiple entries per study so it will be difficult to join with main data:

sduplicates = sponsors.nct_id.duplicated()
print(sduplicates.value_counts())
sponsors = sponsors[['nct_id','agency_class', 'lead_or_collaborator', 'name']]
sponsors = sponsors.rename(columns = {'agency_class' : 'sponsor_type','name': 'sponsor'})
sponsors.head()

False    360145
True     214786
Name: nct_id, dtype: int64


Unnamed: 0,nct_id,sponsor_type,lead_or_collaborator,sponsor
0,NCT04646369,Other,lead,University of Pittsburgh
1,NCT04646369,Other,collaborator,Kaiser Foundation Research Institute
2,NCT04646369,NIH,collaborator,National Institute of Mental Health (NIMH)
3,NCT04646356,Other,lead,"St. Michael's Hospital, Toronto"
4,NCT04646356,U.S. Fed,collaborator,United States Department of Defense


In [14]:
#But, could separate out for lead only and it should match with the main study data set:

sponsors.lead_or_collaborator.value_counts()

lead            360145
collaborator    214786
Name: lead_or_collaborator, dtype: int64

In [15]:
#separate out lead sponsors so I can join it with the main dataset:

lead_sponsors = sponsors[sponsors.lead_or_collaborator == 'lead']
lead_sponsors

Unnamed: 0,nct_id,sponsor_type,lead_or_collaborator,sponsor
0,NCT04646369,Other,lead,University of Pittsburgh
3,NCT04646356,Other,lead,"St. Michael's Hospital, Toronto"
5,NCT04646330,Industry,lead,Akeso
6,NCT04646317,Other,lead,"Dr. Ruth K.M. Pfau Civil Hospital, Karachi"
7,NCT04646304,Other,lead,Ottawa Hospital Research Institute
...,...,...,...,...
574926,NCT00001645,NIH,lead,National Institute of Allergy and Infectious D...
574927,NCT00001582,NIH,lead,National Cancer Institute (NCI)
574928,NCT00001521,NIH,lead,Eunice Kennedy Shriver National Institute of C...
574929,NCT00001456,NIH,lead,National Human Genome Research Institute (NHGRI)


In [16]:
#checking the calculated values table, it looks like there are no duplicates so it will be easy to join with main data:

cvduplicates = calculated_values.nct_id.duplicated()
print(cvduplicates.value_counts())
calculated_values = calculated_values[['nct_id', 'number_of_facilities', 'registered_in_calendar_year', 'actual_duration', 'were_results_reported', 'months_to_report_results']]
calculated_values.head()

False    360145
Name: nct_id, dtype: int64


Unnamed: 0,nct_id,number_of_facilities,registered_in_calendar_year,actual_duration,were_results_reported,months_to_report_results
0,NCT00506311,1.0,2007,58.0,False,
1,NCT00652496,1.0,2008,4.0,False,
2,NCT00581698,1.0,2007,227.0,False,
3,NCT00643214,1.0,2008,1.0,False,
4,NCT00632684,1.0,2008,31.0,False,


In [17]:
#combine ct all with calculated values and lead sponsors datasets:

ct_all_sub_cv = pd.merge(ct_all_sub, calculated_values, on='nct_id')
ct_whole = pd.merge(ct_all_sub_cv, lead_sponsors, on='nct_id')
print(ct_whole.shape)
ct_whole.head(3)

(360145, 32)


Unnamed: 0,nct_id,study_first_submitted_date,results_first_submitted_date,last_update_submitted_date,start_date_type,start_date,completion_date_type,completion_date,results_first_posted_date,target_duration,...,is_fda_regulated_device,start_year,number_of_facilities,registered_in_calendar_year,actual_duration,were_results_reported,months_to_report_results,sponsor_type,lead_or_collaborator,sponsor
0,NCT04331431,2020-03-28,,2020-03-31,Actual,2008-04-01,Actual,2015-04-08,,,...,False,2008,,2020,85.0,False,,Other,lead,Sohag University
1,NCT04645472,2020-11-25,,2020-11-25,Actual,2020-04-20,Anticipated,2021-04-30,,,...,False,2020,1.0,2020,,False,,Other,lead,Zhenyu Wu
2,NCT04646369,2020-11-20,,2020-11-20,Actual,2020-11-05,Anticipated,2022-04-30,,,...,False,2020,4.0,2020,,False,,Other,lead,University of Pittsburgh


In [18]:
ct_whole.start_year.value_counts()

2020    29521
2019    28265
2018    27852
2017    26493
2016    25615
        ...  
1958        1
1968        1
1967        1
1948        1
1931        1
Name: start_year, Length: 70, dtype: int64

In [19]:
ct_whole.registered_in_calendar_year.value_counts()

2020    32436
2019    32168
2018    30589
2017    29465
2016    27982
2015    24374
2014    23585
2013    20535
2012    19671
2011    18252
2010    17743
2009    17141
2008    16998
2007    13380
2005    12893
2006    10926
1999     3645
2000     1980
2004     1787
2003     1662
2002     1587
2001     1346
Name: registered_in_calendar_year, dtype: int64

**Attemped to join the interventions data to the main data. Would like to have each intervention populate a new column.
Unfortunately, it looks like there are even more than I expected per study, so will keep it separate for now.**


*Did a couple different types of joins - here:*
all_interventions = ct_whole.append(interventions)
print(all_interventions.shape)
print(all_interventions.nct_id.nunique())
all_interventions

*and here:*
ct_interventions = ct_whole.merge(interventions, how='inner', on='nct_id')
print(ct_interventions.shape)
print(ct_interventions.nct_id.nunique())
ct_interventions

**Tried this to separate out into different columns**

#see https://stackoverflow.com/questions/22798934/pandas-long-to-wide-reshape-by-two-variables

interventions['idx'] = interventions.groupby('nct_id').cumcount()

tmp = []
for var in ['name']:
    interventions['tmp_idx'] = var + '_' + interventions.idx.astype(str)
    tmp.append(interventions.pivot(index='nct_id',columns='tmp_idx',values=var))

reshape = pd.concat(tmp,axis=1)

reshape.head()




In [20]:
conditions.condition.value_counts()

healthy                                                            7946
breast cancer                                                      5876
obesity                                                            5305
hiv infections                                                     3413
hypertension                                                       3318
                                                                   ... 
testicular embryonal carcinoma and yolk sac tumor with seminoma       1
oxytocin/administration & dosage                                      1
presumptive diagnosis on admission                                    1
cardiac arrest from trauma                                            1
thrombocytopenia related to chronic liver disease                     1
Name: condition, Length: 87326, dtype: int64

In [21]:
ct_covid.Conditions.value_counts()

COVID-19                                                                                                                                                                                                                                                                                                                                                       628
Covid19                                                                                                                                                                                                                                                                                                                                                        370
COVID                                                                                                                                                                                                                                                                                             

In [22]:
ct_covid.head()

Unnamed: 0,Rank,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,Outcome Measures,Sponsor/Collaborators,...,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,URL
0,1,NCT04372602,Duvelisib to Combat COVID-19,,Recruiting,No Results Available,COVID-19,Drug: Duvelisib|Procedure: Peripheral blood dr...,Overall survival|Length of hospital stay|Lengt...,Washington University School of Medicine|Veras...,...,202007009,"October 12, 2020","November 30, 2021","April 30, 2022","May 4, 2020",,"November 9, 2020","Washington University School of Medicine, Sain...",,https://ClinicalTrials.gov/show/NCT04372602
1,2,NCT04364698,Observational Cohort of COVID-19 Patients at R...,COVID-RPC,Recruiting,No Results Available,COVID-19,,"clinical, biological and radiological characte...",Assistance Publique - Hôpitaux de Paris,...,20SBS-COVID-RPC,"May 7, 2020",June 2020,June 2020,"April 28, 2020",,"May 14, 2020","Department of Infectiology, Raymond Poincaré H...",,https://ClinicalTrials.gov/show/NCT04364698
2,3,NCT04482621,Decitabine for Coronavirus (COVID-19) Pneumoni...,DART,Recruiting,No Results Available,COVID-19,Drug: Decitabine|Other: Placebo Saline,Change in clinical state as assessed by a 6-po...,Johns Hopkins University,...,IRB00247544,"September 14, 2020",May 2021,July 2021,"July 22, 2020",,"September 25, 2020","Johns Hopkins University, Baltimore, Maryland,...",,https://ClinicalTrials.gov/show/NCT04482621
3,4,NCT04459637,COVID-19 Surveillance Based on Smart Wearable ...,COVID-19SWD,Not yet recruiting,No Results Available,COVID-19,,Deterioration of the condition|Mortality|The i...,Peking University First Hospital,...,2020055-0615,"July 1, 2020","March 10, 2021","March 10, 2021","July 7, 2020",,"July 7, 2020","Peking University First Hospital, Beijing, Bei...",,https://ClinicalTrials.gov/show/NCT04459637
4,5,NCT04425538,A Phase 2 Trial of Infliximab in Coronavirus D...,,Recruiting,No Results Available,COVID-19,Drug: Infliximab,Time to improvement in oxygenation|28-day mort...,Tufts Medical Center|National Institutes of He...,...,STUDY00000564,"June 1, 2020",September 2020,December 2020,"June 11, 2020",,"June 11, 2020","Tufts Medical Center, Boston, Massachusetts, U...",,https://ClinicalTrials.gov/show/NCT04425538


In [23]:
ct_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4156 entries, 0 to 4155
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Rank                     4156 non-null   int64  
 1   NCT Number               4156 non-null   object 
 2   Title                    4156 non-null   object 
 3   Acronym                  1835 non-null   object 
 4   Status                   4156 non-null   object 
 5   Study Results            4156 non-null   object 
 6   Conditions               4156 non-null   object 
 7   Interventions            3524 non-null   object 
 8   Outcome Measures         4124 non-null   object 
 9   Sponsor/Collaborators    4156 non-null   object 
 10  Gender                   4148 non-null   object 
 11  Age                      4156 non-null   object 
 12  Phases                   2368 non-null   object 
 13  Enrollment               4125 non-null   float64
 14  Funded Bys              

In [24]:
ctcovidduplicates = ct_covid['NCT Number'].duplicated()
ctcovidduplicates.value_counts()

False    4156
Name: NCT Number, dtype: int64

In [25]:
ct_covid.shape

(4156, 27)

In [26]:
ct_whole.shape

(360145, 32)

In [27]:
#Identifying the studies from the CT covid dataset that are in the full CT dataset:

ct_whole = ct_whole.assign(CovidStudy=ct_whole.nct_id.isin(ct_covid['NCT Number']))

ct_whole.head()

Unnamed: 0,nct_id,study_first_submitted_date,results_first_submitted_date,last_update_submitted_date,start_date_type,start_date,completion_date_type,completion_date,results_first_posted_date,target_duration,...,start_year,number_of_facilities,registered_in_calendar_year,actual_duration,were_results_reported,months_to_report_results,sponsor_type,lead_or_collaborator,sponsor,CovidStudy
0,NCT04331431,2020-03-28,,2020-03-31,Actual,2008-04-01,Actual,2015-04-08,,,...,2008,,2020,85.0,False,,Other,lead,Sohag University,False
1,NCT04645472,2020-11-25,,2020-11-25,Actual,2020-04-20,Anticipated,2021-04-30,,,...,2020,1.0,2020,,False,,Other,lead,Zhenyu Wu,False
2,NCT04646369,2020-11-20,,2020-11-20,Actual,2020-11-05,Anticipated,2022-04-30,,,...,2020,4.0,2020,,False,,Other,lead,University of Pittsburgh,False
3,NCT04646356,2020-10-20,,2020-11-20,Actual,2020-10-20,Anticipated,2022-09-30,,,...,2020,1.0,2020,,False,,Other,lead,"St. Michael's Hospital, Toronto",False
4,NCT04646330,2020-11-23,,2020-11-23,Anticipated,2020-11-30,Anticipated,2023-12-31,,,...,2020,1.0,2020,,False,,Industry,lead,Akeso,False


In [28]:
print(ct_whole.CovidStudy.value_counts())
356008 + 4137

False    356008
True       4137
Name: CovidStudy, dtype: int64


360145

In [29]:
ct_whole['why_stopped'] = ct_whole['why_stopped'].str.lower()
ct_whole['why_stopped'] = ct_whole['why_stopped'].str.replace(r"[\"\',.]", '')

ct_whole['why_stopped'].value_counts().head(25)

slow accrual                                      416
lack of funding                                   274
low accrual                                       267
lack of enrollment                                184
sponsor decision                                  161
see termination reason in detailed description    150
low enrollment                                    142
slow enrollment                                   137
poor accrual                                      136
covid-19                                          121
no participants enrolled                          110
slow recruitment                                  105
poor enrollment                                   104
business decision                                 101
lack of recruitment                                97
poor recruitment                                   96
lack of accrual                                    90
no funding                                         88
lack of efficacy            

In [30]:
accrual = ct_whole[ct_whole['why_stopped'].str.contains('accru|enroll|recruit', na=False)]
accrual = accrual.why_stopped.value_counts()
#accrual.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\accrual.csv')

#export into csv and edit to make the dictionary: =CONCATENATE(TEXT(A1,"\'@\'"),":'recruitment issues',")

In [31]:
covid = ct_whole[ct_whole['why_stopped'].str.contains('covid|pandemic|corona', na=False)]
covid = covid.why_stopped.value_counts()
#covid.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\covid.csv')

In [32]:
funding = ct_whole[ct_whole['why_stopped'].str.contains('fund', na=False)]
funding = funding.why_stopped.value_counts()
#funding.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\funding.csv')

In [33]:
HCQ = ct_whole[ct_whole['why_stopped'].str.contains('hcq|hydrox', na=False)]
HCQ = HCQ.why_stopped.value_counts()
#HCQ.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\HCQ.csv')

In [44]:
x = {'slow accrual':'recruitment issues',
'low accrual':'recruitment issues',
'lack of enrollment':'recruitment issues',
'low enrollment':'recruitment issues',
'slow enrollment':'recruitment issues',
'poor accrual':'recruitment issues',
'no participants enrolled':'recruitment issues',
'slow recruitment':'recruitment issues',
'poor enrollment':'recruitment issues',
'lack of recruitment':'recruitment issues',
'poor recruitment':'recruitment issues',
'lack of accrual':'recruitment issues',
'insufficient recruitment':'recruitment issues',
'low recruitment':'recruitment issues',
'insufficient enrollment':'recruitment issues',
'recruitment difficulties':'recruitment issues',
'no enrollment':'recruitment issues',
'insufficient accrual':'recruitment issues',
'low recruitment rate':'recruitment issues',
'no accrual':'recruitment issues',
'inadequate enrollment':'recruitment issues',
'no patients enrolled':'recruitment issues',
'inadequate recruitment':'recruitment issues',
'low accrual rate':'recruitment issues',
'no recruitment':'recruitment issues',
'recruitment problems':'recruitment issues',
'due to poor accrual':'recruitment issues',
'failure to recruit':'recruitment issues',
'recruitment issues':'recruitment issues',
'terminated due to slow accrual':'recruitment issues',
'due to slow accrual':'recruitment issues',
'recruitment challenges':'recruitment issues',
'no subjects enrolled':'recruitment issues',
'unable to enroll':'recruitment issues',
'no participants were enrolled':'recruitment issues',
'slow recruitment rate':'recruitment issues',
'unable to recruit':'recruitment issues',
'difficulty recruiting patients':'recruitment issues',
'difficulty in recruitment':'recruitment issues',
'low enrollment rate':'recruitment issues',
'difficulty recruiting':'recruitment issues',
'unable to recruit participants':'recruitment issues',
'difficult recruitment':'recruitment issues',
'no patients recruited':'recruitment issues',
'recruitment difficulty':'recruitment issues',
'difficulty with enrollment':'recruitment issues',
'recruiting problems':'recruitment issues',
'study halted prematurely prior to enrollment of first participant':'recruitment issues',
'recruitment':'recruitment issues',
'difficulty in recruiting patients':'recruitment issues',

'lack of funding':'funding issues',
'no funding':'funding issues',
'funding':'funding issues',
'loss of funding':'funding issues',
'funding unavailable':'funding issues',
'insufficient funding':'funding issues',
'funding withdrawn':'funding issues',
'funding ended':'funding issues',
'funding issues':'funding issues',
'not funded':'funding issues',
'lack of funds':'funding issues',
'funding not available':'funding issues',
'funding terminated':'funding issues',
'funding not received':'funding issues',
'study was not funded':'funding issues',
'funding issue':'funding issues',
'sponsor withdrew funding':'funding issues',
'study not funded':'funding issues',
'funding not obtained':'funding issues',
'funding expired':'funding issues',
'sponsor funding':'funding issues',
'no funding obtained':'funding issues',
'lost funding':'funding issues',
'due to lack of funding':'funding issues',
'funding discontinued':'funding issues',
'ran out of funding':'funding issues',
'insufficient funds':'funding issues',
'funding not secured':'funding issues',
'no funding available':'funding issues',
'inadequate funding':'funding issues',
'break in funding':'funding issues',
'did not receive funding':'funding issues',
'suspended due to lack of funding':'funding issues',
'unable to obtain funding':'funding issues',
'funding stopped':'funding issues',
'funding was withdrawn':'funding issues',
'terminated due to lack of funding':'funding issues',
'the project was not funded':'funding issues',
'funding exhausted':'funding issues',
'funding ceased':'funding issues',
'unable to secure funding':'funding issues',
'funding no longer available':'funding issues',
'study was terminated due to lack of funding':'funding issues',
'unfunded':'funding issues',
'never funded':'funding issues',
'we did not receive proper funding to complete this study':'funding issues',
'the study was stopped due to lack of funding':'funding issues',
'the study was not funded':'funding issues',
'never received funding':'funding issues',
'halt in funding':'funding issues',
   
    
'covid-19':'COVID-19',
'due to covid-19':'COVID-19',
'covid-19 pandemic':'COVID-19',
'currently suspended due to covid-19 policies':'COVID-19',
'suspended due to covid-19':'COVID-19',
'enrollment and study activities are temporarily suspended due to covid-19':'COVID-19',
'temporarily suspended due to pandemic':'COVID-19',
'temporarily paused due to covid-19 and expected to resume this is not a suspension of irb approval':'COVID-19',
'covid':'COVID-19',
'enrollment and other trial activities have temporarily paused due to covid-19 and are expected to resume in the future; this is not a suspension of irb approval':'COVID-19',
'covid-19 restrictions':'COVID-19',
'per irb covid-19 guidance on conducting human subjects research':'COVID-19',
'study visits suspended due to covid-19 pandemic':'COVID-19',
'this study is temporarily suspended due to covid-19 pandemic':'COVID-19',
'temporarily suspended due to covid-19':'COVID-19',
'on hold due to covid-19 outbreak':'COVID-19',
'enrollment and interactions/interventions paused due to covid-19 expected to resume in the future this is not a suspension of irb approval':'COVID-19',
'covid 19':'COVID-19',
'due to covid-19 pandemic':'COVID-19',
'covid19 pandemic':'COVID-19',
'due to covid-19 lockdown':'COVID-19',
'study recruitment temporarily suspended due to covid-19 pandemic':'COVID-19',
'covid19':'COVID-19',
'covid pandemic':'COVID-19',
'enrollment is temporarily suspended due to covid-19':'COVID-19',
'suspended due to covid-19 risks':'COVID-19',
'covid-19 outbreak':'COVID-19',
'suspended (due to covid-19)':'COVID-19',
'enrollment and/or interactions/interventions temporarily paused due to covid-19 and expected to resume in the future this is not a suspension of irb approval':'COVID-19',
'covid-19 hold':'COVID-19',
'temporarily paused due to covid-19 and will resume based on guidance of public health authorities this is not a suspension of irb approval':'COVID-19',
'temporarily paused due to covid-19 and expected to resume':'COVID-19',
'the study is being terminated early as a result of the coronavirus (covid-19) outbreak':'COVID-19',
'due to covid-19 all research activities including recruitment have been paused':'COVID-19',
'suspended due to covid':'COVID-19',
'in response to covid-19':'COVID-19',
'study is temporarily suspended due to covid-19':'COVID-19',
'the study is temporarily suspended due to covid-19 policy':'COVID-19',
'due to covid 19':'COVID-19',
'temporarily paused due to covid-19 and expected to resume in the future; this is not a suspension of irb approval':'COVID-19',
'study initiation delayed due to covid 19':'COVID-19',
'due to covid-19 outbreak study is temporarily suspended':'COVID-19',
'accrual temporarily suspended during covid-19 pandemic':'COVID-19',
'coronavirus pandemic':'COVID-19',
'enrollment is temporarily paused due to covid-19 and are expected to resume in the future this is not a suspension of irb approval':'COVID-19',
'due to covid-19 outbreak':'COVID-19',
'due to coronavirus pandemic':'COVID-19',
'corona':'COVID-19',
'temporary suspension since march 13 due to covid-19 pandemic':'COVID-19',
'study terminated prematurely for financial reasons and covid-19 pandemic':'COVID-19',

'interest in the use of hcq is controversial':'hydroxychloroquine trial',
'concerned about the adverse effects of hcq':'hydroxychloroquine trial',
'suspension of clinical trials with hydroxychloroquine by health authorities':'hydroxychloroquine trial',
'evidence that hcq ineffective loss of hcw interest':'hydroxychloroquine trial',
'recently published data that has shown hcq to be safe when combined with chemo and or radiation at even higher doses than what is used in this study':'hydroxychloroquine trial',
'equipoise for hydroxychloquine was lost':'hydroxychloroquine trial',
'low recruitment rate exacerbated by manufacturing shortage and price increase of hydroxychloroquine':'hydroxychloroquine trial',
'currently almost no patients admitted to dutch hospitals if any effect of hcq is to be expected we need more than 1000 inclusions':'hydroxychloroquine trial',
'trial not started due to accumulating evidence against hcq for covid':'hydroxychloroquine trial',
'emerging evidence does not support the use of hcq for treatment or prevention of sars -cov2 infection no pts have been enrolled':'hydroxychloroquine trial',
'in view of the notices concerning hydroxychloroquine issued by the regulatory authorities we withdraw the protocol':'hydroxychloroquine trial',

#could add more to the below, but depends on time:
     
 'business decision' : 'business decision',
 'business reasons' : 'business decision',
 'company decision' : 'business decision',
 'management decision' : 'business decision',   


 'pi left institution' : 'investigator left institution',
 'pi left the institution' : 'investigator left institution', 
 'investigator left institution' : 'investigator left institution', 


 'administratively complete' : 'complete',

 'futility' : 'futility',

 'terminated' : 'terminated'     

     
    }

ct_whole['why_stopped_map'] = ct_whole['why_stopped'].map(x)

In [45]:
ct_whole['why_stopped_map'].value_counts()

recruitment issues               2803
funding issues                    865
COVID-19                          406
business decision                 216
investigator left institution     132
complete                           60
futility                           38
terminated                         32
hydroxychloroquine trial            5
Name: why_stopped_map, dtype: int64

In [46]:
# export to csv for use in another notebook, can use the true/false column regarding if it is a covid study:

ct_whole.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\CT_from_python.csv', index=False)

In [37]:
ct_no_covid = ct_whole[ct_whole['CovidStudy'] == False]
ct_no_covid.head()

Unnamed: 0,nct_id,study_first_submitted_date,results_first_submitted_date,last_update_submitted_date,start_date_type,start_date,completion_date_type,completion_date,results_first_posted_date,target_duration,...,number_of_facilities,registered_in_calendar_year,actual_duration,were_results_reported,months_to_report_results,sponsor_type,lead_or_collaborator,sponsor,CovidStudy,why_stopped_map
0,NCT04331431,2020-03-28,,2020-03-31,Actual,2008-04-01,Actual,2015-04-08,,,...,,2020,85.0,False,,Other,lead,Sohag University,False,
1,NCT04645472,2020-11-25,,2020-11-25,Actual,2020-04-20,Anticipated,2021-04-30,,,...,1.0,2020,,False,,Other,lead,Zhenyu Wu,False,
2,NCT04646369,2020-11-20,,2020-11-20,Actual,2020-11-05,Anticipated,2022-04-30,,,...,4.0,2020,,False,,Other,lead,University of Pittsburgh,False,
3,NCT04646356,2020-10-20,,2020-11-20,Actual,2020-10-20,Anticipated,2022-09-30,,,...,1.0,2020,,False,,Other,lead,"St. Michael's Hospital, Toronto",False,
4,NCT04646330,2020-11-23,,2020-11-23,Anticipated,2020-11-30,Anticipated,2023-12-31,,,...,1.0,2020,,False,,Industry,lead,Akeso,False,


In [38]:
#checking that no studies marked true for covidstudy remain:

ct_no_covid.CovidStudy.value_counts()

False    356008
Name: CovidStudy, dtype: int64

In [39]:
#saving the dataset with the covid studies removed to csv to use in another notebook:

#ct_no_covid.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\ct_no_covid_from_python.csv', index=False)
ct_no_covid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356008 entries, 0 to 360144
Data columns (total 34 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   nct_id                        356008 non-null  object        
 1   study_first_submitted_date    356008 non-null  object        
 2   results_first_submitted_date  46326 non-null   object        
 3   last_update_submitted_date    356008 non-null  object        
 4   start_date_type               162203 non-null  object        
 5   start_date                    351144 non-null  datetime64[ns]
 6   completion_date_type          331595 non-null  object        
 7   completion_date               338435 non-null  object        
 8   results_first_posted_date     46326 non-null   object        
 9   target_duration               6697 non-null    object        
 10  study_type                    356008 non-null  object        
 11  brief_title  

In [40]:
#saving the other datasets to csv to use in another notebook:

#interventions.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\ct_interventions_from_python.csv', index=False)
#conditions.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\ct_conditions_from_python.csv', index=False)
#countries.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\ct_countries_from_python.csv', index=False)
#sponsors.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\ct_sponsors_from_python.csv', index=False)


In [42]:
why_stopped = ct_whole[['nct_id', 'why_stopped', 'CovidStudy']]

In [None]:
why_stopped.CovidStudy.value_counts()

In [None]:
why_stopped.why_stopped_lower.value_counts().head(25)

In [None]:
why_stopped.info()

In [None]:
why_stopped = why_stopped.dropna()
why_stopped.info()

In [None]:
#trying to find a way to rename values in the why_stopped column to all be the same so they will aggregate...

#list comprehension
#make a loop to populate a new column based on the column? Would that be easier?

#why_stopped.loc
why_stopped = why_stopped.loc[(why_stopped.why_stopped == why_stopped.why_stopped.str.contains('enroll'))] = 'low/slow accrual'

#df[df['A'].astype(str).str.contains("Hello|Britain")]
#df.loc[(df.Event == 'Dance'),'Event']='Hip-Hop'

why_stopped

In [None]:
why_stopped_covid = why_stopped[why_stopped.CovidStudy == True]
why_stopped_covid.CovidStudy.value_counts()

In [None]:
why_stopped_covid.info()

In [None]:
why_stopped_no_covid = why_stopped[why_stopped.CovidStudy == False]
why_stopped_no_covid.CovidStudy.value_counts()

In [None]:
why_stopped_covid = why_stopped_covid.why_stopped_lower.value_counts()
why_stopped_covid

In [None]:
why_stopped_no_covid = why_stopped_no_covid.why_stopped_lower.value_counts()
why_stopped_no_covid

In [None]:
#why_stopped_covid.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\why_stopped_covid.csv')
#why_stopped_no_covid.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\why_stopped_no_covid.csv')