In [1]:
#import packages needed:

import os

import pandas as pd
import psycopg2
from sqlalchemy import create_engine

In [2]:
# Load the two study datasets that are compared and explored in this notebook.

# COVID-only trials from clinicaltrials.gov (downloaded 12/10/20).
ct_covid = pd.read_csv('../data/all_covid_studies_ctgov.csv')

# Complete clinicaltrials.gov dataset, read from the AACT PostgreSQL database
# (may be saved to CSV for later use).
#
# The connection URL can be overridden with the AACT_DB_URL environment variable so that
# credentials do not have to be written into the notebook; the fallback is the original
# local connection.
# The dialect has to be spelled "postgresql": the legacy "postgres" alias was removed in
# SQLAlchemy 1.4, where "postgres+psycopg2://" fails with NoSuchModuleError.
engine = create_engine(
    os.environ.get(
        "AACT_DB_URL",
        "postgresql+psycopg2://postgres:postgres@localhost:5432/AACT",
    )
)

# Use the connection to pull the full `studies` table into pandas.
ct_all = pd.read_sql("SELECT * FROM studies;", con=engine)

ct_all.head()


Unnamed: 0,nct_id,nlm_download_date_description,study_first_submitted_date,results_first_submitted_date,disposition_first_submitted_date,last_update_submitted_date,study_first_submitted_qc_date,study_first_posted_date,study_first_posted_date_type,results_first_submitted_qc_date,...,is_us_export,biospec_retention,biospec_description,ipd_time_frame,ipd_access_criteria,ipd_url,plan_to_share_ipd,plan_to_share_ipd_description,created_at,updated_at
0,NCT04331431,ClinicalTrials.gov processed this data on Nove...,2020-03-28,,,2020-03-31,2020-03-31,2020-04-02,Actual,,...,,,,,,,,,2020-12-01 07:41:58.458186,2020-12-01 07:41:58.458186
1,NCT04645472,ClinicalTrials.gov processed this data on Nove...,2020-11-25,,,2020-11-25,2020-11-25,2020-11-27,Actual,,...,,,,,,,No,,2020-12-01 06:44:46.185147,2020-12-01 06:44:46.185147
2,NCT04646369,ClinicalTrials.gov processed this data on Nove...,2020-11-20,,,2020-11-20,2020-11-20,2020-11-27,Actual,,...,,,,These data will be released to the NDCT soon a...,"In addition to public access to the NDCT, data...",,Yes,All requests for study data will follow NIMH's...,2020-12-01 06:44:35.512935,2020-12-01 06:44:35.512935
3,NCT04646356,ClinicalTrials.gov processed this data on Nove...,2020-10-20,,,2020-11-20,2020-11-20,2020-11-27,Actual,,...,False,,,,,,No,,2020-12-01 06:44:36.897833,2020-12-01 06:44:36.897833
4,NCT04646330,ClinicalTrials.gov processed this data on Nove...,2020-11-23,,,2020-11-23,2020-11-23,2020-11-27,Actual,,...,,,,,,,,,2020-12-01 06:44:37.204421,2020-12-01 06:44:37.204421


In [3]:
# Check the columns of the full clinicaltrials.gov data (names, non-null counts, dtypes)
# to decide which ones to keep for the analysis:

ct_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360145 entries, 0 to 360144
Data columns (total 64 columns):
 #   Column                               Non-Null Count   Dtype         
---  ------                               --------------   -----         
 0   nct_id                               360145 non-null  object        
 1   nlm_download_date_description        360145 non-null  object        
 2   study_first_submitted_date           360145 non-null  object        
 3   results_first_submitted_date         46342 non-null   object        
 4   disposition_first_submitted_date     7523 non-null    object        
 5   last_update_submitted_date           360145 non-null  object        
 6   study_first_submitted_qc_date        360145 non-null  object        
 7   study_first_posted_date              360145 non-null  object        
 8   study_first_posted_date_type         360145 non-null  object        
 9   results_first_submitted_qc_date      46342 non-null   object        
 

In [4]:
# Duplicate check: flag every nct_id that repeats an earlier row, then tally the flags.
# A result containing only False means each study appears exactly once.

ctduplicates = ct_all['nct_id'].duplicated(keep='first')
ctduplicates.value_counts()

False    360145
Name: nct_id, dtype: int64

In [5]:
# Narrow the full studies table to the columns this analysis focuses on; more detail is
# pulled in from the other SQL tables once this is done.

ct_all_sub = ct_all.loc[:, [
    # identifier and submission / update dates
    'nct_id', 'study_first_submitted_date', 'results_first_submitted_date',
    'last_update_submitted_date',
    # study timeline
    'start_date_type', 'start_date', 'completion_date_type', 'completion_date',
    'results_first_posted_date', 'target_duration',
    # description and status
    'study_type', 'brief_title', 'official_title', 'overall_status', 'phase',
    # enrollment and source
    'enrollment', 'enrollment_type', 'source',
    # outcome notes
    'limitations_and_caveats', 'why_stopped',
    # access / regulatory flags
    'has_expanded_access', 'is_fda_regulated_drug', 'is_fda_regulated_device',
]]

ct_all_sub.head(2)

Unnamed: 0,nct_id,study_first_submitted_date,results_first_submitted_date,last_update_submitted_date,start_date_type,start_date,completion_date_type,completion_date,results_first_posted_date,target_duration,...,overall_status,phase,enrollment,enrollment_type,source,limitations_and_caveats,why_stopped,has_expanded_access,is_fda_regulated_drug,is_fda_regulated_device
0,NCT04331431,2020-03-28,,2020-03-31,Actual,2008-04-01,Actual,2015-04-08,,,...,Completed,,24.0,Actual,Sohag University,,,False,False,False
1,NCT04645472,2020-11-25,,2020-11-25,Actual,2020-04-20,Anticipated,2021-04-30,,,...,Recruiting,,70.0,Anticipated,Sun Yat-Sen Memorial Hospital of Sun Yat-Sen U...,,,False,False,False


In [27]:
# Pull the interventions, conditions, sponsors, countries and calculated_values tables
# from SQL, each in full, through the same database connection.

interventions, conditions, sponsors, countries, calculated_values = (
    pd.read_sql(f"SELECT * FROM {table};", con=engine)
    for table in ("interventions", "conditions", "sponsors", "countries", "calculated_values")
)

# Report (rows, columns) for the first four tables; the last shape is the cell output.
print(
    *(frame.shape for frame in (interventions, conditions, sponsors, countries)),
    sep="\n",
)
calculated_values.shape

(618521, 5)
(607333, 4)
(574931, 5)
(510391, 4)


(360145, 19)

In [32]:
# Interventions: count how many rows repeat an nct_id already seen (a study can list
# several interventions), then keep only the columns needed for the analysis.
iduplicates = interventions.nct_id.duplicated()
print(iduplicates.value_counts())

# Rename first, then select by the new name. rename() ignores labels that are absent,
# so re-running this cell after `name` has already become `intervention` no longer
# raises KeyError (the original selected 'name' before renaming it away).
interventions = (
    interventions
    .rename(columns={'name': 'intervention'})
    .loc[:, ['nct_id', 'intervention_type', 'intervention']]
)
interventions.head()

False    322164
True     296357
Name: nct_id, dtype: int64


Unnamed: 0,nct_id,intervention_type,intervention
0,NCT04423627,Drug,Placebo
1,NCT04645888,Procedure,Impacted tooth surgery
2,NCT04646369,Behavioral,"""screening as usual"""
3,NCT04646369,Behavioral,Screening Wizard 2.0
4,NCT04646369,Behavioral,Screening Wizard 2.0 + SOVA


In [33]:
# Conditions: count how many rows repeat an nct_id already seen (a study can list
# several conditions), then keep the study id and the lower-cased condition name.
cduplicates = conditions.nct_id.duplicated()
print(cduplicates.value_counts())

# Rename first, then select by the new name. rename() ignores labels that are absent,
# so the cell can be re-run after `downcase_name` has already become `condition`
# without raising KeyError.
conditions = (
    conditions
    .rename(columns={'downcase_name': 'condition'})
    .loc[:, ['nct_id', 'condition']]
)
conditions.head()

False    359277
True     248056
Name: nct_id, dtype: int64


Unnamed: 0,nct_id,condition
0,NCT02892617,modic 1
1,NCT02889016,pans
2,NCT02887209,hiv
3,NCT02882607,hiv
4,NCT02882256,head injury


In [35]:
# Sponsors: count how many rows repeat an nct_id already seen (a study can have a lead
# sponsor plus collaborators), then keep and rename the columns needed for the analysis.
sduplicates = sponsors.nct_id.duplicated()
print(sduplicates.value_counts())

# Rename first, then select by the new names. rename() ignores labels that are absent,
# so the cell can be re-run after the columns have already been renamed without
# raising KeyError.
sponsors = (
    sponsors
    .rename(columns={'agency_class': 'sponsor_type', 'name': 'sponsor'})
    .loc[:, ['nct_id', 'sponsor_type', 'lead_or_collaborator', 'sponsor']]
)
sponsors.head()

False    360145
True     214786
Name: nct_id, dtype: int64


Unnamed: 0,nct_id,sponsor_type,lead_or_collaborator,sponsor
0,NCT04646369,Other,lead,University of Pittsburgh
1,NCT04646369,Other,collaborator,Kaiser Foundation Research Institute
2,NCT04646369,NIH,collaborator,National Institute of Mental Health (NIMH)
3,NCT04646356,Other,lead,"St. Michael's Hospital, Toronto"
4,NCT04646356,U.S. Fed,collaborator,United States Department of Defense


In [36]:
# Tally sponsor rows by role. Idea: keeping only the "lead" rows should line up with the
# main study dataset (one lead per study) - to be confirmed before relying on it.

sponsors['lead_or_collaborator'].value_counts()

lead            360145
collaborator    214786
Name: lead_or_collaborator, dtype: int64

In [39]:
# Countries: count how many rows repeat an nct_id already seen (a study can list
# several countries), then keep and rename the columns needed for the analysis.
cnduplicates = countries.nct_id.duplicated()
print(cnduplicates.value_counts())

# Rename first, then select by the new names. rename() ignores labels that are absent,
# so the cell can be re-run after the columns have already been renamed without
# raising KeyError.
countries = (
    countries
    .rename(columns={'name': 'country', 'removed': 'country_removed'})
    .loc[:, ['nct_id', 'country', 'country_removed']]
)
countries.head()

False    323475
True     186916
Name: nct_id, dtype: int64


Unnamed: 0,nct_id,country,country_removed
0,NCT04646369,United States,
1,NCT04646356,Canada,
2,NCT04646330,China,
3,NCT04646317,Pakistan,
4,NCT04646278,"Korea, Republic of",


In [38]:
# TODO: not sure what "removed" means - will look into.
# The preceding cell renames `removed` to `country_removed`, so the renamed column is
# used here; `countries.removed` raises AttributeError when the notebook runs top to bottom.

countries.country_removed.value_counts()

True    29944
Name: removed, dtype: int64

In [45]:
# Calculated values: tally repeated nct_id values, then trim the table to the
# per-study metrics used in the analysis.
cvduplicates = calculated_values['nct_id'].duplicated()
print(cvduplicates.value_counts())

calculated_values = calculated_values.loc[:, [
    'nct_id',
    'number_of_facilities',
    'registered_in_calendar_year',
    'actual_duration',
    'were_results_reported',
    'months_to_report_results',
]]
calculated_values.head()

False    360145
Name: nct_id, dtype: int64


Unnamed: 0,nct_id,number_of_facilities,registered_in_calendar_year,actual_duration,were_results_reported,months_to_report_results
0,NCT00506311,1.0,2007,58.0,False,
1,NCT00652496,1.0,2008,4.0,False,
2,NCT00581698,1.0,2007,227.0,False,
3,NCT00643214,1.0,2008,1.0,False,
4,NCT00632684,1.0,2008,31.0,False,


# Reshape interventions from long (one row per study/intervention) to wide (one row per
# study, one column per intervention position).
# see https://stackoverflow.com/questions/22798934/pandas-long-to-wide-reshape-by-two-variables

# Running position of each intervention within its study: 0, 1, 2, ...
interventions['idx'] = interventions.groupby('nct_id').cumcount()

# The interventions cell above renames `name` to `intervention`, so that is the column
# to spread out; pivoting on 'name' raises KeyError when the notebook runs top to bottom.
tmp = []
for var in ['intervention']:
    # Wide column label for this row, e.g. "intervention_0", "intervention_1", ...
    interventions['tmp_idx'] = var + '_' + interventions.idx.astype(str)
    tmp.append(interventions.pivot(index='nct_id', columns='tmp_idx', values=var))

# Side-by-side join of the pivoted frames (a single frame while only one variable is listed).
reshape = pd.concat(tmp, axis=1)

reshape.head()

In [25]:
# Stack the interventions rows underneath the study rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in
# pandas 2.0; with default arguments it behaves the same way (original row labels kept,
# columns unioned without sorting).
# NOTE(review): this appends rows rather than joining on nct_id, so the study columns are
# empty on the intervention rows and vice versa - confirm that this is what is wanted.
all_interventions = pd.concat([ct_all_sub, interventions])
print(all_interventions.shape)
print(all_interventions.nct_id.nunique())
all_interventions

(978666, 26)
360145


Unnamed: 0,nct_id,study_first_submitted_date,results_first_submitted_date,last_update_submitted_date,start_date_type,start_date,completion_date_type,completion_date,results_first_posted_date,target_duration,...,enrollment_type,source,limitations_and_caveats,why_stopped,has_expanded_access,is_fda_regulated_drug,is_fda_regulated_device,CovidStudy,intervention_type,name
0,NCT04331431,2020-03-28,,2020-03-31,Actual,2008-04-01,Actual,2015-04-08,,,...,Actual,Sohag University,,,False,False,False,False,,
1,NCT04645472,2020-11-25,,2020-11-25,Actual,2020-04-20,Anticipated,2021-04-30,,,...,Anticipated,Sun Yat-Sen Memorial Hospital of Sun Yat-Sen U...,,,False,False,False,False,,
2,NCT04646369,2020-11-20,,2020-11-20,Actual,2020-11-05,Anticipated,2022-04-30,,,...,Anticipated,University of Pittsburgh,,,False,False,False,False,,
3,NCT04646356,2020-10-20,,2020-11-20,Actual,2020-10-20,Anticipated,2022-09-30,,,...,Anticipated,"St. Michael's Hospital, Toronto",,,False,False,False,False,,
4,NCT04646330,2020-11-23,,2020-11-23,Anticipated,2020-11-30,Anticipated,2023-12-31,,,...,Anticipated,Akeso,,,False,False,False,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618516,NCT00027170,,,,,,,,,,...,,,,,,,,,Device,Siemens MRI scanner
618517,NCT00001521,,,,,,,,,,...,,,,,,,,,Drug,Flutamide
618518,NCT00001521,,,,,,,,,,...,,,,,,,,,Drug,Letrozole
618519,NCT00001521,,,,,,,,,,...,,,,,,,,,Drug,Hydrocortisone


In [10]:
# Attach each intervention to its study record; the inner join keeps only studies that
# have at least one intervention row.
ct_interventions = pd.merge(ct_all_sub, interventions, on='nct_id', how='inner')

# Row/column count of the joined frame, then the number of distinct studies it covers.
print(ct_interventions.shape, ct_interventions['nct_id'].nunique(), sep='\n')
ct_interventions

(618521, 25)
322164


Unnamed: 0,nct_id,study_first_submitted_date,results_first_submitted_date,last_update_submitted_date,start_date_type,start_date,completion_date_type,completion_date,results_first_posted_date,target_duration,...,enrollment,enrollment_type,source,limitations_and_caveats,why_stopped,has_expanded_access,is_fda_regulated_drug,is_fda_regulated_device,intervention_type,name
0,NCT04331431,2020-03-28,,2020-03-31,Actual,2008-04-01,Actual,2015-04-08,,,...,24.0,Actual,Sohag University,,,False,False,False,Procedure,Spinal cord tumor resection
1,NCT04645472,2020-11-25,,2020-11-25,Actual,2020-04-20,Anticipated,2021-04-30,,,...,70.0,Anticipated,Sun Yat-Sen Memorial Hospital of Sun Yat-Sen U...,,,False,False,False,Device,CT-Ultrasound
2,NCT04646369,2020-11-20,,2020-11-20,Actual,2020-11-05,Anticipated,2022-04-30,,,...,100.0,Anticipated,University of Pittsburgh,,,False,False,False,Behavioral,"""screening as usual"""
3,NCT04646369,2020-11-20,,2020-11-20,Actual,2020-11-05,Anticipated,2022-04-30,,,...,100.0,Anticipated,University of Pittsburgh,,,False,False,False,Behavioral,Screening Wizard 2.0
4,NCT04646369,2020-11-20,,2020-11-20,Actual,2020-11-05,Anticipated,2022-04-30,,,...,100.0,Anticipated,University of Pittsburgh,,,False,False,False,Behavioral,Screening Wizard 2.0 + SOVA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618516,NCT00058188,2003-04-07,2020-10-23,2020-11-18,Actual,2003-03-31,Actual,2008-11-30,2020-11-19,,...,53.0,Actual,Northwestern University,,Closed by the research committee,False,,,Dietary Supplement,cholecalciferol
618517,NCT00058188,2003-04-07,2020-10-23,2020-11-18,Actual,2003-03-31,Actual,2008-11-30,2020-11-19,,...,53.0,Actual,Northwestern University,,Closed by the research committee,False,,,Drug,calcium gluconate
618518,NCT00058188,2003-04-07,2020-10-23,2020-11-18,Actual,2003-03-31,Actual,2008-11-30,2020-11-19,,...,53.0,Actual,Northwestern University,,Closed by the research committee,False,,,Drug,zoledronic acid
618519,NCT00047853,2002-10-22,,2020-12-04,Actual,2002-11-04,Anticipated,2029-09-30,,,...,1872.0,Anticipated,National Institutes of Health Clinical Center ...,,,False,,False,Device,Shock device


In [13]:
# Number of studies per value of the `source` column, most frequent first.
ct_all_sub['source'].value_counts()

National Institutes of Health Clinical Center (CC)    4525
National Cancer Institute (NCI)                       3470
GlaxoSmithKline                                       3418
Pfizer                                                2990
Novartis                                              2876
                                                      ... 
Tenet Healthcare Corporation                             1
Emergency Medicine Association of Turkey                 1
Instituto de Ciencias del Corazon                        1
WestCare Pacific Islands, Inc.                           1
Prokarium Ltd                                            1
Name: source, Length: 21837, dtype: int64

In [14]:
# Number of COVID-export studies per distinct `Conditions` string, most frequent first.
ct_covid['Conditions'].value_counts()

COVID-19                                                                            628
Covid19                                                                             370
COVID                                                                                98
Covid-19                                                                             74
Coronavirus Infection                                                                46
                                                                                   ... 
Coronavirus Infection|Pneumonia, Viral                                                1
Covid-19|Critical Illness|Post Intensive Care Unit Syndrome|Muscle Weakness           1
Infectious Disease|COVID-19                                                           1
Coronavirus|Coronavirus Sars-Associated as Cause of Disease Classified Elsewhere      1
Depression|Generalized Anxiety|Health Anxiety                                         1
Name: Conditions, Length: 2159, 

In [15]:
# Preview the first rows of the COVID-only export.
ct_covid.head()

Unnamed: 0,Rank,NCT Number,Title,Acronym,Status,Study Results,Conditions,Interventions,Outcome Measures,Sponsor/Collaborators,...,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,URL
0,1,NCT04372602,Duvelisib to Combat COVID-19,,Recruiting,No Results Available,COVID-19,Drug: Duvelisib|Procedure: Peripheral blood dr...,Overall survival|Length of hospital stay|Lengt...,Washington University School of Medicine|Veras...,...,202007009,"October 12, 2020","November 30, 2021","April 30, 2022","May 4, 2020",,"November 9, 2020","Washington University School of Medicine, Sain...",,https://ClinicalTrials.gov/show/NCT04372602
1,2,NCT04364698,Observational Cohort of COVID-19 Patients at R...,COVID-RPC,Recruiting,No Results Available,COVID-19,,"clinical, biological and radiological characte...",Assistance Publique - Hôpitaux de Paris,...,20SBS-COVID-RPC,"May 7, 2020",June 2020,June 2020,"April 28, 2020",,"May 14, 2020","Department of Infectiology, Raymond Poincaré H...",,https://ClinicalTrials.gov/show/NCT04364698
2,3,NCT04482621,Decitabine for Coronavirus (COVID-19) Pneumoni...,DART,Recruiting,No Results Available,COVID-19,Drug: Decitabine|Other: Placebo Saline,Change in clinical state as assessed by a 6-po...,Johns Hopkins University,...,IRB00247544,"September 14, 2020",May 2021,July 2021,"July 22, 2020",,"September 25, 2020","Johns Hopkins University, Baltimore, Maryland,...",,https://ClinicalTrials.gov/show/NCT04482621
3,4,NCT04459637,COVID-19 Surveillance Based on Smart Wearable ...,COVID-19SWD,Not yet recruiting,No Results Available,COVID-19,,Deterioration of the condition|Mortality|The i...,Peking University First Hospital,...,2020055-0615,"July 1, 2020","March 10, 2021","March 10, 2021","July 7, 2020",,"July 7, 2020","Peking University First Hospital, Beijing, Bei...",,https://ClinicalTrials.gov/show/NCT04459637
4,5,NCT04425538,A Phase 2 Trial of Infliximab in Coronavirus D...,,Recruiting,No Results Available,COVID-19,Drug: Infliximab,Time to improvement in oxygenation|28-day mort...,Tufts Medical Center|National Institutes of He...,...,STUDY00000564,"June 1, 2020",September 2020,December 2020,"June 11, 2020",,"June 11, 2020","Tufts Medical Center, Boston, Massachusetts, U...",,https://ClinicalTrials.gov/show/NCT04425538


In [26]:
# Column names, non-null counts and dtypes of the COVID-only export.
ct_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4156 entries, 0 to 4155
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Rank                     4156 non-null   int64  
 1   NCT Number               4156 non-null   object 
 2   Title                    4156 non-null   object 
 3   Acronym                  1835 non-null   object 
 4   Status                   4156 non-null   object 
 5   Study Results            4156 non-null   object 
 6   Conditions               4156 non-null   object 
 7   Interventions            3524 non-null   object 
 8   Outcome Measures         4124 non-null   object 
 9   Sponsor/Collaborators    4156 non-null   object 
 10  Gender                   4148 non-null   object 
 11  Age                      4156 non-null   object 
 12  Phases                   2368 non-null   object 
 13  Enrollment               4125 non-null   float64
 14  Funded Bys              

In [16]:
# Duplicate check on the COVID export: flag any NCT Number that repeats an earlier row
# and tally the flags (only False means every study is listed once).
ctcovidduplicates = ct_covid.loc[:, 'NCT Number'].duplicated(keep='first')
ctcovidduplicates.value_counts()

False    4156
Name: NCT Number, dtype: int64

In [17]:
# (rows, columns) of the COVID-only export.
ct_covid.shape

(4156, 27)

In [18]:
# (rows, columns) of the column-reduced full studies table.
ct_all_sub.shape

(360145, 23)

In [19]:
# Flag the studies in the full clinicaltrials.gov dataset that also appear in the COVID
# export: CovidStudy is True when the study's nct_id is one of the export's NCT Numbers.

ct_all_sub = ct_all_sub.assign(
    CovidStudy=lambda studies: studies['nct_id'].isin(ct_covid['NCT Number'])
)

ct_all_sub.head()

Unnamed: 0,nct_id,study_first_submitted_date,results_first_submitted_date,last_update_submitted_date,start_date_type,start_date,completion_date_type,completion_date,results_first_posted_date,target_duration,...,phase,enrollment,enrollment_type,source,limitations_and_caveats,why_stopped,has_expanded_access,is_fda_regulated_drug,is_fda_regulated_device,CovidStudy
0,NCT04331431,2020-03-28,,2020-03-31,Actual,2008-04-01,Actual,2015-04-08,,,...,,24.0,Actual,Sohag University,,,False,False,False,False
1,NCT04645472,2020-11-25,,2020-11-25,Actual,2020-04-20,Anticipated,2021-04-30,,,...,,70.0,Anticipated,Sun Yat-Sen Memorial Hospital of Sun Yat-Sen U...,,,False,False,False,False
2,NCT04646369,2020-11-20,,2020-11-20,Actual,2020-11-05,Anticipated,2022-04-30,,,...,,100.0,Anticipated,University of Pittsburgh,,,False,False,False,False
3,NCT04646356,2020-10-20,,2020-11-20,Actual,2020-10-20,Anticipated,2022-09-30,,,...,Phase 2,30.0,Anticipated,"St. Michael's Hospital, Toronto",,,False,False,False,False
4,NCT04646330,2020-11-23,,2020-11-23,Anticipated,2020-11-30,Anticipated,2023-12-31,,,...,Phase 1/Phase 2,120.0,Anticipated,Akeso,,,False,False,False,False


In [20]:
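# Export to CSV for use in another notebook; the True/False CovidStudy column records
# whether each study is a COVID study (export currently disabled).
# NOTE(review): `all_ct_studies` is not defined in the visible cells - presumably
# `ct_all_sub` (which carries CovidStudy) is the frame meant; confirm before re-enabling.
# The path is also absolute and machine-specific.

#all_ct_studies.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\CT_from_python.csv', index=False)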

In [21]:
# How many studies are flagged as COVID studies vs. not.
print(ct_all_sub.CovidStudy.value_counts())

# Sanity check: the two counts should add up to the total number of studies. The sum is
# computed from the data instead of re-typing the printed numbers, so it cannot go stale
# when the underlying tables are refreshed.
int(ct_all_sub.CovidStudy.value_counts().sum())

False    356008
True       4137
Name: CovidStudy, dtype: int64


360145

In [22]:
# Keep only the studies that are not flagged as COVID studies.
# CovidStudy is the boolean flag produced by isin(), so negating it selects the False rows.
ct_no_covid = ct_all_sub.loc[~ct_all_sub['CovidStudy']]
ct_no_covid.head()

Unnamed: 0,nct_id,study_first_submitted_date,results_first_submitted_date,last_update_submitted_date,start_date_type,start_date,completion_date_type,completion_date,results_first_posted_date,target_duration,...,phase,enrollment,enrollment_type,source,limitations_and_caveats,why_stopped,has_expanded_access,is_fda_regulated_drug,is_fda_regulated_device,CovidStudy
0,NCT04331431,2020-03-28,,2020-03-31,Actual,2008-04-01,Actual,2015-04-08,,,...,,24.0,Actual,Sohag University,,,False,False,False,False
1,NCT04645472,2020-11-25,,2020-11-25,Actual,2020-04-20,Anticipated,2021-04-30,,,...,,70.0,Anticipated,Sun Yat-Sen Memorial Hospital of Sun Yat-Sen U...,,,False,False,False,False
2,NCT04646369,2020-11-20,,2020-11-20,Actual,2020-11-05,Anticipated,2022-04-30,,,...,,100.0,Anticipated,University of Pittsburgh,,,False,False,False,False
3,NCT04646356,2020-10-20,,2020-11-20,Actual,2020-10-20,Anticipated,2022-09-30,,,...,Phase 2,30.0,Anticipated,"St. Michael's Hospital, Toronto",,,False,False,False,False
4,NCT04646330,2020-11-23,,2020-11-23,Anticipated,2020-11-30,Anticipated,2023-12-31,,,...,Phase 1/Phase 2,120.0,Anticipated,Akeso,,,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360140,NCT00033137,2002-04-05,,2020-12-05,Actual,2002-05-13,,,,,...,,950.0,Anticipated,National Institutes of Health Clinical Center ...,,,False,,,False
360141,NCT00032513,2002-03-22,,2020-12-04,Actual,2002-04-16,,,,,...,,300.0,Anticipated,National Institutes of Health Clinical Center ...,,,False,,,False
360142,NCT00029445,2002-01-11,,2020-12-05,Actual,2001-08-09,,,,,...,,400.0,Anticipated,National Institutes of Health Clinical Center ...,,,False,,,False
360143,NCT00028340,2001-12-21,,2020-12-04,Actual,2003-02-20,,,,,...,,214.0,Anticipated,National Institutes of Health Clinical Center ...,,,False,,,False


In [23]:
# Save the dataset with the COVID studies removed to CSV for use in another notebook
# (export currently disabled; NOTE(review): the path below is absolute and machine-specific):

#ct_no_covid.to_csv(r'C:\Users\kkosf\Documents\nss\projects\capstone\data\ct_no_covid_from_python.csv', index=False)
# Column names, non-null counts and dtypes of the non-COVID subset.
ct_no_covid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 356008 entries, 0 to 360144
Data columns (total 24 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   nct_id                        356008 non-null  object 
 1   study_first_submitted_date    356008 non-null  object 
 2   results_first_submitted_date  46326 non-null   object 
 3   last_update_submitted_date    356008 non-null  object 
 4   start_date_type               162203 non-null  object 
 5   start_date                    351144 non-null  object 
 6   completion_date_type          331595 non-null  object 
 7   completion_date               338435 non-null  object 
 8   results_first_posted_date     46326 non-null   object 
 9   target_duration               6697 non-null    object 
 10  study_type                    356008 non-null  object 
 11  brief_title                   356008 non-null  object 
 12  official_title                345897 non-nul