In [1]:
import statsmodels.api as sm

from lifelines import CoxPHFitter, KaplanMeierFitter
from lifelines.utils import concordance_index
from lifelines import KaplanMeierFitter
from lifelines import NelsonAalenFitter

from scipy import stats
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt

  from pandas import Int64Index as NumericIndex


In [2]:
# Provided data
data = """
survival	flchain
AER	DoctorVisits
AER	HealthInsurance
AER	Medicaid1986
asaur	hepatoCellular
asaur	pancreatic
asaur	pancreatic2
asaur	pharmacoSmoking
asaur	prostateSurvival
betareg	StressAnxiety
boot	aids
boot	cd4
boot	downs.bc
boot	melanoma
boot	neuro
boot	nodal
boot	remission
boot	survival
boot	urine
carData	Blackmore
carData	Davis
carData	KosteckiDillon
carData	Leinhardt
carData	LoBD
carData	Robey
carData	WeightLoss
carData	Wong
causaldata	abortion
causaldata	nhefs
causaldata	nhefs_complete
causaldata	organ_donations
causaldata	thornton_hiv
survival	nafld2 (nafld)
survival	nafld3 (nafld)
survival	survexp.us (survexp)
openintro	simpsons_paradox_covid
AER	Fertility
openintro	avandia
openintro	mammogram
stevedata	gss_abortion
stevedata	mm_nhis
survival	survexp.usr (survexp)
AER	Fertility2
Ecdat	HI
sampleSelection	RandHIE
Ecdat	DoctorContacts
openintro	health_coverage
dslabs	us_contagious_diseases
medicaldata	covid_testing
geepack	muscatine
openintro	yrbss
Stat2Data	YouthRisk
stevedata	wvs_usa_abortion
survival	survexp.mn (survexp)
mstate	ebmt2
mosaicData	Births
tidyr	who
tidyr	who2
DAAG	monica
openintro	smallpox
DAAG	crickete
stevedata	anes_prochoice
Ecdat	MedExp
mosaicData	BirthsSSA
Ecdat	DoctorAUS
Ecdat	OFP
heplots	AddHealth
carData	KosteckiDillon
tidyr	population
survival	myeloma (cancer)
mosaicData	BirthsCDC
survival	rotterdam (cancer)
MASS	Aids2
mlmRev	guPrenat
mstate	ebmt4
openintro	biontech_adolescents
mstate	ebmt3
mstate	prothr
geepack	ohio
modeldata	cells
DAAG	dengue
mstate	ebmt1
survival	pbcseq (pbc)
mlmRev	Contraception
survival	colon (cancer)
openintro	smoking
causaldata	nhefs
causaldata	nhefs_complete
openintro	healthcare_law_survey
COUNT	medpar
mosaicData	HELPfull
openintro	hfi
Stat2Data	NCbirths
survival	mgus2 (cancer)
Ecdat	Cigar
plm	Cigar
survival	udca2 (udca)
Stat2Data	Whickham2
DAAG	mifem
HistData	Snow.streets
mosaicData	Gestation
openintro	babies
MASS	OME
openintro	sleep_deprivation
openintro	ebola_survey
openintro	scotus_healthcare
openintro	births14
openintro	ncbirths
AER	Medicaid1986
carData	Blackmore
HistData	GaltonFamilies
multgee	arthritis
medicaldata	opt
survival	transplant
sampleSelection	Smoke
HistData	CholeraDeaths1849
MASS	biopsy
openintro	diabetes2
survival	gbsg (cancer)
survival	myeloid (cancer)
texmex	liver
HLMdiag	autism
medicaldata	indo_rct
quantreg	uis
boot	aids
dslabs	brca
HSAUR	respiratory
Ecdat	Cigarette
openintro	fastfood
openintro	bdims
dslabs	outlier_example
Stat2Data	Blood1
Stat2Data	YouthRisk2009
tidyr	cms_patient_experience
"""

# Splitting the data into lines and creating two lists
first_entries = []
second_entries = []

for line in data.strip().split('\n'):
    entries = line.split()
    
    if len(entries) == 2:
        first_entries.append(entries[0])
        second_entries.append(entries[1])
    elif len(entries) == 3:
        first_entries.append(entries[0])
        second_entries.append(entries[1] + ' ' + entries[2])
    
# Displaying the lists
print("First Entries:", first_entries)
print("\nSecond Entries:", second_entries)

print(len(first_entries))
print(len(second_entries))

First Entries: ['survival', 'AER', 'AER', 'AER', 'asaur', 'asaur', 'asaur', 'asaur', 'asaur', 'betareg', 'boot', 'boot', 'boot', 'boot', 'boot', 'boot', 'boot', 'boot', 'boot', 'carData', 'carData', 'carData', 'carData', 'carData', 'carData', 'carData', 'carData', 'causaldata', 'causaldata', 'causaldata', 'causaldata', 'causaldata', 'survival', 'survival', 'survival', 'openintro', 'AER', 'openintro', 'openintro', 'stevedata', 'stevedata', 'survival', 'AER', 'Ecdat', 'sampleSelection', 'Ecdat', 'openintro', 'dslabs', 'medicaldata', 'geepack', 'openintro', 'Stat2Data', 'stevedata', 'survival', 'mstate', 'mosaicData', 'tidyr', 'tidyr', 'DAAG', 'openintro', 'DAAG', 'stevedata', 'Ecdat', 'mosaicData', 'Ecdat', 'Ecdat', 'heplots', 'carData', 'tidyr', 'survival', 'mosaicData', 'survival', 'MASS', 'mlmRev', 'mstate', 'openintro', 'mstate', 'mstate', 'geepack', 'modeldata', 'DAAG', 'mstate', 'survival', 'mlmRev', 'survival', 'openintro', 'causaldata', 'causaldata', 'openintro', 'COUNT', 'mosaic

In [3]:
packages = []
items = []
labels = []
docs = []
rows = []
cols = []

for i, package in enumerate(first_entries):
    item = second_entries[i]

    try:
        df_doc = sm.datasets.get_rdataset(item, package)
        
    except:
        continue

    obj = df_doc.__doc__
    for line in obj.strip().split('\n'):
        if '.. rubric::' in line:
            ln = line.replace('.. rubric::', '')
            print(package, item, ln)
            
            packages.append(package)
            items.append(item)
            labels.append(ln + ' (' + package + ': ' + item + ')')
            docs.append(obj)
            
            df = df_doc.data
            rows.append(df.shape[0])
            cols.append(df.shape[1])
        
            break



survival flchain     Assay of serum free light chain for 7874 subjects.
AER DoctorVisits     Australian Health Service Utilization Data
AER HealthInsurance     Medical Expenditure Panel Survey Data
AER Medicaid1986     Medicaid Utilization Data
asaur hepatoCellular     hepatoCellular
asaur pancreatic     pancreatic
asaur pancreatic2     pancreatic2
asaur pharmacoSmoking     pharmacoSmoking
asaur prostateSurvival     prostateSurvival
betareg StressAnxiety     Dependency of Anxiety on Stress
boot aids     Delay in AIDS Reporting in England and Wales
boot cd4     CD4 Counts for HIV-Positive Patients
boot downs.bc     Incidence of Down's Syndrome in British Columbia
boot melanoma     Survival from Malignant Melanoma
boot neuro     Neurophysiological Point Process Data
boot nodal     Nodal Involvement in Prostate Cancer
boot remission     Cancer Remission and Cell Activity
boot survival     Survival of Rats after Radiation Doses
boot urine     Urine Analysis Data
carData Blackmore     Exerc

In [4]:
tdf = pd.DataFrame(columns=['package', 'item', 'Dataset', 'No. of rows', 'No. of columns', 'docs'])
tdf['package'] = packages
tdf['item'] = items
tdf['Dataset'] = labels
tdf['No. of rows'] = rows
tdf['No. of columns'] = cols
tdf['docs'] = docs

tdf.drop_duplicates(inplace=True)
print(tdf.shape[0],'healthcare datasets from Statsmodels')
tdf.head()

114 healthcare datasets from Statsmodels


Unnamed: 0,package,item,Dataset,No. of rows,No. of columns,docs
0,survival,flchain,Assay of serum free light chain for 7874 s...,7874,11,.. container::\n\n ======= ===============\n...
1,AER,DoctorVisits,Australian Health Service Utilization Data...,5190,12,.. container::\n\n ============ ============...
2,AER,HealthInsurance,Medical Expenditure Panel Survey Data (AER...,8802,11,.. container::\n\n =============== =========...
3,AER,Medicaid1986,Medicaid Utilization Data (AER: Medicaid1986),996,14,.. container::\n\n ============ ============...
4,asaur,hepatoCellular,hepatoCellular (asaur: hepatoCellular),227,48,.. container::\n\n ============== ==========...


In [5]:
obj = tdf['docs'].iloc[0]
print(obj)

.. container::

   flchain R Documentation

   .. rubric:: Assay of serum free light chain for 7874 subjects.
      :name: flchain

   .. rubric:: Description
      :name: description

   This is a stratified random sample containing 1/2 of the subjects
   from a study of the relationship between serum free light chain (FLC)
   and mortality. The original sample contains samples on approximately
   2/3 of the residents of Olmsted County aged 50 or greater.

   .. rubric:: Usage
      :name: usage

   .. code:: R

      flchain
      data(flchain, package="survival")

   .. rubric:: Format
      :name: format

   A data frame with 7874 persons containing the following variables.

   ``age``
      age in years

   ``sex``
      F=female, M=male

   ``sample.yr``
      the calendar year in which a blood sample was obtained

   ``kappa``
      serum free light chain, kappa portion

   ``lambda``
      serum free light chain, lambda portion

   ``flc.grp``
      the FLC group for the subjec

In [6]:
labels = tdf['Dataset'].tolist()

str1 = ' '.join(labels)
labels = str1.split()

labels = sorted(list(set(labels)))
print(len(labels), 'unique labels')


470 unique labels


In [7]:
# Remove parentheses, commas, and colons
cleaned_text = str1.replace("(", "").replace(")", "").replace(",", "").replace(":", "")

# Split the cleaned text into a list of individual words
word_list = cleaned_text.split()
word_list = sorted(list(set(word_list)))
print(len(word_list), 'unique words')

ls2 = []
for word in word_list:
    ls2.append(word.lower())
word_list = sorted(list(set(ls2)))
print(len(word_list), 'unique words\n')

# Display the resulting list
for i in word_list:
    print("\""+i+"\""+',')


451 unique words
413 unique words

"&",
"10-17",
"1000",
"1849",
"1854",
"2",
"2008",
"2009",
"2012",
"507",
"7874",
"abnormal",
"abortion",
"active",
"activity",
"addhealth",
"administrative",
"adolescent",
"adult",
"aer",
"after",
"aids",
"aids2",
"american",
"an",
"analysis",
"and",
"anes",
"anes_prochoice",
"annual",
"anxiety",
"arthritis",
"asaur",
"assay",
"at",
"attitudes",
"auditory",
"australia",
"australian",
"autism",
"avandia",
"babies",
"bangladesh",
"bdims",
"behavior",
"behaviors",
"behaviour",
"betareg",
"biontech_adolescents",
"biopsy",
"birth",
"births",
"births14",
"birthscdc",
"birthsssa",
"blackmore",
"blood",
"blood1",
"body",
"boot",
"brca",
"breast",
"british",
"by",
"cancer",
"cardata",
"cardiovascular",
"care",
"carolina",
"cases",
"causaldata",
"cd4",
"cell",
"cells",
"center",
"centers",
"chain",
"child",
"children",
"cholera",
"choleradeaths1849",
"cigar",
"cigarette",
"cirrhosis",
"clinical",
"cms_patient_experience",
"columbia",
"complete-data",
"consumpt

In [106]:
tags_ls = [
    #"&",
    #"10-17",
    #"1000",
    #"1849",
    #"1854",
    #"2",
    #"2008",
    #"2009",
    #"2012",
    #"507",
    #"7874",
    #"abnormal",
    "abortion",
    #"active",
    #"activity",
    #"addhealth",
    #"administrative",
    "adolescent",
    "adult",
    #"aer",
    #"after",
    "AIDS ", #"aids",
    #"aids2",
    #"american",
    #"an",
    #"analysis",
    #"and",
    #"anes",
    #"anes_prochoice",
    #"annual",
    "anxiety",
    "arthritis",
    #"asaur",
    #"assay",
    #"at",
    #"attitudes",
    "auditory",
    #"australia",
    #"australian",
    "autism",
    #"avandia",
    #"babies",
    #"bangladesh",
    #"bdims",
    "behavior",
    #"behaviors",
    #"behaviour",
    #"betareg",
    #"biontech_adolescents",
    "biopsy",
    "birth",
    #"births",
    #"births14",
    #"birthscdc",
    #"birthsssa",
    #"blackmore",
    "blood",
    #"blood1",
    "body",
    #"boot",
    #"brca",
    "breast",
    #"british",
    #"by",
    "cancer",
    #"cardata",
    "cardiovascular",
    #"care",
    #"carolina",
    #"cases",
    #"causaldata",
    #"cd4",
    "cell",
    #"cells",
    #"center",
    #"centers",
    #"chain",
    "child",
    #"children",
    "cholera",
    #"choleradeaths1849",
    #"cigar",
    #"cigarette",
    "cirrhosis",
    "clinical",
    #"cms_patient_experience",
    #"columbia",
    #"complete-data",
    "consumption",
    #"contacts",
    "contagious",
    #"contraception",
    "contraceptive",
    #"control",
    "coronary",
    #"count",
    #"counts",
    #"coverage",
    "covid",
    #"covid-19",
    #"covid_testing",
    #"daag",
    #"daily",
    #"data",
    #"dataset",
    #"davis",
    "deaths",
    #"deidentified",
    #"delay",
    #"demand",
    "dengue",
    "dependency",
    "deprivation",
    #"development",
    "diabetes",
    #"diabetes2",
    #"diagnostic",
    "diarrhaea",
    "disease",
    "doctor",
    #"doctoraus",
    #"doctorcontacts",
    #"doctorvisits",
    "donation",
    #"doses",
    #"down's",
    "Down", #"downs.bc",
    "drug",
    #"dslabs",
    "eating", #"eating-disordered",
    #"ebmt1",
    #"ebmt2",
    #"ebmt3",
    #"ebmt4",
    "ebola",
    #"ebola_survey",
    #"ecdat",
    "efficacy",
    #"england",
    #"european",
    #"evaluation",
    #"examination",
    #"example",
    "exercise",
    #"expenditure",
    #"experiment",
    #"factor",
    #"fast",
    #"fastfood",
    "feet",
    "fertility",
    #"fertility2",
    #"flchain",
    "food",
    #"for",
    #"free",
    "freedom",
    #"from",
    #"galton's",
    #"galtonfamilies",
    #"geepack",
    #"general",
    "gestation",
    #"gss_abortion",
    #"guatemala",
    #"guprenat",
    "headaches",
    "health",
    "risk", #"health-risk",
    #"health_coverage",
    "healthcare",
    #"healthcare_law_survey",
    #"healthinsurance",
    #"height",
    #"heights",
    #"helpfull",
    "hepatocellular",
    #"heplots",
    #"hfi",
    #"hi",
    #"histdata",
    #"histories",
    "hiv",
    #"hiv-positive",
    #"hlmdiag",
    #"hours",
    #"hsaur",
    #"human",
    #"i",
    "illness",
    #"in",
    #"incidence",
    #"including",
    #"index",
    #"individuals.",
    #"indo_rct",
    "indomethacin",
    "infant", #"infant-mortality",
    #"information",
    "insurance",
    #"interview",
    #"involvement",
    #"iq",
    #"john",
    #"justifiability",
    #"kosteckidillon",
    #"labor",
    #"laboratory",
    #"legalization",
    #"leinhardt",
    #"levels",
    #"light",
    #"linkage",
    #"list",
    "liver",
    #"lobd",
    #"london",
    #"loss",
    "male",
    "malignant",
    "mammogram",
    #"map",
    "marrow",
    #"mass",
    #"measurements",
    #"medexp",
    "medicaid",
    #"medicaid1986",
    "medical",
    #"medicaldata",
    "medicare",
    "medpar",
    "melanoma",
    #"mifem",
    "migraine",
    #"mlmrev",
    #"mm_nhis",
    #"modeldata",
    #"monica",
    #"mosaicdata",
    #"mstate",
    #"multgee",
    #"muscatine",
    #"national",
    #"ncbirths",
    "neuro",
    #"neurophysiological",
    #"nhefs",
    #"nhefs_complete",
    #"nodal",
    #"north",
    "nutrition",
    "obesity",
    "obstetrics",
    #"of",
    #"office",
    #"ofp",
    #"ohio",
    #"ome",
    #"on",
    #"openintro",
    #"opinion",
    #"opinions",
    #"opt",
    "organ",
    #"organ_donations",
    #"organization",
    #"outlier_example",
    #"outliers",
    "pancreatic",
    #"pancreatic2",
    #"panel",
    #"paradox",
    "parents",
    #"patients",
    #"perception",
    "periodontal",
    #"pew",
    #"pfizer", #pfizer-biontech",
    #"pharmacosmoking",
    #"physically",
    "physician",
    #"plm",
    #"point",
    #"poll",
    #"population",
    "coma",#"post-coma",
    #"post-ercp",
    "prenatal",
    #"pressure",
    #"prevalence",
    #"prevention",
    #"primary",
    #"problems",
    #"process",
    "prostate",
    #"prostatesurvival",
    #"prothr",
    "prothrombin",
    #"provide",
    #"public",
    #"quantreg",
    "quarantine",
    "radiation",
    #"rand",
    #"randhie",
    #"randomized",
    #"rats",
    #"rct",
    #"records",
    "recovery",
    #"region",
    #"related",
    "remission",
    "reporting",  # !!
    #"research",
    "respiratory",
    #"responses",
    #"results",
    #"rheumatoid",
    #"risk",
    #"robey",
    #"ruling",
    #"sampleselection",
    #"scotus",
    #"scotus_healthcare",
    #"segmentation",
    #"self-reports",
    "serum",
    #"service",
    #"services",
    #"set",
    #"sexually",
    #"simpson's",
    #"simpsons_paradox_covid",
    "sleep",
    #"sleep_deprivation",
    "smallpox",
    #"smoke",
    "smoking",
    #"snow's",
    #"snow.streets",
    #"social",
    "society",
    #"stat2data",
    #"states",
    #"status",
    #"stevedata",
    "stress",
    #"stressanxiety",
    #"structure",
    #"studies",
    #"study",
    #"subjects.",
    #"supply",
    #"surveillance",
    #"survey",
    #"survival",
    "syndrome",
    #"system",
    #"tb",
    #"testing",
    #"tests",
    #"texmex",
    #"the",
    #"their",
    #"therapy",
    #"thornton",
    #"thornton_hiv",
    #"tidyr",
    #"to",
    #"transmitted",
    "transplant",
    #"transportation",
    #"treatment",
    #"trial",
    #"two",
    #"type",
    #"types",
    #"uci",
    #"uis",
    #"uk",
    #"united",
    "urine",
    #"us",
    #"us_contagious_diseases",
    #"use",
    #"utilization",
    "vaccine",
    #"visits",
    #"waiting",
    #"wales",
    "weight",
    #"weightloss",
    "wheeze",
    #"whickham",
    #"whickham2",
    "WHO ", #"who",
    #"who2",
    #"wisconsin",
    #"with",
    #"wives",
    "women", #"women's",
    #"wong",
    #"worked",
    #"workers",
    #"world",
    #"wvs_usa_abortion",
    #"years",
    #"youth",
    #"youthrisk",
    #"youthrisk2009",
    #"yrbss",
]

print(len(tags_ls))
tags_ls = sorted(list(set(tags_ls)))
print(len(tags_ls))



100
100


In [107]:
labels = tdf['Dataset'].tolist()
print(len(list(set(labels))), 'datasets')

filtered = []
cts = []

for lab in labels:
    ct = 0
    for tag in tags_ls:
        if tag.lower() in lab.lower():
            filtered.append(lab)
            ct += 1
        
    cts.append(ct)
            
print(len(list(set(filtered))), 'filtered datasets')
print(max(cts), 'max tags per dataset')

print('Datasets not included:')

ls = np.setdiff1d(labels, filtered)
print(ls)
# yields the elements in `list_2` that are NOT in `list_1`


114 datasets
114 filtered datasets
4 max tags per dataset
Datasets not included:
[]


In [108]:
tag_str = str()
for tag in tags_ls:
    tag_str += "\""+tag+"\""+','
    
print(tag_str)

"AIDS ","Down","WHO ","abortion","adolescent","adult","anxiety","arthritis","auditory","autism","behavior","biopsy","birth","blood","body","breast","cancer","cardiovascular","cell","child","cholera","cirrhosis","clinical","coma","consumption","contagious","contraceptive","coronary","covid","deaths","dengue","dependency","deprivation","diabetes","diarrhaea","disease","doctor","donation","drug","eating","ebola","efficacy","exercise","feet","fertility","food","freedom","gestation","headaches","health","healthcare","hepatocellular","hiv","illness","indomethacin","infant","insurance","liver","male","malignant","mammogram","marrow","medicaid","medical","medicare","medpar","melanoma","migraine","neuro","nutrition","obesity","obstetrics","organ","pancreatic","parents","periodontal","physician","prenatal","prostate","prothrombin","quarantine","radiation","recovery","remission","reporting","respiratory","risk","serum","sleep","smallpox","smoking","society","stress","syndrome","transplant","urine

In [109]:
print(tdf.shape)

f_df = tdf[tdf['Dataset'].str.contains('AIDS', case=False)]
print(f_df.shape)
f_df.head()

(114, 6)
(2, 6)


Unnamed: 0,package,item,Dataset,No. of rows,No. of columns,docs
10,boot,aids,Delay in AIDS Reporting in England and Wal...,570,6,.. container::\n\n ==== ===============\n ...
64,MASS,Aids2,Australian AIDS Survival Data (MASS: Aids2),2843,7,.. container::\n\n ===== ===============\n ...


In [110]:
tdf.to_csv('statsmodels_df.csv')