In [29]:
import pandas as pd
import numpy as np
import matplotlib
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans,AgglomerativeClustering,DBSCAN,OPTICS
import seaborn as sns
from sklearn.metrics import silhouette_score
from kneed import KneeLocator
from scipy.cluster.hierarchy import dendrogram as scipy_dendrogram, linkage
from scipy.cluster.hierarchy import dendrogram
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
from ucimlrepo import fetch_ucirepo, list_available_datasets

import pprint

# List Available Datasets
Print a list of datasets that can be imported via `fetch_ucirepo`.

In [31]:
# Each call prints its own table of importable datasets.
# No arguments: print every importable dataset.
list_available_datasets()
# Restrict the listing with the 'aim-ahead' filter.
list_available_datasets(filter='aim-ahead')
# Restrict the listing with the search term 'diabe'.
list_available_datasets(search='diabe')


-------------------------------------
The following datasets are available:
-------------------------------------
Dataset Name                                                                            ID    
------------                                                                            --    
Abalone                                                                                 1     
Adult                                                                                   2     
Annealing                                                                               3     
Audiology (Standardized)                                                                8     
Auto MPG                                                                                9     
Automobile                                                                              10    
Balance Scale                                                                           12    
Balloons                       

Fetching should fail for datasets that are not part of the importable list.

In [32]:
# 'defungi' exists in the repository but is not available for import, so
# fetch_ucirepo raises; the error is caught and its message printed so the
# notebook keeps running.
try:
    fetch_ucirepo(name='defungi')
    # Other invalid inputs that can be tried here:
    # fetch_ucirepo(name='heart diseaseeeee') 
    # fetch_ucirepo(id=10000)
except Exception as e:
    print(e)

"defungi" dataset (id=773) exists in the repository, but is not available for import. Please select a dataset from this list: https://archive.ics.uci.edu/datasets?skip=0&take=10&sort=desc&orderBy=NumHits&search=&Python=true


Import dataset by ID

In [33]:
# Dataset 827 is "Sepsis Survival Minimal Clinical Records" (see the metadata output).
sepsis = fetch_ucirepo(id=827)

Metadata

In [34]:
# Pretty-print the metadata mapping (uci_id, name, URLs, abstract, ...).
pprint.pp(sepsis.metadata)

{'uci_id': 827,
 'name': 'Sepsis Survival Minimal Clinical Records',
 'repository_url': 'https://archive.ics.uci.edu/dataset/827/sepsis+survival+minimal+clinical+records',
 'data_url': 'https://archive.ics.uci.edu/static/public/827/data.csv',
 'abstract': 'The dataset consists of 110,204 admissions of 84,811 '
             'hospitalized subjects between 2011 and 2012 in Norway who were '
             'diagnosed with infections, systemic inflammatory response '
             'syndrome, sepsis by causative microbes, or septic shock.  The '
             'prediction task is to determine whether a patient survived or is '
             'deceased at a time of about 9 days after collecting their '
             'medical record at the hospital.\n'
             '\n'
             'This is an important prediction problem in clinical medicine. '
             'Sepsis is a life-threatening condition triggered by an immune '
             'overreaction to infection, leading to organ failure or even '
   

In [35]:
# Metadata fields are also reachable as attributes; this one lists the target column name(s).
sepsis.metadata.target_col

['hospital_outcome_1alive_0dead']

In [36]:
# Title of the dataset's introductory paper.
sepsis.metadata.intro_paper.title

'Survival prediction of patients with sepsis from age, sex, and septic episode number alone'

In [37]:
# Free-text summary of the cohorts and the files they come from.
sepsis.metadata.additional_info.summary

"Primary cohort from Norway:\n- 4 features for 110,204 patient admissions\n- file: 's41598-020-73558-3_sepsis_survival_primary_cohort.csv'\n\nStudy cohort (a subset of the primary cohort) from Norway:\n- 4 features for 19,051 patient admissions\n- file: 's41598-020-73558-3_sepsis_survival_study_cohort.csv'\n\nValidation cohort from South Korea:\n- 4 features for 137 patients\n- file: 's41598-020-73558-3_sepsis_survival_validation_cohort.csv'\n\nThe validation cohort from South Korea was used by Chicco and Jurman (2020) as an external validation cohort to confirm the generalizability of their proposed approach. "

# Data
IDs, features, and targets are included as separate dataframes, along with an "original" that combines all of them.

In [38]:
# Feature columns only: age_years, sex_0male_1female, episode_number.
sepsis.data.features

Unnamed: 0,age_years,sex_0male_1female,episode_number
0,21,1,1
1,20,1,1
2,21,1,1
3,77,0,1
4,72,0,1
...,...,...,...
110336,47,0,1
110337,50,0,1
110338,62,0,1
110339,58,0,1


In [39]:
# Target column only: hospital_outcome_1alive_0dead.
sepsis.data.targets

Unnamed: 0,hospital_outcome_1alive_0dead
0,1
1,1
2,1
3,1
4,1
...,...
110336,1
110337,0
110338,1
110339,0


In [40]:
sepsis.data.ids   # this dataset has no IDs, so nothing is displayed

In [41]:
# Features and target combined in a single dataframe.
sepsis.data.original

Unnamed: 0,age_years,sex_0male_1female,episode_number,hospital_outcome_1alive_0dead
0,21,1,1,1
1,20,1,1,1
2,21,1,1,1
3,77,0,1,1
4,72,0,1,1
...,...,...,...,...
110336,47,0,1,1
110337,50,0,1,0
110338,62,0,1,1
110339,58,0,1,0


In [42]:
# Column names of the combined dataframe.
sepsis.data.headers

Index(['age_years', 'sex_0male_1female', 'episode_number',
       'hospital_outcome_1alive_0dead'],
      dtype='object')

# Variable Info
Displayed in a dataframe format

In [43]:
# One row per variable: name, role, type, demographic, description, units, missing_values.
sepsis.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,age_years,Feature,Integer,Age,Age of the patient in years.,years,no
1,sex_0male_1female,Feature,Binary,Gender,Gender of the patient. Values are encoded as f...,,no
2,episode_number,Feature,Integer,,Number of prior Sepsis episodes,,no
3,hospital_outcome_1alive_0dead,Target,Binary,,"Status of the patient after 9,351 days of bein...",,no


# Import by Name

In [44]:
# Fetch by dataset name instead of numeric ID, then display the feature dataframe.
glioma = fetch_ucirepo(name='glioma')
glioma.data.features

Unnamed: 0,Gender,Age_at_diagnosis,Race,IDH1,TP53,ATRX,PTEN,EGFR,CIC,MUC16,...,FUBP1,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA
0,0,51.30,white,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,38.72,white,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,35.17,white,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,32.78,white,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0,31.51,white,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
834,1,77.89,white,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
835,0,85.18,white,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
836,1,77.49,white,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
837,0,63.33,white,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


In [45]:
from ucimlrepo import fetch_ucirepo

# Fetch UCI dataset 189. Its metadata identifies it as "Parkinsons Telemonitoring"
# (not Iris), so the result is bound to a name that matches the data.
parkinsons = fetch_ucirepo(id=189)
# Backward-compatible alias: keeps any cell that still refers to `iris` working.
iris = parkinsons

# Data (as pandas dataframes)
X = parkinsons.data.features
# The metadata lists two target columns: motor_UPDRS and total_UPDRS.
y = parkinsons.data.targets

# Metadata
print(parkinsons.metadata)

# Variable information
print(parkinsons.variables)

{'uci_id': 189, 'name': 'Parkinsons Telemonitoring', 'repository_url': 'https://archive.ics.uci.edu/dataset/189/parkinsons+telemonitoring', 'data_url': 'https://archive.ics.uci.edu/static/public/189/data.csv', 'abstract': "Oxford Parkinson's Disease Telemonitoring Dataset", 'area': 'Health and Medicine', 'tasks': ['Regression'], 'characteristics': ['Tabular'], 'num_instances': 5875, 'num_features': 19, 'feature_types': ['Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['motor_UPDRS', 'total_UPDRS'], 'index_col': ['subject#'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5ZS3N', 'creators': ['Athanasios Tsanas', 'Max Little'], 'intro_paper': {'title': "Accurate Telemonitoring of Parkinson's Disease Progression by Noninvasive Speech Tests", 'authors': 'A. Tsanas, Max A. Little, P. McSharry, L. Ramig', 'published_in': 'IEEE Transactions on Biomedical Engineering', 

In [46]:
# Feature dataframe of the dataset fetched with id=189 (Parkinsons Telemonitoring).
X

Unnamed: 0,age,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE,sex
0,72,5.6431,0.00662,0.000034,0.00401,0.00317,0.01204,0.02565,0.230,0.01438,0.01309,0.01662,0.04314,0.014290,21.640,0.41888,0.54842,0.16006,0
1,72,12.6660,0.00300,0.000017,0.00132,0.00150,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.10810,0
2,72,19.6810,0.00481,0.000025,0.00205,0.00208,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.020220,23.047,0.46222,0.54405,0.21014,0
3,72,25.6470,0.00528,0.000027,0.00191,0.00264,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.48730,0.57794,0.33277,0
4,72,33.6420,0.00335,0.000020,0.00093,0.00130,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5870,61,142.7900,0.00406,0.000031,0.00167,0.00168,0.00500,0.01896,0.160,0.00973,0.01133,0.01549,0.02920,0.025137,22.369,0.64215,0.55314,0.21367,0
5871,61,149.8400,0.00297,0.000025,0.00119,0.00147,0.00358,0.02315,0.215,0.01052,0.01277,0.01904,0.03157,0.011927,22.886,0.52598,0.56518,0.12621,0
5872,61,156.8200,0.00349,0.000025,0.00152,0.00187,0.00456,0.02499,0.244,0.01371,0.01456,0.01877,0.04112,0.017701,25.065,0.47792,0.57888,0.14157,0
5873,61,163.7300,0.00281,0.000020,0.00128,0.00151,0.00383,0.01484,0.131,0.00693,0.00870,0.01307,0.02078,0.007984,24.422,0.56865,0.56327,0.14204,0
