In [96]:
import subprocess
import re
from pathlib import Path

import pandas as pd

In [97]:
GIT_ROOT = subprocess.check_output(['git', 'rev-parse', '--show-toplevel'])
GIT_ROOT = Path(GIT_ROOT.decode('utf-8').strip())
DATA = GIT_ROOT / Path('data')
SWITZERLAND_DATASET = DATA / Path('switzerland.data')
HUNGARIAN_DATASET = DATA / Path('hungarian.data')
LONG_BEACH_DATASET = DATA / Path('long-beach-va.data')
ATTRIBUTES_FILE = DATA / Path('attribute_names')
ATTRIBUTES_OF_INTEREST = ['age', 'sex', 'cp', 'thalrest', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'smoke', 'cigs', 'years', 'famhist', 'num']

In [98]:
with ATTRIBUTES_FILE.open() as attributes_fp:
    attributes = attributes_fp.read().split(', ')

In [99]:
def load_dataset(path):
    with path.open() as dataset_fp:
        raw_data = dataset_fp.read()
        
    samples = []
    current_sample = []
    for count, data_point in enumerate(re.split(r'\s+', raw_data)):
        if count != 0 and count % len(attributes) == 0:
            samples.append(current_sample)
            current_sample = []
            
        if data_point == '-9':
            data_point = 'NaN'
            
        current_sample.append(data_point)
        
    dataset = pd.DataFrame(data=samples, columns=attributes)
    return dataset

In [100]:
switzerland_dataset = load_dataset(SWITZERLAND_DATASET)
switzerland_dataset

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
0,3001,0,65,1,1,1,1,,4,115,...,1,1,1,1,1,1,1,75,-9.,name
1,3002,0,32,1,0,0,0,,1,95,...,1,1,1,1,1,5,1,63,-9.,name
2,3003,0,61,1,1,1,1,,4,105,...,2,1,1,1,1,1,1,67,-9.,name
3,3004,0,50,1,1,1,1,,4,145,...,1,1,1,1,1,5,4,36,-9.,name
4,3005,0,57,1,1,1,1,,4,110,...,2,1,1,1,1,1,1,60,-9.,name
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,4070,0,54,1,1,1,1,,4,180,...,1,1,1,1,1,1,1,71,-9.,name
119,4071,0,56,1,1,1,1,,4,125,...,2,1,5,1,1,1,2,68,-9.,name
120,4072,0,56,1,0,1,1,,3,125,...,1,1,1,1,5,3,2,61,-9.,name
121,4073,0,54,1,1,1,1,,4,130,...,2,1,1,1,1,1,1,-9.,-9.,name


In [101]:
switzerland_dataset[ATTRIBUTES_OF_INTEREST]

Unnamed: 0,age,sex,cp,thalrest,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,smoke,cigs,years,famhist,num
0,65,1,4,56,115,0,0,0,93,1,0,2,,,,,1
1,32,1,1,74,95,0,,0,127,0,0.7,1,1,,,,1
2,61,1,4,70,105,0,,0,110,1,1.5,1,,,,,1
3,50,1,4,82,145,0,,0,139,1,0.7,2,,,,,1
4,57,1,4,71,110,0,,1,131,1,1.4,1,,,,,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,54,1,4,58,180,0,,0,150,0,1.5,2,,,,,1
119,56,1,4,55,125,0,1,0,103,1,1,2,1,,,,3
120,56,1,3,64,125,0,,0,98,0,-2,2,,,,,2
121,54,1,4,58,130,0,,0,110,1,3,2,,,,,3


In [102]:
hungarian_dataset = load_dataset(HUNGARIAN_DATASET)
hungarian_dataset

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
0,1254,0,40,1,1,0,0,,2,140,...,,,1,1,1,1,1,-9.,-9.,name
1,1255,0,49,0,1,0,0,,3,160,...,,,1,1,1,1,1,-9.,-9.,name
2,1256,0,37,1,1,0,0,,2,130,...,,,1,1,1,1,1,-9.,-9.,name
3,1257,0,48,0,1,1,1,,4,138,...,2,,1,1,1,1,1,-9.,-9.,name
4,1258,0,54,1,1,0,1,,3,150,...,1,,1,1,1,1,1,-9.,-9.,name
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,1053,0,48,0,0,0,0,,2,,...,,,1,1,1,1,1,-9.,-9.,name
290,1054,0,36,1,1,0,0,,2,120,...,,,1,1,1,1,1,-9.,-9.,name
291,5001,0,48,1,0,0,0,,3,110,...,,,1,1,1,1,1,-9.,-9.,name
292,5000,0,47,0,0,0,0,,2,140,...,,,1,1,1,1,1,-9.,-9.,name


In [103]:
hungarian_dataset[ATTRIBUTES_OF_INTEREST]

Unnamed: 0,age,sex,cp,thalrest,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,smoke,cigs,years,famhist,num
0,40,1,2,86,140,289,0,0,172,0,0,,,,,,0
1,49,0,3,100,160,180,0,0,156,0,1,2,,,,,1
2,37,1,2,58,130,283,0,1,98,0,0,,,,,,0
3,48,0,4,54,138,214,0,0,108,1,1.5,2,,,,,3
4,54,1,3,74,150,,0,0,122,0,0,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,48,0,2,,,308,0,1,,,2,1,,,,,0
290,36,1,2,72,120,166,0,0,180,0,0,,,,,,0
291,48,1,3,66,110,211,0,0,138,0,0,,,,,,0
292,47,0,2,84,140,257,0,0,135,0,1,1,,,,,0


In [104]:
long_beach_dataset = load_dataset(LONG_BEACH_DATASET)
long_beach_dataset

Unnamed: 0,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
0,1,0,63,1,1,1,1,,4,140,...,2,1,1,1,1,1,1,0.7,5.5,name
1,2,0,44,1,1,1,1,,4,130,...,1,1,1,1,1,1,1,0.5,-9.,name
2,3,0,60,1,1,1,1,,4,132,...,2,1,1,1,1,7,2,0.52,4.1,name
3,4,0,55,1,1,1,1,,4,142,...,1,1,1,1,1,1,1,0.73,6.5,name
4,5,0,66,1,1,0,0,,3,110,...,1,1,1,1,1,1,1,0.73,8,name
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,200,0,54,0,1,1,1,,4,127,...,1,1,1,1,1,1,1,0.76,5.6,name
196,201,0,62,1,0,0,0,,1,,...,1,1,1,1,1,1,2,0.62,3.5,name
197,202,0,55,1,1,1,1,,4,122,...,2,1,1,1,1,1,1,0.69,5.6,name
198,116,0,58,1,1,1,1,,4,,...,1,1,1,1,1,1,1,0.81,6,name


In [105]:
long_beach_dataset[ATTRIBUTES_OF_INTEREST]

Unnamed: 0,age,sex,cp,thalrest,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,smoke,cigs,years,famhist,num
0,63,1,4,62,140,260,0,1,112,1,3,2,0,0,0,0,2
1,44,1,4,73,130,209,0,1,127,0,0,,0,20,10,0,0
2,60,1,4,68,132,218,0,1,140,1,1.5,3,1,40,40,0,2
3,55,1,4,60,142,228,0,1,149,1,2.5,1,1,20,30,0,1
4,66,1,3,66,110,213,1,2,99,1,1.3,2,0,20,8,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,54,0,4,83,127,333,1,1,154,0,0,,0,0,0,1,1
196,62,1,1,,,139,0,1,,,,,1,15,30,0,0
197,55,1,4,74,122,223,1,1,100,0,0,,1,20,40,0,2
198,58,1,4,,,385,1,2,,,,,0,10,20,1,0
