In [1]:
import subprocess
import re
from pathlib import Path

import pandas as pd

In [2]:
# Anchor every path at the repository root reported by
# `git rev-parse --show-toplevel`, so paths do not depend on the directory
# the kernel was started from.
GIT_ROOT = Path(
    subprocess.check_output(['git', 'rev-parse', '--show-toplevel'])
    .decode('utf-8')
    .strip()
)

# Input files and the output file all live in the repository's data/ directory.
DATA = GIT_ROOT / 'data'
SWITZERLAND_DATASET = DATA / 'switzerland.data'
HUNGARIAN_DATASET = DATA / 'hungarian.data'
LONG_BEACH_DATASET = DATA / 'long_beach.data'
ATTRIBUTES_FILE = DATA / 'attribute_names'
SHUFFLED_DATASET_OUTPUT_PATH = DATA / 'combined_data.csv'

# Columns selected from each raw dataset before the datasets are combined.
ATTRIBUTES_OF_INTEREST = [
    'age', 'sex', 'cp', 'thalrest', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'smoke', 'cigs', 'years',
    'famhist', 'num',
]

In [3]:
# Read the ', '-separated attribute names that label the columns of the raw
# datasets. Each name is stripped so that surrounding whitespace -- in
# particular the file's trailing newline, which would otherwise stay glued to
# the last name -- does not end up inside a column label.
with ATTRIBUTES_FILE.open() as attributes_fp:
    attributes = [name.strip() for name in attributes_fp.read().split(', ')]

In [4]:
def load_dataset(path, attribute_names=None):
    """Parse a whitespace-separated raw data file into a DataFrame.

    The file is read as one flat stream of whitespace-separated tokens (line
    breaks carry no meaning) and cut into consecutive records of
    ``len(attribute_names)`` tokens each. The token ``'-9'`` is replaced with
    ``None``; every other value is kept as the string read from the file.

    Parameters
    ----------
    path : pathlib.Path
        File to read; only its ``open()`` method is used.
    attribute_names : list of str, optional
        Column names, one per token of a record. Defaults to the
        notebook-level ``attributes`` list.

    Returns
    -------
    pandas.DataFrame
        One row per complete record, with ``attribute_names`` as columns.

    Raises
    ------
    ValueError
        If ``attribute_names`` is empty.
    """
    if attribute_names is None:
        attribute_names = attributes

    record_width = len(attribute_names)
    if record_width == 0:
        raise ValueError('attribute_names must contain at least one name')

    with path.open() as dataset_fp:
        # str.split() with no argument never yields empty tokens, so leading or
        # trailing whitespace cannot shift the columns or add a phantom value.
        tokens = dataset_fp.read().split()

    # Only complete records are emitted, including the final one; the previous
    # loop flushed a record only when a further token followed it. A trailing
    # partial record (fewer than record_width tokens) is still discarded.
    last_start = len(tokens) - record_width
    samples = [
        [None if token == '-9' else token
         for token in tokens[start:start + record_width]]
        for start in range(0, last_start + 1, record_width)
    ]

    return pd.DataFrame(data=samples, columns=attribute_names)

In [5]:
def save_with_suffix(dataset, path, suffix):
    """Write ``dataset`` as CSV into the directory of ``path``.

    The file is named ``path.stem + suffix``: the extension of ``path`` is not
    carried over, so ``suffix`` has to contain one if the result should have
    it. ``index=None`` is passed to ``to_csv``, which leaves the row index out
    of the file.
    """
    new_name = path.stem + suffix
    destination = path.with_name(new_name)
    dataset.to_csv(destination, index=None)

In [6]:
# Load the raw Switzerland file, then keep only the attributes of interest.
switzerland_dataset = load_dataset(SWITZERLAND_DATASET)
curated_switzerland_dataset = switzerland_dataset.loc[:, ATTRIBUTES_OF_INTEREST]
curated_switzerland_dataset

Unnamed: 0,age,sex,cp,thalrest,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,smoke,cigs,years,famhist,num
0,65,1,4,56,115,0,0,0,93,1,0,2,,,,,1
1,32,1,1,74,95,0,,0,127,0,0.7,1,1,,,,1
2,61,1,4,70,105,0,,0,110,1,1.5,1,,,,,1
3,50,1,4,82,145,0,,0,139,1,0.7,2,,,,,1
4,57,1,4,71,110,0,,1,131,1,1.4,1,,,,,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,54,1,4,58,180,0,,0,150,0,1.5,2,,,,,1
119,56,1,4,55,125,0,1,0,103,1,1,2,1,,,,3
120,56,1,3,64,125,0,,0,98,0,-2,2,,,,,2
121,54,1,4,58,130,0,,0,110,1,3,2,,,,,3


In [7]:
# Load the raw Hungarian file, then keep only the attributes of interest.
hungarian_dataset = load_dataset(HUNGARIAN_DATASET)
curated_hungarian_dataset = hungarian_dataset.loc[:, ATTRIBUTES_OF_INTEREST]
curated_hungarian_dataset

Unnamed: 0,age,sex,cp,thalrest,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,smoke,cigs,years,famhist,num
0,40,1,2,86,140,289,0,0,172,0,0,,,,,,0
1,49,0,3,100,160,180,0,0,156,0,1,2,,,,,1
2,37,1,2,58,130,283,0,1,98,0,0,,,,,,0
3,48,0,4,54,138,214,0,0,108,1,1.5,2,,,,,3
4,54,1,3,74,150,,0,0,122,0,0,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,48,0,2,,,308,0,1,,,2,1,,,,,0
290,36,1,2,72,120,166,0,0,180,0,0,,,,,,0
291,48,1,3,66,110,211,0,0,138,0,0,,,,,,0
292,47,0,2,84,140,257,0,0,135,0,1,1,,,,,0


In [8]:
# Load the raw Long Beach file, then keep only the attributes of interest.
long_beach_dataset = load_dataset(LONG_BEACH_DATASET)
curated_long_beach_dataset = long_beach_dataset.loc[:, ATTRIBUTES_OF_INTEREST]
curated_long_beach_dataset

Unnamed: 0,age,sex,cp,thalrest,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,smoke,cigs,years,famhist,num
0,63,1,4,62,140,260,0,1,112,1,3,2,0,0,0,0,2
1,44,1,4,73,130,209,0,1,127,0,0,,0,20,10,0,0
2,60,1,4,68,132,218,0,1,140,1,1.5,3,1,40,40,0,2
3,55,1,4,60,142,228,0,1,149,1,2.5,1,1,20,30,0,1
4,66,1,3,66,110,213,1,2,99,1,1.3,2,0,20,8,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,54,0,4,83,127,333,1,1,154,0,0,,0,0,0,1,1
196,62,1,1,,,139,0,1,,,,,1,15,30,0,0
197,55,1,4,74,122,223,1,1,100,0,0,,1,20,40,0,2
198,58,1,4,,,385,1,2,,,,,0,10,20,1,0


In [9]:
# Stack the three curated datasets on top of each other. `DataFrame.append`
# was removed in pandas 2.0; `pd.concat` is its replacement and, like `append`
# did, keeps each frame's own row labels (so labels repeat across sources).
combined_dataset = pd.concat([
    curated_switzerland_dataset,
    curated_hungarian_dataset,
    curated_long_beach_dataset,
])

# Shuffle all rows. The fixed seed makes the row order -- and therefore the
# CSV written from it -- identical on every Restart & Run All.
SHUFFLE_SEED = 42
shuffled_dataset = combined_dataset.sample(frac=1, random_state=SHUFFLE_SEED)
shuffled_dataset

Unnamed: 0,age,sex,cp,thalrest,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,smoke,cigs,years,famhist,num
43,52,1,3,78,140,259,0,1,170,0,0,,,,,,0
9,60,1,2,72,160,267,1,1,157,0,0.5,2,0,20,15,0,1
39,38,1,3,77,100,0,,0,179,0,-1.1,1,,,,,0
124,62,1,3,,,204,0,1,,,,,0,15,5,1,1
133,68,1,1,,,181,1,1,,,,,1,20,40,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,49,1,2,116,100,253,0,0,174,0,0,,,,,,0
230,37,0,4,82,130,173,0,1,184,0,0,,,,,,0
170,53,1,4,82,124,243,0,0,122,1,2,2,0,0,0,0,1
109,39,1,2,90,190,241,0,0,106,0,0,,,,,,0


In [10]:
# Persist the shuffled, combined dataset as CSV, leaving out the row index.
shuffled_dataset.to_csv(
    path_or_buf=str(SHUFFLED_DATASET_OUTPUT_PATH),
    index=False,
)