# Dataset Generator

In this notebook, we generate the dataset that we will use to run the Azure Machine Learning pipeline. As mentioned in the README file, we use the UCI heart disease dataset as a baseline. Starting from that base dataset, we generate a new one with more useful information, so that we can build a more impactful Responsible AI example. We create this dataset in order to run a differential privacy example, perform a more complex exploratory analysis, and detect more fairness-related features to mitigate.

In [None]:
import pandas as pd
import json
import random
from datetime import datetime, timedelta

In [None]:
# Baseline dataset that this notebook reads.
SOURCE_FILENAME = "../../dataset/uci_dataset.csv"
# Enriched dataset that this notebook writes.
DESTINATION_FILENAME = "../../dataset/complete_patients_dataset.csv"
# Auxiliary inputs used to fabricate the extra patient attributes.
ADDRESSES_FILENAME = "addresses.json"
MALE_NAMES_FILENAME = "male-names.csv"
FEMALE_NAMES_FILENAME = "female-names.csv"

In [None]:
# Read the baseline dataset and swap its headers for descriptive column names.
df = pd.read_csv(SOURCE_FILENAME)
descriptive_columns = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'cholesterol',
    'fasting_blood_sugar',
    'rest_ecg',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
    'st_depression',
    'st_slope',
    'num_major_vessels',
    'thalassemia',
    'target',
]
df.columns = descriptive_columns
df.head()

In [None]:
# Addresses from https://github.com/EthanRBrown/rrad
# The file is a JSON object whose 'addresses' entry holds the address records.
# The encoding is passed explicitly so the read does not depend on the
# platform's default text encoding (JSON files are expected to be UTF-8).
with open(ADDRESSES_FILENAME, 'r', encoding='utf-8') as f:
    addresses = json.load(f)['addresses']

In [None]:
def get_attribute_or_empty(data, attribute, default=''):
    """Look up ``attribute`` in ``data``, falling back to ``default``.

    Returns ``data[attribute]`` when ``attribute in data`` holds; otherwise
    returns ``default`` (an empty string unless the caller overrides it).
    """
    if attribute not in data:
        return default
    return data[attribute]

In [None]:
def concat_name(first_name, last_name):
    """Build a "First Last" display name.

    Only the first whitespace-separated token of ``first_name`` is kept.
    Both parts are normalised with ``str.capitalize`` (first character upper
    case, the rest lower case) and joined with a single space.
    """
    given_name = first_name.split()[0]
    return " ".join((given_name.capitalize(), last_name.capitalize()))

def get_names_from_file(filename):
    """Load a CSV of names and return the full names in random order.

    ``filename`` is handed straight to ``pandas.read_csv``; the CSV must
    provide the columns ``'first name'`` and ``'last name'``.

    Returns a ``pandas.Series`` with one ``concat_name`` result per CSV row,
    shuffled with the ``random`` module, on a default 0..n-1 index.
    """
    names_df = pd.read_csv(filename)
    full_names = [
        concat_name(str(first), str(last))
        for first, last in zip(names_df['first name'], names_df['last name'])
    ]
    # random.shuffle is meant for plain mutable sequences. Shuffling the list
    # and building the Series afterwards avoids relying on the Series' integer
    # labels for the element swaps, and also copes with an empty file.
    random.shuffle(full_names)
    return pd.Series(full_names, dtype=object)

In [None]:
def generate_ssn():
    """Return a random identifier in ``NNN-NN-NNNN`` form.

    The three groups are drawn in order with ``random.randint`` from
    100-999, 10-99 and 1000-9999 (bounds inclusive).
    """
    area = random.randint(100, 999)
    group = random.randint(10, 99)
    serial = random.randint(1000, 9999)
    return "-".join(str(part) for part in (area, group, serial))

In [None]:
def gen_datetime(min_year=1900, max_year=datetime.now().year):
    """Return a random date formatted as ``MM/DD/YYYY``.

    The date is drawn uniformly from January 1st of ``min_year`` (inclusive)
    up to January 1st of the year after ``max_year`` (exclusive), so every
    day of ``max_year`` can be produced.

    Note that the default ``max_year`` is evaluated once, when this function
    is defined, not on every call.
    """
    start = datetime(min_year, 1, 1)
    # Use the real calendar boundary. Approximating the span as 365 days per
    # year ignores leap days, which makes the range stop short of the end of
    # max_year (by roughly a month for a range starting in 1900).
    if max_year < datetime.max.year:
        end = datetime(max_year + 1, 1, 1)
    else:
        # datetime cannot represent the year after its maximum year.
        end = datetime.max
    date = start + (end - start) * random.random()
    return date.strftime("%m/%d/%Y")

def generate_observations(name, city):
    """Compose the free-text observation for one patient.

    The text contains the patient's name and city, followed by a random
    "last visit" date obtained from ``gen_datetime(2018)``.
    """
    last_visit = gen_datetime(2018)
    return "{} from {}. Last visit on {}".format(name, city, last_visit)

In [None]:
# Load both name pools (male first, then female), each shuffled by the loader.
male_names, female_names = map(
    get_names_from_file,
    (MALE_NAMES_FILENAME, FEMALE_NAMES_FILENAME),
)

In [None]:
# --- Address columns: one address record per patient row -------------------
patient_addresses = addresses[:len(df.index)]
df['state'] = [get_attribute_or_empty(address, 'state') for address in patient_addresses]
df['address'] = [get_attribute_or_empty(address, 'address1') for address in patient_addresses]
# Records without a city fall back to 'Montgomery'.
df['city'] = [get_attribute_or_empty(address, 'city', 'Montgomery') for address in patient_addresses]
df['postalCode'] = [get_attribute_or_empty(address, 'postalCode') for address in patient_addresses]

# --- Names: rows with sex == 0 take a female name, all others a male name ---
is_female = df['sex'] == 0
df['name'] = male_names
# Use .loc rather than chained indexing (df['name'][mask] = ...): chained
# assignment triggers SettingWithCopyWarning and does not modify df when
# pandas Copy-on-Write is enabled. Values are aligned on the row index.
df.loc[is_female, 'name'] = female_names

df['ssn'] = [generate_ssn() for _ in range(len(df.index))]

# --- Random health flags -----------------------------------------------------
# Only rows with sex == 0 get a random 0/1 pregnancy flag; the rest stay 0.
df['pregnant'] = [random.randint(0, 1) if sex == 0 else 0 for sex in df['sex']]


def _random_condition_flag():
    """Return 1 when randint(0, 10) exceeds 7 (3 of 11 outcomes), else 0."""
    return 0 if random.randint(0, 10) <= 7 else 1


df['diabetic'] = [_random_condition_flag() for _ in range(len(df.index))]
df['asthmatic'] = [_random_condition_flag() for _ in range(len(df.index))]
df['observation'] = [
    generate_observations(name, city)
    for name, city in zip(df['name'], df['city'])
]
df['smoker'] = [_random_condition_flag() for _ in range(len(df.index))]

In [None]:
# Write the enriched dataset; index=False keeps the row index out of the CSV.
df.to_csv(path_or_buf=DESTINATION_FILENAME, index=False)

Check new custom dataset

In [None]:
# Print a summary of the generated DataFrame to sanity-check the new columns.
df.info()