In [1]:
import numpy as np
# Read the record matrix from the NumPy binary file next to the notebook.
# SECURITY NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
# and unpickling can execute arbitrary code -- only load a trusted stroke.npy.
data = np.load(file='./stroke.npy', allow_pickle=True)

### 1. Stroke dataset

#### (a) Load the dataset

In [2]:
# Split the record matrix column-wise: row i of data.T is column i of data,
# so the first eleven rows unpack into one 1-D array per attribute.
(
    gender,          # col 0:  categorical, [ 0 = Male, 1 = Female, 2 = Other ]
    age,             # col 1:  discrete integer value
    hypertension,    # col 2:  categorical, [ 0 = No, 1 = Yes ]
    heart_disease,   # col 3:  categorical, [ 0 = No, 1 = Yes ]
    ever_married,    # col 4:  categorical, [ 0 = No, 1 = Yes ]
    work_type,       # col 5:  categorical, [ 0 = Private, 1 = Self-employed, 2 = Never worked, 3 = Government, 4 = Children ]
    residence_type,  # col 6:  categorical, [ 0 = Urban, 1 = Rural ]
    glucose_level,   # col 7:  discrete integer value
    bmi,             # col 8:  discrete integer value
    smoking_status,  # col 9:  categorical, [ 0 = Never smoked, 1 = Formerly smoked, 2 = Smokes ]
    stroke,          # col 10: categorical, [ 0 = No, 1 = Yes ]
) = data.T[:11]

#### (b) Patient mask 

In [3]:
# Boolean row selectors: stroke == 1 marks patients, stroke == 0 marks controls.
patient_mask, control_mask = (stroke == 1), (stroke == 0)

# Full records (all columns) for each group.
patient_group, control_group = data[patient_mask], data[control_mask]

# Report how many rows ended up in each group.
for caption, group in (
    ('Number of stroke patients:', patient_group),
    ('Number of non-stroke patients:', control_group),
):
    print(caption, len(group))

Number of stroke patients: 180
Number of non-stroke patients: 3246


#### (c) Calculate mean age of patients and controls

In [4]:
# Mean age per group. The %d conversion renders the mean as a whole number,
# so any fractional part is not shown.
for template, group_mask in (
    ('Mean age of the stroke patients: %d', patient_mask),
    ('Mean age of the controls group: %d', control_mask),
):
    print(template % age[group_mask].mean())

Mean age of the stroke patients: 68
Mean age of the controls group: 47


#### (d) BMI and heart disease in sub-groups 

In [5]:
# Component selectors, named once and then combined pairwise.
lives_urban = residence_type == 0
lives_rural = residence_type == 1
currently_smokes = smoking_status == 2
does_not_smoke = smoking_status < 2   # covers status 0 (never smoked) and 1 (formerly smoked)

# The four residence x smoking sub-groups.
urban_smoke_mask = lives_urban & currently_smokes
urban_nonsmoke_mask = lives_urban & does_not_smoke
rural_smoke_mask = lives_rural & currently_smokes
rural_nonsmoke_mask = lives_rural & does_not_smoke

In [6]:
# One summary line per residence/smoking sub-group: the mean BMI and the share
# of records flagged with heart disease (sum of the 0/1 flags divided by the
# sub-group size, expressed in percent).
for template, subgroup_mask in (
    ('Urban smokers: average BMI %.1f; average heart disease rate %.1f%%', urban_smoke_mask),
    ('Urban non-smokers: average BMI %.1f; average heart disease rate %.1f%%', urban_nonsmoke_mask),
    ('Rural smokers: average BMI %.1f; average heart disease rate %.1f%%', rural_smoke_mask),
    ('Rural non-smokers: average BMI %.1f; average heart disease rate %.1f%%', rural_nonsmoke_mask),
):
    subgroup_heart = heart_disease[subgroup_mask]
    heart_rate_pct = 100*subgroup_heart.sum()/subgroup_heart.shape[0]
    print(template % (bmi[subgroup_mask].mean(), heart_rate_pct))

Urban smokers: average BMI 29.7; average heart disease rate 7.2%
Urban non-smokers: average BMI 29.8; average heart disease rate 6.0%
Rural smokers: average BMI 30.6; average heart disease rate 7.7%
Rural non-smokers: average BMI 29.8; average heart disease rate 5.3%


### 2. BMI 

#### (a) Patient and control group BMI

##### i. Find the average BMI in the control group

In [7]:
# Mean BMI over the rows selected by control_mask; later cells use this value
# as the cut-off between "high" and "low" BMI.
avg_control_bmi = np.mean(bmi[control_mask])
print(avg_control_bmi)

29.824091189155883


##### ii. Create two masks, one for each of the following conditions:
    - Has stroke and BMI above the control group average
    - Has stroke and BMI below or at the control group average

In [8]:
# Split the stroke patients by BMI relative to the control-group average:
# strictly above -> "high", at or below -> "low". Both masks span every record.
had_stroke = stroke == 1
high_bmi_patient_mask = had_stroke & (bmi > avg_control_bmi)
low_bmi_patient_mask = had_stroke & (bmi <= avg_control_bmi)

##### iii. Print number and percentage of stroke patients with BMI above the average in the control group

In [9]:
# Count the True entries once, then report the count and its share of all
# stroke patients (rows of patient_group).
n_high_bmi_patients = np.sum(high_bmi_patient_mask)
print('Number of stroke patients with BMI above the average in the control group: %d'
      % n_high_bmi_patients)
print('Percentage of stroke patients with BMI above the average in the control group: %.1f%%'
      % (100.0*n_high_bmi_patients / patient_group.shape[0]))

Number of stroke patients with BMI above the average in the control group: 86
Percentage of stroke patients with BMI above the average in the control group: 47.8%


##### iv. Create two arrays:
    - One holding the indices of the stroke patients with a BMI above the control group average
    - One holding the indices of the stroke patients with a BMI at or below the control group average

In [10]:
# Positions of the True entries in each 1-D mask, as flat integer arrays.
# The masks span every record, so these are row positions in the full data array.
high_bmi_indices = np.flatnonzero(high_bmi_patient_mask)
low_bmi_indices = np.flatnonzero(low_bmi_patient_mask)

#### (b) Patients with high BMI and heart disease


In [11]:
# high_bmi_indices / low_bmi_indices are row positions in the FULL data array
# (they were derived from masks built over every record). Indexing the
# patient_group subset with them is only correct if every stroke row happens to
# sit at the top of the file; otherwise it raises IndexError or counts the wrong
# rows. Index the full-length heart_disease column (data[:, 3]) instead, which
# is correct regardless of row order.
# Summing the 0/1 heart-disease flags gives the number of affected patients.
print('Number of stroke patients with a BMI above the control group average that has a heart disease: '
      , heart_disease[high_bmi_indices].sum())
print('Number of stroke patients with a BMI at/below the control group average that has a heart disease: '
      , heart_disease[low_bmi_indices].sum())

Number of stroke patients with a BMI above the control group average that has a heart disease:  20
Number of stroke patients with a BMI at/below the control group average that has a heart disease:  16


### 3. Age

In [12]:
# Reorder all records by ascending age (column 1).
age_sorted_data = data[data[:, 1].argsort()]
# Ranks are 1-based but array positions are 0-based: the 900th youngest
# individual is at position 899 (position 900 would be the 901st youngest).
age_900th = age_sorted_data[900 - 1, 1]

In [13]:
# Keep every record whose age (column 1) equals the age found for the 900th youngest.
same_age_rows = age_sorted_data[:, 1] == age_900th
data_as_old = age_sorted_data[same_age_rows]

In [14]:
# Report the age found above and how many records share that exact age.
n_same_age = len(data_as_old)
print('Age of the 900th youngest: ', age_900th)
print('Number of individuals age %d: %d' % (age_900th, n_same_age))

Age of the 900th youngest:  34
Number of individuals age 34: 47
