In [1]:
import numpy as np
from helpers import *
from implementations import *
np.random.seed(1)

In [2]:
x_train, x_test, y_train, train_ids, test_ids = load_csv_data('data/dataset/')

In [3]:
#check that all the data loaded in
print('x_train shape before cleaning is:', x_train.shape)
print('x_test shape before cleaning is:',x_test.shape)
print('y_train shape before cleaning is:',y_train.shape)

print('train_ids:', train_ids.shape)
print('test_ids:', test_ids.shape)


x_train shape before cleaning is: (328135, 321)
x_test shape before cleaning is: (109379, 321)
y_train shape before cleaning is: (328135,)
train_ids: (328135,)
test_ids: (109379,)


In [4]:
# do smart stuff: focus on the most important parameters
# import pandas as pd

# df_x_train = pd.read_csv("data/dataset/x_train.csv")
# # List of column names
# column_names = ['_RFHYPE5',
#                                         'TOLDHI2', 
#                                          '_CHOLCHK',
#                                         '_BMI5',
#                                          'SMOKE100',
#                                          'CVDSTRK3', 
#                                         'DIABETE3',
#                                          '_TOTINDA',
#                                          '_FRTLT1', 
#                                         '_VEGLT1',
#                                          '_RFDRHV5',
#                                          'HLTHPLN1', 
#                                         'MEDCOST',
#                                          'GENHLTH', 
#                                         'MENTHLTH', 
#                                         'PHYSHLTH', 
#                                         'DIFFWALK',
#                                          'SEX', 
#                                         '_AGEG5YR', 
#                                         'EDUCA', 
#                                         'INCOME2' ]

# # Get the indices of the specified column names
# column_indices = [df_x_train.columns.get_loc(col) for col in column_names]

# print("Indices of specified columns:", column_indices)

# output -> Indices of specified columns: [233, 39, 234, 254, 73, 40, 49, 285, 279, 280, 266, 31, 33, 27, 29, 28, 70, 51, 247, 53, 61]
# so for the csv imported it needs to be that list minus because because the id column is loaded separately (train_ids)

In [5]:
# y_train = np.reshape(y_train, (328135, 1))
# train_ids = np.reshape(train_ids, (328135, 1))
# train_ids.shape

In [6]:
# before further cleaning the data it is better to conatecate them, this way we don't need to track what rows we delete
# x_train shape before cleaning is: (328135, 321)
# x_test shape before cleaning is: (109379, 321)
# y_train shape before cleaning is: (328135,)
# train_ids: (328135,)
# test_ids: (109379,)


# reshaping as preparation before concatenate
y_train = np.reshape(y_train, (328135, 1))
train_ids = np.reshape(train_ids, (328135, 1))
test_ids = np.reshape(test_ids, (109379, 1))

x_train_c = np.concatenate((train_ids, x_train, y_train), axis=1)
x_test_c = np.concatenate((test_ids, x_test), axis=1)

print('The order is id: x : y')
print(x_train_c.shape)
print(x_test_c.shape)

# before training the data we should separate the arrays again

The order is id: x : y
(328135, 323)
(109379, 322)


## Cleaning data

In [38]:
# focus only on the 21 most important as specified in https://medium.com/@alexteboul17/building-predictive-models-for-heart-disease-using-the-2015-behavioral-risk-factor-surveillance-b786368021ab

focus_variables = np.array([233, 39, 234, 254, 73, 40, 49, 285, 279, 280, 266, 31, 33, 27, 29, 28, 70, 51, 247, 53, 61])

focus_variables_train = np.concatenate((np.array([0]), focus_variables, np.array([322])), axis=0) # don't forget about stored id and y at the beginning and end
focus_variables_test = np.concatenate((np.array([0]), focus_variables), axis=0) # don't forget about stored id at beginning

print(focus_variables_train)
print(focus_variables_test)

x_train_sel = x_train_c[:, focus_variables_train]
x_test_sel = x_test_c[:, focus_variables_test]

print(x_train_sel[0:10, :]) # correct
print(x_train_sel.shape)
print(x_test_sel.shape)

# remove nan values for train
nan_rows_train = np.isnan(x_train_sel).any(axis=1)
x_train_cl = x_train_sel[~nan_rows_train]

# remove nan values for test
nan_rows_test = np.isnan(x_test_sel).any(axis=1)
x_test_cl = x_test_sel[~nan_rows_test]

print('x_train_cl is:', x_train_cl.shape)
print('x_test_cl is:',x_test_cl.shape)

print(x_train_cl[0:100, :]) # correct




[  0 233  39 234 254  73  40  49 285 279 280 266  31  33  27  29  28  70
  51 247  53  61 322]
[  0 233  39 234 254  73  40  49 285 279 280 266  31  33  27  29  28  70
  51 247  53  61]
[[ 0.    1.    2.    1.   20.78  1.    2.    3.    1.    2.    1.    1.
   1.    2.    2.    5.    1.    2.    2.    8.    5.    8.   -1.  ]
 [ 1.    2.    2.    1.   28.7   1.    2.    3.    9.    9.    9.    9.
   1.    2.    4.   88.   88.    2.    1.    8.    4.    7.   -1.  ]
 [ 2.    1.    1.    1.     nan  1.    2.    3.    1.    1.    1.    1.
   1.    2.    2.   77.   77.    2.    2.   10.    6.   99.   -1.  ]
 [ 3.    1.    2.    2.   27.96  1.    2.    3.    1.    2.    1.    1.
   1.    2.    1.   88.   88.    2.    2.   10.    4.   99.   -1.  ]
 [ 4.    1.    2.    1.   24.39  1.    2.    3.    1.    1.    1.    9.
   1.    2.    2.   88.   88.    2.    1.   14.    9.   99.   -1.  ]
 [ 5.    2.    1.    1.   30.72  2.    2.    1.    2.    1.    2.    1.
   1.    2.    3.    5.    2.    2.  

## Further Cleaning -> Make it clinical

For every category the data is made better. make it boolean or ordinal, remove large numbers like 88 that often indicates 0

In [39]:
#1 _RFHYPE5 - High Blood pressure
#Change 1 to 0 so it represetnts No high blood pressure and 2 to 1 so it represents high blood pressure
column_index = 1  
# Replace 1 with 0
x_train_cl[x_train_cl[:, column_index] == 1, column_index] = 0

# Replace 2 with 1
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 1

print(x_train_cl.shape)
#print(x_train_cl[0:20, :]) # correct

(257733, 23)


In [40]:
#2 TOLDHI2 - Ever Told Blood Cholesterol High
# Change 2 to 0 because it is No
# Remove all 7 (dont knows)
# Remove all 9 (refused)

column_index = 2 
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 0

# Remove all rows where 'TOLDHI2' is 7 (dont knows)
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 7]

# Remove all rows where 'TOLDHI2' is 9 (refused)
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]
print(x_train_cl.shape)

(255606, 23)


In [41]:
#3 _CHOLCHK - How Long since Cholesterol Checked
# I choose to say last year = yes thus 1. If not within last year I give zero
# Change 4 to 0 ,3 to 0, and 2 to 0 for Not checked cholesterol in past 5 years
# remove 7
# Remove 9

column_index = 3 

x_train_cl[x_train_cl[:, column_index] == 4, column_index] = 0
x_train_cl[x_train_cl[:, column_index] == 3, column_index] = 0
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 0

# Remove all rows where '_CHOLCHK' is 7 and 9
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 7]
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]
print(x_train_cl.shape)

(252297, 23)


In [11]:
#5 SMOKE100 - Smoked at Least 100 Cigarettes
# Change 2 to 0 because it is No
# Remove all 7 (dont knows)
# Remove all 9 (refused)

column_index = 5  # Replace with the actual index
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 0

# Remove all rows where 'SMOKE100' is 7
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 7]

# Remove all rows where 'SMOKE100' is 9
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]
print(x_train_cl.shape)

(250810, 23)


In [12]:
#6 CVDSTRK3 - Ever Diagnosed with a Stroke
# Change 2 to 0 because it is No
# Remove all 7 (dont knows)
# Remove all 9 (refused)

column_index = 6  # Replace with the actual index
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 0

# Remove all rows where 'CVDSTRK3' is 7
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 7]

# Remove all rows where 'CVDSTRK3' is 9
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]
print(x_train_cl.shape)

(250252, 23)


In [13]:
#7 DIABETE3 - Ever told) you have diabetes
# going to make this ordinal. 0 is for no diabetes or only during pregnancy, 1 is for pre-diabetes or borderline diabetes, 2 is for yes diabetes
# Remove all 7 (dont knows)
# Remove all 9 (refused)
column_index = 7  
# Replace 2 and 3 with 0, 1 with 2, and 4 with 1
x_train_cl[x_train_cl[:, column_index] == 1, column_index] = 2
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 0
x_train_cl[x_train_cl[:, column_index] == 3, column_index] = 0
x_train_cl[x_train_cl[:, column_index] == 4, column_index] = 1

# Remove all rows where 'DIABETE3' is 7
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 7]

# Remove all rows where 'DIABETE3' is 9
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]
print(x_train_cl.shape)

(249999, 23)


In [14]:
#8 _TOTINDA - Leisure Time Physical Activity Calculated Variable
# 1 for physical activity
# change 2 to 0 for no physical activity
# Remove all 9 (don't know/refused)

column_index = 8 
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 0

# Remove all rows where '_TOTINDA' is 9
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]
print(x_train_cl.shape)

(238445, 23)


In [15]:
#9 _FRTLT1 - Consume Fruit 1 or more times per day
# Change 2 to 0. this means no fruit consumed per day. 1 will mean consumed 1 or more pieces of fruit per day
# remove all dont knows and missing 9

column_index = 9  
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 0

# Remove all rows where '_FRTLT1' is 9
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]
print(x_train_cl.shape)

(232760, 23)


In [16]:
#10 _VEGLT1 - Consume Vegetables 1 or more times per day
# Change 2 to 0. this means no vegetables consumed per day. 1 will mean consumed 1 or more pieces of vegetable per day
# remove all dont knows and missing 9

column_index = 10  # Replace with the actual index
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 0

# Remove all rows where '_VEGLT1' is 9
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]

print(x_train_cl.shape)

(227013, 23)


In [17]:
#11 _RFDRHV5 Heavy Alcohol Consumption Calculated Variable

# Change 1 to 0 (1 was no for heavy drinking). change all 2 to 1 (2 was yes for heavy drinking)
# remove all dont knows and missing 9
column_index = 11

x_train_cl[x_train_cl[:, column_index] == 1, column_index] = 0
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 1

# Remove all rows where '_RFDRHV5' is 9
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]

print(x_train_cl.shape)

(224360, 23)


In [18]:
#12 HLTHPLN1 - Have any health care coverage
# 1 is yes, change 2 to 0 because it is No health care access
# remove 7 and 9 for don't know or refused

column_index = 12  # Replace with the actual index
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 0

# Remove all rows where 'HLTHPLN1' is 7
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 7]

# Remove all rows where 'HLTHPLN1' is 9
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]
print(x_train_cl.shape)

(223977, 23)


In [19]:
#13 MEDCOST - Could Not See Doctor Because of Cost
# Change 2 to 0 for no, 1 is already yes
# remove 7 for don/t know and 9 for refused

column_index = 13  # Replace with the actual index
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 0

# Remove all rows where 'MEDCOST' is 7
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 7]

# Remove all rows where 'MEDCOST' is 9
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]
print(x_train_cl.shape)

(223658, 23)


In [20]:
#14 GENHLTH - General Health
# This is an ordinal variable that I want to keep (1 is Excellent -> 5 is Poor)
# Remove 7 and 9 for don't know and refused
column_index = 14  # Replace with the actual index

# Remove all rows where 'GENHLTH' is 7
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 7]

# Remove all rows where 'GENHLTH' is 9
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]
print(x_train_cl.shape)

(223264, 23)


In [21]:
#15 MENTHLTH - Number of Days Mental Health Not Good
# already in days so keep that, scale will be 0-30
# change 88 to 0 because it means none (no bad mental health days)
# remove 77 and 99 for don't know not sure and refused

# Assuming the column index for 'MENTHLTH' is 16
column_index = 15  # Replace with the actual index

# Replace 88 with 0 in the 'MENTHLTH' column
x_train_cl[x_train_cl[:, column_index] == 88, column_index] = 0

# Remove all rows where 'MENTHLTH' is 77
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 77]

# Remove all rows where 'MENTHLTH' is 99
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 99]
print(x_train_cl.shape)

(220922, 23)


In [22]:
#16 PHYSHLTH - PHYSHLTH
# already in days so keep that, scale will be 0-30
# change 88 to 0 because it means none (no bad mental health days)
# remove 77 and 99 for don't know not sure and refused

column_index = 16  # Replace with the actual index

# Replace 88 with 0 in the 'PHYSHLTH' column
x_train_cl[x_train_cl[:, column_index] == 88, column_index] = 0

# Remove all rows where 'PHYSHLTH' is 77
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 77]

# Remove all rows where 'PHYSHLTH' is 99
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 99]
print(x_train_cl.shape)

(218148, 23)


In [23]:
#17 DIFFWALK - Difficulty Walking or Climbing Stairs
# change 2 to 0 for no. 1 is already yes
# remove 7 and 9 for don't know not sure and refused

column_index = 17  # Replace with the actual index

# Replace 2 with 0 in the 'DIFFWALK' column
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 0

# Remove all rows where 'DIFFWALK' is 7
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 7]

# Remove all rows where 'DIFFWALK' is 9
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]

print(x_train_cl.shape)

(217551, 23)


In [24]:
#18 SEX - Respondents Sex
# in other words - is respondent male (somewhat arbitrarily chose this change because men are at higher risk for heart disease)
# change 2 to 0 (female as 0). Male is 1
column_index = 18  # Replace with the actual index

# Replace 2 with 0 in the 'SEX' column
x_train_cl[x_train_cl[:, column_index] == 2, column_index] = 0

print(x_train_cl.shape)

(217551, 23)


In [25]:
#19 _AGEG5YR - Reported age in five-year age categories calculated variable
# already ordinal. 1 is 18-24 all the way up to 13 wis 80 and older. 5 year increments.
# remove 14 because it is don't know or missing
column_index = 19  # Replace with the actual index

# Remove all rows where '_AGEG5YR' is 14
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 14]
print(x_train_cl.shape)

(216447, 23)


In [26]:
#20 EDUCA - Education Level
# This is already an ordinal variable with 1 being never attended school or kindergarten only up to 6 being college 4 years or more
# Scale here is 1-6
# Remove 9 for refused:

column_index = 20  # Replace with the actual index

# Remove all rows where 'EDUCA' is 9
x_train_cl = x_train_cl[x_train_cl[:, column_index] != 9]

print(x_train_cl.shape)

(216195, 23)


In [27]:
#21 INCOME2  - Income Level
# Variable is already ordinal with 1 being less than $10,000 all the way up to 8 being $75,000 or more
# Remove 77 and 99 for don't know and refused

column_index = 21 # Replace with the actual index

# Remove all rows where 'INCOME2' is 77 or 99
x_train_cl = x_train_cl[(x_train_cl[:, column_index] != 77) & (x_train_cl[:, column_index] != 99)]



print(x_train_cl.shape)

(190560, 23)


In [28]:
# He said we should also look into race in the article. This we can do next time because it takes some ordering from the beginning.

!!!!!! Warning: x_test_cl needs to be cleaned further like x_train_cl !!!!!!!!

# Train

In [37]:
# Train our model
# (y, tx, initial_w, max_iters, gamma)
# I have to take the array apart again: the first column is id, the last column is the y
tx = x_train_cl[:, 1:-1]
y = x_train_cl[:, -1]
#initial_w = np.random.randn(tx.shape[1])*0.01
initial_w = np.zeros(tx.shape[1])

print('checking shapes:')
print('tx is:', tx.shape)
print('y is:', y.shape)
print('w is:', initial_w.shape)
max_iters = 25
gamma = 0.005

w, loss = logistic_regression(y, tx, initial_w, max_iters, gamma)
print('The loss we find is:', loss)

checking shapes:
tx is: (190560, 21)
y is: (190560,)
w is: (21,)
The loss we find is: -79.561393574746


!!!!!! Warning: x_test_cl needs to be cleaned further like x_train_cl !!!!!!!! 

In [30]:
# Function to predict the labels for the test data
def predict_labels(w, data):
    y_pred = sigmoid(np.dot(data, w))
    y_pred[np.where(y_pred <= 0.5)] = -1
    y_pred[np.where(y_pred > 0.5)] = 1
    return y_pred

In [31]:
x_test_sub = x_test_cl[:, 1:]
print(x_test_sub.shape)

(85873, 21)


In [32]:
# Generate predictions and save ouput in csv format for submission:
OUTPUT_PATH = 'data/submission.csv'

x_test_sub = x_test_cl[:, 1:]
print('x_test_sub shape is:', x_test_sub.shape)
y_pred = predict_labels(w, x_test_sub)
create_csv_submission(x_test_cl[:, 0], y_pred, OUTPUT_PATH)

x_test_sub shape is: (85873, 21)
