<a href="https://colab.research.google.com/github/kelseymatsik/project_chd/blob/main/tori_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### GOAL: Build predictive algorithms that predict the likelihood a person develops coronary heart disease (CHD).

In [1]:
! git clone https://github.com/kelseymatsik/project_chd

Cloning into 'project_chd'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 25 (delta 11), reused 1 (delta 0), pack-reused 0[K
Receiving objects: 100% (25/25), 650.32 KiB | 3.10 MiB/s, done.
Resolving deltas: 100% (11/11), done.


In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("./project_chd/fhs_train.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1267,1,58,1.0,0,0.0,0.0,0,0,0,220.0,143.0,104.0,29.85,75,87.0,1
1,1209,0,40,1.0,1,15.0,0.0,0,0,0,199.0,122.0,82.0,22.16,85,77.0,0
2,2050,0,52,1.0,0,0.0,0.0,0,0,0,275.0,112.0,71.0,25.68,80,,0
3,1183,1,38,2.0,1,43.0,0.0,0,1,0,170.0,130.0,94.0,23.9,110,75.0,0
4,3225,0,43,1.0,0,0.0,0.0,0,0,0,202.0,124.0,92.0,21.26,75,74.0,0


In [4]:
# Renaming columns to be more intuitive
df = df.rename(columns = {'prevalentStroke': 'stroke',
                          'prevalentHyp': 'hypertensive',
                          'TenYearCHD': 'tenYearRisk'})

# Dropping 'Unnamed: 0' column
df.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,stroke,hypertensive,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,tenYearRisk
0,1,58,1.0,0,0.0,0.0,0,0,0,220.0,143.0,104.0,29.85,75,87.0,1
1,0,40,1.0,1,15.0,0.0,0,0,0,199.0,122.0,82.0,22.16,85,77.0,0
2,0,52,1.0,0,0.0,0.0,0,0,0,275.0,112.0,71.0,25.68,80,,0
3,1,38,2.0,1,43.0,0.0,0,1,0,170.0,130.0,94.0,23.9,110,75.0,0
4,0,43,1.0,0,0.0,0.0,0,0,0,202.0,124.0,92.0,21.26,75,74.0,0


In [6]:
df.columns

Index(['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'stroke', 'hypertensive', 'diabetes', 'totChol', 'sysBP', 'diaBP',
       'BMI', 'heartRate', 'glucose', 'tenYearRisk'],
      dtype='object')

In [7]:
# Setting temporary variable names
sex = df['sex']
age = df['age']
education = df['education']
currentSmoker = df['currentSmoker']
cigsPerDay = df['cigsPerDay']
BPMeds = df['BPMeds']
stroke = df['stroke']
hypertensive = df['hypertensive']
diabetes = df['diabetes']
totChol = df['totChol']
sysBP = df['sysBP']
diaBP = df['diaBP']
BMI = df['BMI']
heartRate = df['heartRate']
glucose = df['glucose']
glucose = df['glucose']

In [15]:
# Looking at value counts for each variable

#for var in df.columns:
#  print(df[var].value_counts())

# cigsPerDay is the number of cigarettes smoked each day, but only has values of 0 and 1
# Not sure what that means...

In [9]:
for var in df.columns:
  print(df[var].isna().sum()/len(df))

0.0
0.0
0.026729559748427674
0.0
0.007547169811320755
0.011635220125786163
0.0
0.0
0.0
0.012264150943396227
0.0
0.0
0.0047169811320754715
0.0
0.08962264150943396
0.0


In [10]:
# Check for missings
for var in df.columns:
    missing_percentage = df[var].isna().sum() / len(df) * 100
    print(f"{var}: {missing_percentage:.2f}% missing")
# Overall, the data frame does not have many missings

sex: 0.00% missing
age: 0.00% missing
education: 2.67% missing
currentSmoker: 0.00% missing
cigsPerDay: 0.75% missing
BPMeds: 1.16% missing
stroke: 0.00% missing
hypertensive: 0.00% missing
diabetes: 0.00% missing
totChol: 1.23% missing
sysBP: 0.00% missing
diaBP: 0.00% missing
BMI: 0.47% missing
heartRate: 0.00% missing
glucose: 8.96% missing
tenYearRisk: 0.00% missing


In [27]:
# Imputation for education, cigsPerDay, BPMeds, totChol, BMI, glucose
df['education'].value_counts()
# Only 1-4, so don't impute with averages here ?
# OK b/c not much missing

1.0    1310
2.0     949
3.0     495
4.0     341
Name: education, dtype: int64

In [17]:
df['cigsPerDay'].value_counts().head(5)

0.0     1619
20.0     528
30.0     172
15.0     159
10.0     108
Name: cigsPerDay, dtype: int64

In [26]:
df['BPMeds'].value_counts()
# Only 1 or 0, so don't impute with averages here ?
# OK b/c not much missing

0.0    3050
1.0      93
Name: BPMeds, dtype: int64

In [20]:
df['totChol'].value_counts().head(5)

240.0    63
220.0    50
232.0    49
260.0    48
210.0    45
Name: totChol, dtype: int64

In [22]:
df['BMI'].value_counts().head(5)

23.48    17
22.91    15
22.54    12
23.09    12
25.09    11
Name: BMI, dtype: int64

In [24]:
df['glucose'].value_counts().head(5)

75.0    147
77.0    126
73.0    119
80.0    117
70.0    113
Name: glucose, dtype: int64

In [11]:
# Determining k
## ASK: How do you find the optimal number of k's if you have NA's?

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

X = df.iloc[:, :-1]
y = df['tenYearRisk']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=3402)

N_train = len(y_train)
N_test = len(y_test)

## Solve for k that maximizes accuracy:
k_bar = 20 # Number of k's to try
Acc = np.zeros(k_bar) # We'll store the accuracy here

for k in range(k_bar):
    model = KNeighborsClassifier(n_neighbors=k+1) # Create a sk model for k
    fitted_model = model.fit(X_train.values,y_train) # Train the model on our data
    y_hat = fitted_model.predict(X_test.values) # Predict values for test set
    Acc[k] = np.sum(y_hat == y_test)/N_test # Accuracy on testing data

Acc_max = np.max(Acc) # Find lowest recorded SSE
max_index = np.where(Acc==Acc_max) # Find the indices of SSE that equal the minimum
k_star = max_index[0]+1 # Find the optimal value of k; why index+1?
print(k_star)

ValueError: Input X contains NaN.
KNeighborsClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
optimal_neighbors

In [None]:
# Use https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html

In [None]:
df.columns

In [None]:
# Data types of missings
print('education: ', education.dtype)
print('cigsPerDay: ', cigsPerDay.dtype)
print('BPMeds: ', BPMeds.dtype)
print('totChol: ', totChol.dtype)
print('BMI: ', BMI.dtype)
print('glucose: ', glucose.dtype)

In [None]:
# Examining missings
BPMeds_missing = df[df['BPMeds'].isnull()]
# len(BPMeds_missing)
totChol_missing = df[df['totChol'].isnull()]