Analytical Question: Is it possible to predict cardiovascular disease using the data points from the chronic illness data file?

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

import joblib
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [3]:
#import the sql package
from sqlalchemy import create_engine

In [4]:
# Connection settings for the PostgreSQL database.
# Values are read from the standard libpq environment variables so that no
# secret is stored in the notebook. Every setting except the password falls
# back to the value used by the original local installation.
import os

host = os.environ.get('PGHOST', r'127.0.0.1')  # local installation by default
db = os.environ.get('PGDATABASE', r'MSDS610')  # db for this class
user = os.environ.get('PGUSER', r'postgres')
pw = os.environ.get('PGPASSWORD', '')  # no default: export PGPASSWORD before starting Jupyter
port = os.environ.get('PGPORT', r'5432')  # default PostgreSQL port
schema = r'cleaned'  # schema where the cleaned data was last saved

In [5]:
# Open the connection.
# The URL is assembled with URL.create rather than string formatting so that
# special characters in the password (for example '@', '/' or ':') are escaped
# and cannot corrupt the connection string.
from sqlalchemy.engine import URL

db_conn = create_engine(
    URL.create(
        drivername="postgresql",
        username=user,
        password=pw,
        host=host,
        port=int(port),
        database=db,
    )
)

In [6]:
# Load the cleaned table from Postgres into a DataFrame.
table_name = r'tree'
schema = r'cleaned'  # schema the data was loaded into last week

# The loaded frame is kept under `disease` so later experiments can reuse it;
# `df` is a second name for the same object, not a copy.
disease = pd.read_sql_table(table_name, con=db_conn, schema=schema)
df = disease

In [7]:
# Summarise the loaded frame: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 577 entries, 0 to 576
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearStart        577 non-null    int64  
 1   YearEnd          577 non-null    int64  
 2   LocationAbbr     577 non-null    int64  
 3   DataValueTypeID  577 non-null    int64  
 4   DataValue        577 non-null    float64
 5   QuestionID       577 non-null    int64  
 6   gender           577 non-null    int64  
 7   Race             577 non-null    int64  
 8   Overall          577 non-null    int64  
 9   Age              577 non-null    int64  
dtypes: float64(1), int64(9)
memory usage: 45.2 KB


<div class="alert alert-info">The cleaning was done last week and the result saved in postgres.</div>

### 3-Way splitting of the data

In [9]:
# Choose the target column; every other column becomes a model feature.
# Race is tried first because last week's feature-importance ranking placed it at #1.
prediction_col = 'Race'

# All column names of the loaded frame
cols = df.columns
feature_cols = [name for name in cols if name != prediction_col]

y = df[prediction_col]
X = df[feature_cols]

In [10]:
# 3-way split, stage 1: keep 70% of the rows for training and hold back 30%
# (X_temp / y_temp) to be divided further. Stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [11]:
# Inspect the held-back feature frame: row count, dtypes and non-null counts.
X_temp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 174 entries, 481 to 488
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearStart        174 non-null    int64  
 1   YearEnd          174 non-null    int64  
 2   LocationAbbr     174 non-null    int64  
 3   DataValueTypeID  174 non-null    int64  
 4   DataValue        174 non-null    float64
 5   QuestionID       174 non-null    int64  
 6   gender           174 non-null    int64  
 7   Overall          174 non-null    int64  
 8   Age              174 non-null    int64  
dtypes: float64(1), int64(8)
memory usage: 13.6 KB


In [12]:
# First rows of the held-back features.
X_temp.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,DataValueTypeID,DataValue,QuestionID,gender,Overall,Age
481,2021,2021,0,4,0.0,1,0,0,0
18,2019,2019,0,0,870.0,7,0,0,0
162,2019,2019,0,4,64.3,1,0,0,0
65,2019,2019,0,4,48.0,0,0,0,0
114,2019,2019,0,3,32.1,2,1,0,0


In [13]:
# Same summary for the held-back target series.
y_temp.info()

<class 'pandas.core.series.Series'>
Index: 174 entries, 481 to 488
Series name: Race
Non-Null Count  Dtype
--------------  -----
174 non-null    int64
dtypes: int64(1)
memory usage: 2.7 KB


In [14]:
# First values of the held-back target series.
y_temp.head()

481    7
18     6
162    2
65     4
114    0
Name: Race, dtype: int64

In [15]:
# 3-way split, stage 2: divide the held-back rows evenly. The first half
# returned by train_test_split is used as the test set, the second half as
# the validation set. Stratified on the held-back target.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [16]:
# Shape and first rows of the test features.
print(X_test.shape)
X_test.head()

(87, 9)


Unnamed: 0,YearStart,YearEnd,LocationAbbr,DataValueTypeID,DataValue,QuestionID,gender,Overall,Age
369,2021,2021,0,3,0.0,3,0,0,0
204,2019,2019,0,2,126.2,7,1,0,0
232,2020,2020,0,0,17251.0,7,0,0,0
520,2021,2021,0,3,78.3,1,0,1,0
513,2021,2021,0,0,375476.0,6,0,1,0


In [17]:
# Shape and first rows of the validation features.
print(X_val.shape)
X_val.head()

(87, 9)


Unnamed: 0,YearStart,YearEnd,LocationAbbr,DataValueTypeID,DataValue,QuestionID,gender,Overall,Age
164,2019,2019,0,1,40.6,5,0,0,0
220,2019,2019,0,3,81.2,1,1,0,0
10,2019,2019,0,1,26.47,4,2,0,0
219,2019,2019,0,3,40.1,2,0,0,3
388,2021,2021,0,0,695547.0,7,0,1,0


In [18]:
# Shape and first values of the test target.
print(y_test.shape)
y_test.head()

(87,)


369    1
204    0
232    7
520    0
513    0
Name: Race, dtype: int64

In [19]:
# Shape and first values of the validation target.
print(y_val.shape)
y_val.head()

(87,)


164    6
220    0
10     0
219    0
388    0
Name: Race, dtype: int64

Okay, build the first model and save it.
### Building the Model
Following the 3-way split example workbook.

In [21]:
# Fit a 100-tree random forest on the training split; the fixed random_state
# keeps the fitted forest reproducible between runs.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [22]:
# Predict the target class for every row of the test features.
pred_X_test = model.predict(X_test)

In [23]:
# Accuracy of the predictions against the true test labels.
# NOTE(review): the validation split (X_val / y_val) is never scored in the
# visible cells; consider using it for model comparison and keeping the test
# set for the final check.
accuracy_score(y_test,pred_X_test)

0.41379310344827586

<div class="alert alert-info">I'll save the different models to play with.</div>

### Saving the Optimal Model

In [25]:
# Persist the fitted forest to a joblib file in the working directory.
model_filename = "dis1_rf_model.joblib"
joblib.dump(value=model, filename=model_filename)

['dis1_rf_model.joblib']

Try a different prediction target (gender)

In [27]:
# Second experiment: same loaded frame, but predict gender this time.
df1 = disease  # second name for the loaded frame, not a copy
prediction_col = 'gender'

# Every column other than the target is used as a feature
cols = df1.columns
feature_cols = [name for name in cols if name != prediction_col]

y = df1[prediction_col]
X = df1[feature_cols]

In [28]:
# 3-way split, stage 1: keep 70% of the rows for training and hold back 30%
# (X_temp / y_temp) to be divided further. Stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [29]:
# Inspect the held-back feature frame: row count, dtypes and non-null counts.
X_temp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 174 entries, 471 to 25
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YearStart        174 non-null    int64  
 1   YearEnd          174 non-null    int64  
 2   LocationAbbr     174 non-null    int64  
 3   DataValueTypeID  174 non-null    int64  
 4   DataValue        174 non-null    float64
 5   QuestionID       174 non-null    int64  
 6   Race             174 non-null    int64  
 7   Overall          174 non-null    int64  
 8   Age              174 non-null    int64  
dtypes: float64(1), int64(8)
memory usage: 13.6 KB


In [30]:
# First rows of the held-back features.
X_temp.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,DataValueTypeID,DataValue,QuestionID,Race,Overall,Age
471,2021,2021,0,1,234.1,7,0,0,0
403,2021,2021,0,1,54.6,5,2,0,0
131,2019,2019,0,3,31.8,3,0,0,0
58,2019,2019,0,4,28.9,3,0,1,0
530,2021,2021,0,0,20389.0,5,0,0,3


In [31]:
# 3-way split, stage 2: divide the held-back rows evenly. The first half
# returned by train_test_split is used as the test set, the second half as
# the validation set. Stratified on the held-back target.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [32]:
# Shape and first rows of the test features.
print(X_test.shape)
X_test.head()

(87, 9)


Unnamed: 0,YearStart,YearEnd,LocationAbbr,DataValueTypeID,DataValue,QuestionID,Race,Overall,Age
29,2019,2019,0,2,17.8,5,5,0,0
70,2019,2019,0,0,12956.0,4,8,0,0
367,2021,2021,0,4,57.3,1,0,0,0
287,2020,2020,0,2,173.5,7,6,0,0
72,2019,2019,0,1,200.8,7,0,1,0


In [33]:
# Shape and first rows of the validation features.
print(X_val.shape)
X_val.head()

(87, 9)


Unnamed: 0,YearStart,YearEnd,LocationAbbr,DataValueTypeID,DataValue,QuestionID,Race,Overall,Age
378,2021,2021,0,0,380946.0,4,0,0,0
260,2020,2020,0,1,1.6,5,0,0,2
438,2021,2021,0,2,155.2,7,1,0,0
250,2020,2020,0,0,46470.0,6,2,0,0
318,2020,2020,0,2,29.91,4,4,0,0


In [34]:
# Shape and first values of the test target.
print(y_test.shape)
y_test.head()

(87,)


29     0
70     0
367    2
287    0
72     0
Name: gender, dtype: int64

In [35]:
# Shape and first values of the validation target.
print(y_val.shape)
y_val.head()

(87,)


378    1
260    0
438    0
250    0
318    0
Name: gender, dtype: int64

In [None]:
# Fit a 100-tree random forest on the training split; the fixed random_state
# keeps the fitted forest reproducible between runs.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Predict the target class for every row of the test features.
pred_X_test = model.predict(X_test)

In [None]:
# Accuracy of the predictions against the true test labels.
accuracy_score(y_test,pred_X_test)

In [None]:
# Persist the fitted forest to a joblib file in the working directory.
model_filename = "dis2_rf_model.joblib"
joblib.dump(value=model, filename=model_filename)

Second model saved.
Try a final prediction target (Age).

In [None]:
# Third experiment: same loaded frame, but predict Age this time.
df2 = disease  # second name for the loaded frame, not a copy
prediction_col = 'Age'

# Every column other than the target is used as a feature
cols = df2.columns
feature_cols = [name for name in cols if name != prediction_col]

y = df2[prediction_col]
X = df2[feature_cols]

In [None]:
# 3-way split, stage 1: keep 70% of the rows for training and hold back 30%
# (X_temp / y_temp) to be divided further. Stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# First rows of the held-back features.
X_temp.head()

In [None]:
# 3-way split, stage 2: divide the held-back rows evenly. The first half
# returned by train_test_split is used as the test set, the second half as
# the validation set. Stratified on the held-back target.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [None]:
# Shape and first rows of the test features.
print(X_test.shape)
X_test.head()

In [None]:
# Shape and first rows of the validation features.
print(X_val.shape)
X_val.head()

In [None]:
# Shape and first values of the test target.
print(y_test.shape)
y_test.head()

In [None]:
# Shape and first values of the validation target.
print(y_val.shape)
y_val.head()

In [None]:
# Fit a 100-tree random forest on the training split; the fixed random_state
# keeps the fitted forest reproducible between runs.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Predict the target class for every row of the test features.
pred_X_test = model.predict(X_test)

In [None]:
# Accuracy of the predictions against the true test labels.
accuracy_score(y_test,pred_X_test)

In [None]:
# Persist the fitted forest to a joblib file in the working directory.
model_filename = "dis3_rf_model.joblib"
joblib.dump(value=model, filename=model_filename)

Initial conclusions:
<p>I am shamelessly using the code from this week's sample 3-way split.
<br>Age seems to be the best model so far.</p>