In [8]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
# reference: random_forest_solution.ipynb
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix




In [24]:
# other paths
# reference: pandas_reading_files_solution.ipynb
PATHtopreprocesseddataset = Path("Resources/preprocesseddataset.csv")
PATHtofeatureimportances = Path("Resources/featureimportances.csv")

In [3]:
# Load the dataset
file_path = Path("Resources/originaldataset.csv")
data = pd.read_csv(file_path)



In [4]:
# Display the first few rows
data.head()

Unnamed: 0,PatientID,State,Sex,GeneralHealth,AgeCategory,HeightInMeters,WeightInKilograms,BMI,HadHeartAttack,HadAngina,...,ECigaretteUsage,ChestScan,RaceEthnicityCategory,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,1,Alabama,Female,Fair,Age 75 to 79,1.63,84.82,32.099998,0,1,...,Never used e-cigarettes in my entire life,1,"White only, Non-Hispanic",0,0,0,1,"No, did not receive any tetanus shot in the pa...",0,1
1,2,Alabama,Female,Very good,Age 65 to 69,1.6,71.669998,27.99,0,0,...,Never used e-cigarettes in my entire life,0,"White only, Non-Hispanic",0,0,1,1,"Yes, received Tdap",0,0
2,3,Alabama,Male,Excellent,Age 60 to 64,1.78,71.209999,22.530001,0,0,...,Never used e-cigarettes in my entire life,0,"White only, Non-Hispanic",1,0,0,0,"Yes, received tetanus shot but not sure what type",0,0
3,4,Alabama,Male,Very good,Age 70 to 74,1.78,95.25,30.129999,0,0,...,Never used e-cigarettes in my entire life,0,"White only, Non-Hispanic",0,0,1,1,"Yes, received tetanus shot but not sure what type",0,0
4,5,Alabama,Female,Good,Age 50 to 54,1.68,78.019997,27.76,0,0,...,Never used e-cigarettes in my entire life,1,"Black only, Non-Hispanic",0,0,1,0,"No, did not receive any tetanus shot in the pa...",0,0


In [5]:
# Check for missing values and data types
print(data.info())  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237630 entries, 0 to 237629
Data columns (total 35 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   PatientID                  237630 non-null  int64  
 1   State                      237630 non-null  object 
 2   Sex                        237630 non-null  object 
 3   GeneralHealth              237630 non-null  object 
 4   AgeCategory                237630 non-null  object 
 5   HeightInMeters             237630 non-null  float64
 6   WeightInKilograms          237630 non-null  float64
 7   BMI                        237630 non-null  float64
 8   HadHeartAttack             237630 non-null  int64  
 9   HadAngina                  237630 non-null  int64  
 10  HadStroke                  237630 non-null  int64  
 11  HadAsthma                  237630 non-null  int64  
 12  HadSkinCancer              237630 non-null  int64  
 13  HadCOPD                    23

In [6]:
# verifying values in column
# reference: cc_preprocessing_solution.ipynb
data["HadDiabetes"].value_counts()

HadDiabetes
No                                         197463
Yes                                         33055
No, pre-diabetes or borderline diabetes      5211
Yes, but only during pregnancy (female)      1901
Name: count, dtype: int64

In [7]:
# encoding target with function for binary values
# reference: cc_preprocessing_solution.ipynb
def encode_HadDiabetes(HadDiabetes):
    """
    This function encodes the column named HadDiabetes by setting the 2 values with "Yes" as 1 and the 2 values with "No" as 0.
    """
    if HadDiabetes == "Yes":
        return 1
    if HadDiabetes == "Yes, but only during pregnancy (female)":
        return 1
    if HadDiabetes == "No":
        return 0
    if HadDiabetes == "No, pre-diabetes or borderline diabetes":
        return 0

data["HadDiabetes"] = data["HadDiabetes"].apply(encode_HadDiabetes)

data["HadDiabetes"].value_counts()

HadDiabetes
0    202674
1     34956
Name: count, dtype: int64

In [8]:
# dropping unnecessary column
data = data.drop(columns=["PatientID"])
data.head()

Unnamed: 0,State,Sex,GeneralHealth,AgeCategory,HeightInMeters,WeightInKilograms,BMI,HadHeartAttack,HadAngina,HadStroke,...,ECigaretteUsage,ChestScan,RaceEthnicityCategory,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Fair,Age 75 to 79,1.63,84.82,32.099998,0,1,0,...,Never used e-cigarettes in my entire life,1,"White only, Non-Hispanic",0,0,0,1,"No, did not receive any tetanus shot in the pa...",0,1
1,Alabama,Female,Very good,Age 65 to 69,1.6,71.669998,27.99,0,0,0,...,Never used e-cigarettes in my entire life,0,"White only, Non-Hispanic",0,0,1,1,"Yes, received Tdap",0,0
2,Alabama,Male,Excellent,Age 60 to 64,1.78,71.209999,22.530001,0,0,0,...,Never used e-cigarettes in my entire life,0,"White only, Non-Hispanic",1,0,0,0,"Yes, received tetanus shot but not sure what type",0,0
3,Alabama,Male,Very good,Age 70 to 74,1.78,95.25,30.129999,0,0,0,...,Never used e-cigarettes in my entire life,0,"White only, Non-Hispanic",0,0,1,1,"Yes, received tetanus shot but not sure what type",0,0
4,Alabama,Female,Good,Age 50 to 54,1.68,78.019997,27.76,0,0,0,...,Never used e-cigarettes in my entire life,1,"Black only, Non-Hispanic",0,0,1,0,"No, did not receive any tetanus shot in the pa...",0,0


In [9]:
# verifying values in column
# reference: cc_preprocessing_solution.ipynb
data["State"].value_counts()

State
Washington              14241
Maryland                 8817
Minnesota                8712
Ohio                     8700
New York                 8625
Texas                    7267
Florida                  7124
Kansas                   6000
Wisconsin                5890
Maine                    5709
Iowa                     5492
Indiana                  5393
South Carolina           5360
Virginia                 5358
Arizona                  5302
Hawaii                   5262
Utah                     5212
Michigan                 5206
Massachusetts            5164
Nebraska                 5008
Colorado                 4973
Georgia                  4860
California               4801
Connecticut              4765
Vermont                  4569
South Dakota             4280
Montana                  4155
Missouri                 4042
New Jersey               3833
New Hampshire            3564
Puerto Rico              3550
Idaho                    3394
Alaska                   3100
Rhod

In [10]:
# getting dummies for column with objects
# reference: cc_preprocessing_solution.ipynb
State_dummies = pd.get_dummies(data["State"])
State_dummies.head()

Unnamed: 0,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,...,Tennessee,Texas,Utah,Vermont,Virgin Islands,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:
# concatinating dummies
# reference: cc_preprocessing_solution.ipynb
data = pd.concat([data, State_dummies], axis=1)
data = data.drop(columns=["State"])
data.head()

Unnamed: 0,Sex,GeneralHealth,AgeCategory,HeightInMeters,WeightInKilograms,BMI,HadHeartAttack,HadAngina,HadStroke,HadAsthma,...,Tennessee,Texas,Utah,Vermont,Virgin Islands,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,Female,Fair,Age 75 to 79,1.63,84.82,32.099998,0,1,0,1,...,False,False,False,False,False,False,False,False,False,False
1,Female,Very good,Age 65 to 69,1.6,71.669998,27.99,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
2,Male,Excellent,Age 60 to 64,1.78,71.209999,22.530001,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
3,Male,Very good,Age 70 to 74,1.78,95.25,30.129999,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
4,Female,Good,Age 50 to 54,1.68,78.019997,27.76,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False


In [12]:
# verifying values in column
# reference: cc_preprocessing_solution.ipynb
data["Sex"].value_counts()

Sex
Female    123293
Male      114337
Name: count, dtype: int64

In [13]:
# getting dummies for column with objects
# reference: cc_preprocessing_solution.ipynb
Sex_dummies = pd.get_dummies(data["Sex"])
Sex_dummies.head()

Unnamed: 0,Female,Male
0,True,False
1,True,False
2,False,True
3,False,True
4,True,False


In [14]:
# concatinating dummies
# reference: cc_preprocessing_solution.ipynb
data = pd.concat([data, Sex_dummies], axis=1)
data = data.drop(columns=["Sex"])
data.head()

Unnamed: 0,GeneralHealth,AgeCategory,HeightInMeters,WeightInKilograms,BMI,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadSkinCancer,...,Utah,Vermont,Virgin Islands,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Female,Male
0,Fair,Age 75 to 79,1.63,84.82,32.099998,0,1,0,1,1,...,False,False,False,False,False,False,False,False,True,False
1,Very good,Age 65 to 69,1.6,71.669998,27.99,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
2,Excellent,Age 60 to 64,1.78,71.209999,22.530001,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
3,Very good,Age 70 to 74,1.78,95.25,30.129999,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True
4,Good,Age 50 to 54,1.68,78.019997,27.76,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False


In [15]:
# verifying column with objects
# reference: cc_preprocessing_solution.ipynb
data["AgeCategory"].value_counts()

AgeCategory
Age 65 to 69       27547
Age 60 to 64       25685
Age 70 to 74       24946
Age 55 to 59       21422
Age 50 to 54       19154
Age 75 to 79       17679
Age 80 or older    17544
Age 40 to 44       16228
Age 45 to 49       16095
Age 35 to 39       14982
Age 30 to 34       12825
Age 18 to 24       12777
Age 25 to 29       10746
Name: count, dtype: int64

In [16]:
# getting dummies for column with objects
# reference: cc_preprocessing_solution.ipynb
AgeCategory_dummies = pd.get_dummies(data["AgeCategory"])
AgeCategory_dummies.head()

Unnamed: 0,Age 18 to 24,Age 25 to 29,Age 30 to 34,Age 35 to 39,Age 40 to 44,Age 45 to 49,Age 50 to 54,Age 55 to 59,Age 60 to 64,Age 65 to 69,Age 70 to 74,Age 75 to 79,Age 80 or older
0,False,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,True,False,False,False
2,False,False,False,False,False,False,False,False,True,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False


In [17]:
# concatinating dummies
# reference: cc_preprocessing_solution.ipynb
data = pd.concat([data, AgeCategory_dummies], axis=1)
data = data.drop(columns=["AgeCategory"])
data.head()

Unnamed: 0,GeneralHealth,HeightInMeters,WeightInKilograms,BMI,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,...,Age 35 to 39,Age 40 to 44,Age 45 to 49,Age 50 to 54,Age 55 to 59,Age 60 to 64,Age 65 to 69,Age 70 to 74,Age 75 to 79,Age 80 or older
0,Fair,1.63,84.82,32.099998,0,1,0,1,1,0,...,False,False,False,False,False,False,False,False,True,False
1,Very good,1.6,71.669998,27.99,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
2,Excellent,1.78,71.209999,22.530001,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,False,False
3,Very good,1.78,95.25,30.129999,0,0,0,0,0,0,...,False,False,False,False,False,False,False,True,False,False
4,Good,1.68,78.019997,27.76,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,False,False


In [18]:
# verifying column with objects
# reference: cc_preprocessing_solution.ipynb
data["GeneralHealth"].value_counts()

GeneralHealth
Very good    83520
Good         74950
Excellent    39911
Fair         29965
Poor          9284
Name: count, dtype: int64

In [19]:
# encoding column with function for numerical values
# seth's suggestion
# reference: cc_preprocessing_solution.ipynb
def encode_GeneralHealth(GeneralHealth):
    """
    This function encodes the column named General by setting its values in a range from -1 to 1.
    """
    if GeneralHealth == "Excellent":
        return 1
    if GeneralHealth == "Very Good":
        return 0.5
    if GeneralHealth == "Good":
        return 0
    if GeneralHealth == "Fair":
        return -0.5
    if GeneralHealth == "Poor":
        return -1

data["GeneralHealth"] = data["GeneralHealth"].apply(encode_GeneralHealth)

data["GeneralHealth"].value_counts()

GeneralHealth
 0.0    74950
 1.0    39911
-0.5    29965
-1.0     9284
Name: count, dtype: int64

In [20]:
# verifying column with objects
# reference: cc_preprocessing_solution.ipynb
data["SmokerStatus"].value_counts()

SmokerStatus
Never smoked                             142390
Former smoker                             66193
Current smoker - now smokes every day     21148
Current smoker - now smokes some days      7899
Name: count, dtype: int64

In [21]:
# getting dummies for column with objects
# reference: cc_preprocessing_solution.ipynb
SmokerStatus_dummies = pd.get_dummies(data["SmokerStatus"])
SmokerStatus_dummies.head()

Unnamed: 0,Current smoker - now smokes every day,Current smoker - now smokes some days,Former smoker,Never smoked
0,False,False,True,False
1,False,False,True,False
2,False,False,False,True
3,False,False,True,False
4,False,False,False,True


In [22]:
# concatinating dummies
# reference: cc_preprocessing_solution.ipynb
data = pd.concat([data, SmokerStatus_dummies], axis=1)
data = data.drop(columns=["SmokerStatus"])
data.head()

Unnamed: 0,GeneralHealth,HeightInMeters,WeightInKilograms,BMI,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,...,Age 55 to 59,Age 60 to 64,Age 65 to 69,Age 70 to 74,Age 75 to 79,Age 80 or older,Current smoker - now smokes every day,Current smoker - now smokes some days,Former smoker,Never smoked
0,-0.5,1.63,84.82,32.099998,0,1,0,1,1,0,...,False,False,False,False,True,False,False,False,True,False
1,,1.6,71.669998,27.99,0,0,0,0,0,0,...,False,False,True,False,False,False,False,False,True,False
2,1.0,1.78,71.209999,22.530001,0,0,0,0,0,0,...,False,True,False,False,False,False,False,False,False,True
3,,1.78,95.25,30.129999,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,True,False
4,0.0,1.68,78.019997,27.76,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,True


In [23]:
# verifying column with objects
# reference: cc_preprocessing_solution.ipynb
data["ECigaretteUsage"].value_counts()

ECigaretteUsage
Never used e-cigarettes in my entire life    183446
Not at all (right now)                        41963
Use them some days                             6468
Use them every day                             5753
Name: count, dtype: int64

In [24]:
# getting dummies for column with objects
# reference: cc_preprocessing_solution.ipynb
ECigaretteUsage_dummies = pd.get_dummies(data["ECigaretteUsage"])
ECigaretteUsage_dummies.head()

Unnamed: 0,Never used e-cigarettes in my entire life,Not at all (right now),Use them every day,Use them some days
0,True,False,False,False
1,True,False,False,False
2,True,False,False,False
3,True,False,False,False
4,True,False,False,False


In [25]:
# concatinating dummies
# reference: cc_preprocessing_solution.ipynb
data = pd.concat([data, ECigaretteUsage_dummies], axis=1)
data = data.drop(columns=["ECigaretteUsage"])
data.head()

Unnamed: 0,GeneralHealth,HeightInMeters,WeightInKilograms,BMI,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,...,Age 75 to 79,Age 80 or older,Current smoker - now smokes every day,Current smoker - now smokes some days,Former smoker,Never smoked,Never used e-cigarettes in my entire life,Not at all (right now),Use them every day,Use them some days
0,-0.5,1.63,84.82,32.099998,0,1,0,1,1,0,...,True,False,False,False,True,False,True,False,False,False
1,,1.6,71.669998,27.99,0,0,0,0,0,0,...,False,False,False,False,True,False,True,False,False,False
2,1.0,1.78,71.209999,22.530001,0,0,0,0,0,0,...,False,False,False,False,False,True,True,False,False,False
3,,1.78,95.25,30.129999,0,0,0,0,0,0,...,False,False,False,False,True,False,True,False,False,False
4,0.0,1.68,78.019997,27.76,0,0,0,0,0,0,...,False,False,False,False,False,True,True,False,False,False


In [26]:
# verifying column with objects
# reference: cc_preprocessing_solution.ipynb
data["RaceEthnicityCategory"].value_counts()

RaceEthnicityCategory
White only, Non-Hispanic         179369
Hispanic                          22023
Black only, Non-Hispanic          19053
Other race only, Non-Hispanic     11802
Multiracial, Non-Hispanic          5383
Name: count, dtype: int64

In [27]:
# getting dummies for column with objects
# reference: cc_preprocessing_solution.ipynb
RaceEthnicityCategory_dummies = pd.get_dummies(data["RaceEthnicityCategory"])
RaceEthnicityCategory_dummies.head()

Unnamed: 0,"Black only, Non-Hispanic",Hispanic,"Multiracial, Non-Hispanic","Other race only, Non-Hispanic","White only, Non-Hispanic"
0,False,False,False,False,True
1,False,False,False,False,True
2,False,False,False,False,True
3,False,False,False,False,True
4,True,False,False,False,False


In [28]:
# concatinating dummies
# reference: cc_preprocessing_solution.ipynb
data = pd.concat([data, RaceEthnicityCategory_dummies], axis=1)
data = data.drop(columns=["RaceEthnicityCategory"])
data.head()

Unnamed: 0,GeneralHealth,HeightInMeters,WeightInKilograms,BMI,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,...,Never smoked,Never used e-cigarettes in my entire life,Not at all (right now),Use them every day,Use them some days,"Black only, Non-Hispanic",Hispanic,"Multiracial, Non-Hispanic","Other race only, Non-Hispanic","White only, Non-Hispanic"
0,-0.5,1.63,84.82,32.099998,0,1,0,1,1,0,...,False,True,False,False,False,False,False,False,False,True
1,,1.6,71.669998,27.99,0,0,0,0,0,0,...,False,True,False,False,False,False,False,False,False,True
2,1.0,1.78,71.209999,22.530001,0,0,0,0,0,0,...,True,True,False,False,False,False,False,False,False,True
3,,1.78,95.25,30.129999,0,0,0,0,0,0,...,False,True,False,False,False,False,False,False,False,True
4,0.0,1.68,78.019997,27.76,0,0,0,0,0,0,...,True,True,False,False,False,True,False,False,False,False


In [29]:
# verifying column with objects
# reference: cc_preprocessing_solution.ipynb
data["TetanusLast10Tdap"].value_counts()

TetanusLast10Tdap
No, did not receive any tetanus shot in the past 10 years    79370
Yes, received tetanus shot but not sure what type            71538
Yes, received Tdap                                           67418
Yes, received tetanus shot, but not Tdap                     19304
Name: count, dtype: int64

In [30]:
# getting dummies for column with objects
# reference: cc_preprocessing_solution.ipynb
TetanusLast10Tdap_dummies = pd.get_dummies(data["TetanusLast10Tdap"])
TetanusLast10Tdap_dummies.head()

Unnamed: 0,"No, did not receive any tetanus shot in the past 10 years","Yes, received Tdap","Yes, received tetanus shot but not sure what type","Yes, received tetanus shot, but not Tdap"
0,True,False,False,False
1,False,True,False,False
2,False,False,True,False
3,False,False,True,False
4,True,False,False,False


In [31]:
# concatinating dummies
# reference: cc_preprocessing_solution.ipynb
data = pd.concat([data, TetanusLast10Tdap_dummies], axis=1)
data = data.drop(columns=["TetanusLast10Tdap"])
data.head()

Unnamed: 0,GeneralHealth,HeightInMeters,WeightInKilograms,BMI,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,...,Use them some days,"Black only, Non-Hispanic",Hispanic,"Multiracial, Non-Hispanic","Other race only, Non-Hispanic","White only, Non-Hispanic","No, did not receive any tetanus shot in the past 10 years","Yes, received Tdap","Yes, received tetanus shot but not sure what type","Yes, received tetanus shot, but not Tdap"
0,-0.5,1.63,84.82,32.099998,0,1,0,1,1,0,...,False,False,False,False,False,True,True,False,False,False
1,,1.6,71.669998,27.99,0,0,0,0,0,0,...,False,False,False,False,False,True,False,True,False,False
2,1.0,1.78,71.209999,22.530001,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,True,False
3,,1.78,95.25,30.129999,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,True,False
4,0.0,1.68,78.019997,27.76,0,0,0,0,0,0,...,False,True,False,False,False,False,True,False,False,False


In [32]:
# dropping any null values
# latifah's help
data = data.dropna()
data.head()

Unnamed: 0,GeneralHealth,HeightInMeters,WeightInKilograms,BMI,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,...,Use them some days,"Black only, Non-Hispanic",Hispanic,"Multiracial, Non-Hispanic","Other race only, Non-Hispanic","White only, Non-Hispanic","No, did not receive any tetanus shot in the past 10 years","Yes, received Tdap","Yes, received tetanus shot but not sure what type","Yes, received tetanus shot, but not Tdap"
0,-0.5,1.63,84.82,32.099998,0,1,0,1,1,0,...,False,False,False,False,False,True,True,False,False,False
2,1.0,1.78,71.209999,22.530001,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,True,False
4,0.0,1.68,78.019997,27.76,0,0,0,0,0,0,...,False,True,False,False,False,False,True,False,False,False
6,0.0,1.7,74.839996,25.84,0,0,0,0,0,0,...,False,True,False,False,False,False,True,False,False,False
7,-0.5,1.7,87.540001,30.23,0,0,0,0,0,0,...,False,True,False,False,False,False,True,False,False,False


In [33]:
# exporting file
# syntax from floris c.
data.to_csv(PATHtopreprocesseddataset, index=False)

In [4]:
# reading in file
# syntax from floris
data = pd.read_csv(PATHtopreprocesseddataset)
# verifying
data.head()

Unnamed: 0,GeneralHealth,HeightInMeters,WeightInKilograms,BMI,HadHeartAttack,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,...,Use them some days,"Black only, Non-Hispanic",Hispanic,"Multiracial, Non-Hispanic","Other race only, Non-Hispanic","White only, Non-Hispanic","No, did not receive any tetanus shot in the past 10 years","Yes, received Tdap","Yes, received tetanus shot but not sure what type","Yes, received tetanus shot, but not Tdap"
0,-0.5,1.63,84.82,32.099998,0,1,0,1,1,0,...,False,False,False,False,False,True,True,False,False,False
1,1.0,1.78,71.209999,22.530001,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,True,False
2,0.0,1.68,78.019997,27.76,0,0,0,0,0,0,...,False,True,False,False,False,False,True,False,False,False
3,0.0,1.7,74.839996,25.84,0,0,0,0,0,0,...,False,True,False,False,False,False,True,False,False,False
4,-0.5,1.7,87.540001,30.23,0,0,0,0,0,0,...,False,True,False,False,False,False,True,False,False,False


In [5]:
# Prepare features (X) and labels (y)
# Replace 'label_column' with the actual column name of the target variable in your dataset
X = data.drop(columns=['HadDiabetes'])  # Features
y = data['HadDiabetes']  # Labels

In [6]:
# Split the data into training and testing sets
# "stratify=y" was instructor's suggestion.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [9]:
# creating StandardScaler's instance
# reference: random_forest_solution.ipynb
scaler = StandardScaler()

In [10]:
# fitting
# reference: random_forest_solution.ipynb
X_scaler = scaler.fit(X_train)

In [11]:
# scaling
# reference: random_forest_solution.ipynb
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Create the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [13]:
# fitting
# reference: random_forest_solution.ipynb
rf_classifier = rf_classifier.fit(X_train_scaled, y_train)

In [14]:
# Make predictions
# reference: random_forest_solution.ipynb
y_pred = rf_classifier.predict(X_test_scaled)

In [15]:
# Evaluate the model
# reference: random_forest_solution.ipynb
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actually No Diabetes", "Actually Diabetes"], columns=["Predicted No Diabetes", "Predicted Diabetes"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, y_pred)

In [16]:
# displaying results
# reference: random_forest_solution.ipynb
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted No Diabetes,Predicted Diabetes
Actually No Diabetes,30925,689
Actually Diabetes,6019,895


Accuracy Score : 0.8258928571428571
Classification Report
              precision    recall  f1-score   support

           0       0.84      0.98      0.90     31614
           1       0.57      0.13      0.21      6914

    accuracy                           0.83     38528
   macro avg       0.70      0.55      0.56     38528
weighted avg       0.79      0.83      0.78     38528



In [17]:
# feature importances
# reference: random_forest_solution.ipynb
importances = rf_classifier.feature_importances_
sorted(zip(rf_classifier.feature_importances_, X.columns), reverse=True)

[(0.1019971395305954, 'BMI'),
 (0.08291426323771019, 'WeightInKilograms'),
 (0.06015206450065106, 'HeightInMeters'),
 (0.04583172480872557, 'GeneralHealth'),
 (0.023744785908072608, 'PneumoVaxEver'),
 (0.018339763890747682, 'DifficultyWalking'),
 (0.017075103762661983, 'AlcoholDrinkers'),
 (0.016684630746912992, 'HadArthritis'),
 (0.016301045902460728, 'HadKidneyDisease'),
 (0.016131205404711916, 'CovidPos'),
 (0.01578660307808145, 'HIVTesting'),
 (0.015439841571712515, 'ChestScan'),
 (0.014779624991511215, 'FluVaxLast12'),
 (0.014386846638391609,
  'No, did not receive any tetanus shot in the past 10 years'),
 (0.014358218978262243, 'Yes, received tetanus shot but not sure what type'),
 (0.013396867704596001, 'HadDepressiveDisorder'),
 (0.013152714414801972, 'Never smoked'),
 (0.012585384875602441, 'Former smoker'),
 (0.012557625675226922, 'Yes, received Tdap'),
 (0.012506710760942875, 'HadAsthma'),
 (0.011630600916468227, 'HadAngina'),
 (0.0108409920210849, 'HadHeartAttack'),
 (0.010

In [22]:
# better visualization
# reference: random_forest_solution.ipynb
importances_df = pd.DataFrame(sorted(zip(rf_classifier.feature_importances_, X.columns), reverse=True))
importances_df.head()

Unnamed: 0,0,1
0,0.101997,BMI
1,0.082914,WeightInKilograms
2,0.060152,HeightInMeters
3,0.045832,GeneralHealth
4,0.023745,PneumoVaxEver


In [25]:
# exporting file
# syntax from floris c.
importances_df.to_csv(PATHtofeatureimportances, index=False)