In [49]:
pip install streamlit




In [50]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,confusion_matrix

In [51]:
data=pd.read_csv('/content/Heart_Disease_Dataset.csv')

In [52]:

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100201 entries, 0 to 100200
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      100201 non-null  object 
 1   BMI               100201 non-null  float64
 2   Smoking           100201 non-null  object 
 3   AlcoholDrinking   100201 non-null  object 
 4   Stroke            100201 non-null  object 
 5   PhysicalHealth    100201 non-null  int64  
 6   MentalHealth      100201 non-null  int64  
 7   DiffWalking       100201 non-null  object 
 8   Sex               100201 non-null  object 
 9   AgeCategory       100201 non-null  object 
 10  Race              100201 non-null  object 
 11  Diabetic          100201 non-null  object 
 12  PhysicalActivity  100201 non-null  object 
 13  GenHealth         100201 non-null  object 
 14  SleepTime         100201 non-null  float64
 15  Asthma            100201 non-null  object 
 16  KidneyDisease     10

In [53]:
data.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,100201.0,100201.0,100201.0,100201.0
mean,53.373302,14.503937,14.498239,12.490389
std,23.878712,8.640423,8.658639,6.632122
min,12.02,0.0,0.0,1.0
25%,32.76,7.0,7.0,6.7
50%,53.38,14.0,15.0,12.5
75%,73.95,22.0,22.0,18.2
max,94.85,29.0,29.0,24.0


In [54]:
data.isnull().sum()


Unnamed: 0,0
HeartDisease,0
BMI,0
Smoking,0
AlcoholDrinking,0
Stroke,0
PhysicalHealth,0
MentalHealth,0
DiffWalking,0
Sex,0
AgeCategory,0


In [55]:
(data.duplicated()).sum()

0

In [56]:
data.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [57]:
for column in data.columns:
 print( data[column].value_counts)


<bound method IndexOpsMixin.value_counts of 0         Yes
1         Yes
2          No
3         Yes
4         Yes
         ... 
100196     No
100197    Yes
100198     No
100199     No
100200     No
Name: HeartDisease, Length: 100201, dtype: object>
<bound method IndexOpsMixin.value_counts of 0         14.96
1         36.93
2         18.70
3         31.43
4         75.64
          ...  
100196    48.02
100197    82.88
100198    39.46
100199    46.25
100200    39.26
Name: BMI, Length: 100201, dtype: float64>
<bound method IndexOpsMixin.value_counts of 0         Yes
1         Yes
2         Yes
3         Yes
4          No
         ... 
100196     No
100197     No
100198     No
100199    Yes
100200    Yes
Name: Smoking, Length: 100201, dtype: object>
<bound method IndexOpsMixin.value_counts of 0         Yes
1         Yes
2          No
3         Yes
4          No
         ... 
100196     No
100197     No
100198    Yes
100199    Yes
100200     No
Name: AlcoholDrinking, Length: 100201, dtype: 

In [58]:
data.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,Yes,14.96,Yes,Yes,No,7,6,Yes,Female,25-29,White,Yes,Yes,Fair,17.1,Yes,Yes,Yes
1,Yes,36.93,Yes,Yes,No,3,10,No,Female,30-34,Hispanic,"No, borderline diabetes",Yes,Poor,2.7,Yes,No,No
2,No,18.7,Yes,No,Yes,26,15,No,Male,70-74,Hispanic,No,No,Poor,15.6,Yes,Yes,Yes
3,Yes,31.43,Yes,Yes,Yes,24,20,No,Female,40-44,American Indian/Alaskan Native,"No, borderline diabetes",No,Fair,1.4,No,No,Yes
4,Yes,75.64,No,No,Yes,2,29,No,Male,35-39,White,No,Yes,Excellent,8.8,Yes,Yes,Yes


In [59]:
binary_columns=['Smoking','AlcoholDrinking','Stroke','DiffWalking','PhysicalActivity','Asthma','SkinCancer']
for column in binary_columns:
 data[column]=data[column].map({'Yes':1,'No':0})

In [60]:
# Check the actual column names in the DataFrame
print(data.columns)

# Ensure the columns are present before applying get_dummies
# If column names are different, update the list:
columns_to_encode = ['AgeCategory', 'Race', 'Diabetic', 'Sex', 'GenHealth', 'PhysicalHealth', 'MentalHealth']  # Updated 'sex' to 'Sex'

# Apply get_dummies only to existing columns
data = pd.get_dummies(data, columns=columns_to_encode)
print(data.head())

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')
  HeartDisease    BMI  Smoking  AlcoholDrinking  Stroke  DiffWalking  \
0          Yes  14.96        1                1       0            1   
1          Yes  36.93        1                1       0            0   
2           No  18.70        1                0       1            0   
3          Yes  31.43        1                1       1            0   
4          Yes  75.64        0                0       1            0   

   PhysicalActivity  SleepTime  Asthma KidneyDisease  ...  MentalHealth_20  \
0                 1       17.1       1           Yes  ...            False   
1                 1        2.7       1            No  ...            False   
2                 0       15.6       1   

In [61]:
x=data.drop(columns=['HeartDisease'])
y=data['HeartDisease']

In [62]:
y.head()

Unnamed: 0,HeartDisease
0,Yes
1,Yes
2,No
3,Yes
4,Yes


In [63]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [65]:
clf = DecisionTreeClassifier(max_depth=5,criterion = "gini",random_state=42)
clf.fit(x_train,y_train)

ValueError: could not convert string to float: 'Yes'