In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [2]:
train = pd.read_csv(r"C:\Users\hi\Desktop\Github Repositories\heart-disease-analysis-prediction\Data\train.csv")
test = pd.read_csv(r"C:\Users\hi\Desktop\Github Repositories\heart-disease-analysis-prediction\Data\test.csv")

In [3]:
train.head()

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence


In [4]:
X = train.drop(columns=['id','Heart Disease'])
y = train['Heart Disease']
y = y.map({'Presence':1, 'Absence':0})

In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Age                      630000 non-null  int64  
 1   Sex                      630000 non-null  int64  
 2   Chest pain type          630000 non-null  int64  
 3   BP                       630000 non-null  int64  
 4   Cholesterol              630000 non-null  int64  
 5   FBS over 120             630000 non-null  int64  
 6   EKG results              630000 non-null  int64  
 7   Max HR                   630000 non-null  int64  
 8   Exercise angina          630000 non-null  int64  
 9   ST depression            630000 non-null  float64
 10  Slope of ST              630000 non-null  int64  
 11  Number of vessels fluro  630000 non-null  int64  
 12  Thallium                 630000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 62.5 MB


In [6]:
preprocessing = ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),X.columns)
    ]
)

In [7]:
classifier = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=42, n_jobs=-1, )
model = Pipeline(
    steps=[
        ('preprocessing',preprocessing),
        ('clf', classifier)
    ]
)

In [8]:
print(train.shape)
print(X.shape)
print(y.shape)

(630000, 15)
(630000, 13)
(630000,)


In [9]:
model.fit(X,y)

In [10]:
pred = model.predict(test)

In [11]:
subm = pd.DataFrame({
    'id':test.id,
    'Heart Disease': pred
}
   )

In [12]:
subm.to_csv('random_forest.csv',index=False)

In [13]:
pred.shape

(270000,)

In [14]:
# getting importante features and placing them in dataframe
importances = model.named_steps['clf'].feature_importances_
feature_names = X.columns

In [15]:
feat_importance = pd.DataFrame({
    'feature': feature_names,
    'importance':importances
})

In [16]:
feat_importance = feat_importance.sort_values('importance', ascending=False).reset_index()

In [17]:
feat_importance.drop(columns=['level_0','index'], inplace=True)


KeyError: "['level_0'] not found in axis"

In [None]:
feat_importance

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(5,4))
sns.barplot(data=feat_importance, x='feature', y='importance', color='teal')
plt.tight_layout()
plt.xticks(rotation=80)
plt.title('Feature Importance Visualization')
plt.show()