Binary Classification problem to detect the severity of getting heart disease  

Features: 13  
age [num]: age in years   
sex [cat]: sex (1 = male; 0 = female)  
cp [ord]: chest pain type  
    -- Value 1: typical angina  
    -- Value 2: atypical angina  
    -- Value 3: non-anginal pain  
    -- Value 4: asymptomatic  
trestbps [num]: resting blood pressure (in mm Hg on admission to the hospital)  
chol [num]: serum cholestoral in mg/dl  
fbs [cat]: (fasting blood sugar > 120 mg/dl)  (1 = true; 0 = false)   
restecg [ord]: resting electrocardiographic results  
           -- Value 0: normal  
           -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST   
                       elevation or depression of > 0.05 mV)  
           -- Value 2: showing probable or definite left ventricular hypertrophy  
                       by Estes' criteria   
thalach [num]: maximum heart rate achieved   
exang [cat]: exercise induced angina (1 = yes; 0 = no)   
oldpeak [num]: ST depression induced by exercise relative to rest  
slope [ord]: the slope of the peak exercise ST segment  
         -- Value 1: upsloping  
         -- Value 2: flat  
         -- Value 3: downsloping  
ca [ord]: number of major vessels (0-3) colored by flourosopy  
thal [cat]: 3 = normal; 6 = fixed defect; 7 = reversable defect  
    
Target:   
sever : heart disease severity (1 = sever, 0: not sever)  

In [120]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [121]:
# Reading the data
h = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'sever']
df = pd.read_csv('heart_disease_classification.data', encoding_errors='ignore', names=h)
df.loc[df["sever"]>0, "sever"]=1

In [122]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,sever
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [123]:
df.nunique(axis=0)

age          41
sex           2
cp            4
trestbps     50
chol        152
fbs           2
restecg       3
thalach      91
exang         2
oldpeak      40
slope         3
ca            5
thal          4
sever         2
dtype: int64

In [124]:
df.value_counts(['sever'])

sever
0        164
1        139
dtype: int64

In [140]:
df.isna().sum()
df = df.drop(columns = ["thal","ca"])

In [141]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,sever
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,1
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1


In [142]:
#Find correlation

In [143]:
df = df.fillna(0)

In [144]:
#Normalization

In [145]:
#test and train 25%,75%

In [146]:
y = df["sever"]
X = df.drop(columns = ["sever"])
y.head

<bound method NDFrame.head of 0      0
1      1
2      1
3      0
4      0
      ..
298    1
299    1
300    1
301    1
302    0
Name: sever, Length: 303, dtype: int64>

In [147]:
X.head

<bound method NDFrame.head of       age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0    63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1    67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2    67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3    37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4    41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...   
298  45.0  1.0  1.0     110.0  264.0  0.0      0.0    132.0    0.0      1.2   
299  68.0  1.0  4.0     144.0  193.0  1.0      0.0    141.0    0.0      3.4   
300  57.0  1.0  4.0     130.0  131.0  0.0      0.0    115.0    1.0      1.2   
301  57.0  0.0  2.0     130.0  236.0  0.0      2.0    174.0    0.0      0.0   
302  38.0  1.0  3.0     138.0  175.0  0.0      0.0    173.0    0.0      0.0   

     slope  
0      3

In [148]:
df.dtypes

age         float64
sex         float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
sever         int64
dtype: object

In [149]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state = 1)
X_train.shape

(227, 11)

In [150]:
X_test.shape


(76, 11)

In [151]:
#Recall, F-1 score
df.replace("?", str(0))

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,sever
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,1
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1


In [152]:
"""
drop_enc = OneHotEncoder(drop='thal').fit(X)
drop_enc.categories_
[array(['3.0', '6.0','7.0'], dtype=object), array([1, 2, 3], dtype=object)]
drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()
array([[0., 0., 0.],
       [1., 1., 0.]])
"""

"\ndrop_enc = OneHotEncoder(drop='thal').fit(X)\ndrop_enc.categories_\n[array(['3.0', '6.0','7.0'], dtype=object), array([1, 2, 3], dtype=object)]\ndrop_enc.transform([['Female', 1], ['Male', 2]]).toarray()\narray([[0., 0., 0.],\n       [1., 1., 0.]])\n"

In [153]:
#Logistic regression,K-NN,DT(hyper tuning)

In [154]:
model = LogisticRegression(random_state = 1)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [156]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.85      0.81        41
           1       0.81      0.71      0.76        35

    accuracy                           0.79        76
   macro avg       0.79      0.78      0.79        76
weighted avg       0.79      0.79      0.79        76

