In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle


In [24]:
# Load the dataset
data = pd.read_csv(r"heartnew.csv")

In [26]:
# Display the first few rows
print(data.head())


   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   55    0   1       110   305    1        0      162      0      2.2      0   
1   45    1   1       155   308    1        2      147      0      1.2      1   
2   73    0   2       192   219    1        0      173      1      3.4      1   
3   55    1   3       136   278    0        0      128      1      1.2      2   
4   58    1   3       160   172    0        2      142      0      1.5      2   

   ca  thal  target  
0   0     2       0  
1   4     3       1  
2   1     2       0  
3   4     0       1  
4   0     0       0  


In [28]:
# Check dataset dimensions
print("Dataset shape:", data.shape)


Dataset shape: (25000, 14)


In [30]:
# Information about the dataset
print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       25000 non-null  int64  
 1   sex       25000 non-null  int64  
 2   cp        25000 non-null  int64  
 3   trestbps  25000 non-null  int64  
 4   chol      25000 non-null  int64  
 5   fbs       25000 non-null  int64  
 6   restecg   25000 non-null  int64  
 7   thalach   25000 non-null  int64  
 8   exang     25000 non-null  int64  
 9   oldpeak   25000 non-null  float64
 10  slope     25000 non-null  int64  
 11  ca        25000 non-null  int64  
 12  thal      25000 non-null  int64  
 13  target    25000 non-null  int64  
dtypes: float64(1), int64(13)
memory usage: 2.7 MB
None


In [32]:
# Check for missing values
print("Missing values in each column:\n", data.isnull().sum())


Missing values in each column:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [34]:
# Statistical summary
print("Statistical summary:\n", data.describe())


Statistical summary:
                 age          sex            cp      trestbps          chol  \
count  25000.000000  25000.00000  25000.000000  25000.000000  25000.000000   
mean      52.969440      0.49676      1.502480    136.493840    250.567200   
std       14.170349      0.50000      1.123657     25.694726     63.022219   
min       29.000000      0.00000      0.000000     94.000000    126.000000   
25%       41.000000      0.00000      0.000000    117.000000    206.000000   
50%       53.000000      0.00000      2.000000    134.000000    245.000000   
75%       65.000000      1.00000      3.000000    154.000000    290.000000   
max       77.000000      1.00000      3.000000    200.000000    564.000000   

                fbs       restecg       thalach         exang       oldpeak  \
count  25000.000000  25000.000000  25000.000000  25000.000000  25000.000000   
mean       0.494240      0.992800    144.619080      0.495880      2.180816   
std        0.499977      0.819009     

In [36]:
# Distribution of the target variable
print("Target variable distribution:\n", data['target'].value_counts())

Target variable distribution:
 target
0    12582
1    12418
Name: count, dtype: int64


In [38]:
data.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
24995,76,0,3,134,180,1,1,145,1,1.4,1,4,0,0
24996,40,0,1,105,294,1,0,95,0,2.9,2,4,0,1
24997,31,1,0,106,237,0,2,117,0,0.5,1,3,0,0
24998,77,1,3,136,187,0,1,150,1,0.5,2,1,3,0
24999,46,0,3,172,306,1,0,129,0,0.8,2,4,0,1


In [40]:
# Separate features and target
features = data.drop(columns='target', axis=1)
target = data['target']

In [42]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, stratify=target, random_state=2
)

print(f"Original dataset size: {features.shape}")
print(f"Training dataset size: {X_train.shape}")
print(f"Testing dataset size: {X_test.shape}")


Original dataset size: (25000, 13)
Training dataset size: (20000, 13)
Testing dataset size: (5000, 13)


In [44]:
# Train a logistic regression model
model = LogisticRegression()

In [46]:
# Train the model on the training data
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
# Evaluate the model on the training data
train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions) * 100
print(f"Accuracy on training data: {train_accuracy:.2f}%")

Accuracy on training data: 50.93%


In [50]:
# Evaluate the model on the testing data
test_predictions = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions) * 100
print(f"Accuracy on testing data: {test_accuracy:.2f}%")

Accuracy on testing data: 48.56%


In [52]:
# Predict for a single input
sample_input = (45, 1, 1, 155, 308, 1, 2, 147, 0, 1.2, 1, 4, 3)
sample_input_array = np.asarray(sample_input).reshape(1, -1)

sample_prediction = model.predict(sample_input_array)
if sample_prediction[0] == 0:
    print("The person does NOT have heart disease.")
else:
    print("The person has heart disease.")

The person has heart disease.




In [54]:
# Save the trained model
model_file_path = r"hearttrained_modelfinal.sav"
pickle.dump(model, open(model_file_path, 'wb'))
print(f"Model saved to {model_file_path}")

Model saved to hearttrained_modelfinal.sav


In [56]:
# Load the saved model
loaded_model = pickle.load(open(model_file_path, 'rb'))

In [58]:
# Test the loaded model with the same sample input
loaded_model_prediction = loaded_model.predict(sample_input_array)
if loaded_model_prediction[0] == 0:
    print("The person does NOT have heart disease (Loaded Model).")
else:
    print("The person has heart disease (Loaded Model).")


The person has heart disease (Loaded Model).


