In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer  
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [54]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [55]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
heart_disease = fetch_ucirepo(id=45) 
  
# data (as pandas dataframes) 
X = heart_disease.data.features 
y = heart_disease.data.targets 
  
# metadata 
print(heart_disease.metadata) 
  
# variable information 
print(heart_disease.variables) 


{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'ID': 231, 'type': 'NATIVE', 'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M

In [58]:
df = pd.DataFrame(X, columns=heart_disease.data.feature_names)
df['target']=y
print(df.head())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   1       145   233    1        2      150      0      2.3      3   
1   67    1   4       160   286    0        2      108      1      1.5      2   
2   67    1   4       120   229    0        2      129      1      2.6      2   
3   37    1   3       130   250    0        0      187      0      3.5      3   
4   41    0   2       130   204    0        2      172      0      1.4      1   

    ca  thal  target  
0  0.0   6.0       0  
1  3.0   3.0       2  
2  2.0   7.0       1  
3  0.0   3.0       0  
4  0.0   3.0       0  


In [66]:
# 1. Handle missing values (if any)
# SimpleImputer with strategy 'mean' to fill missing values for numerical columns
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df.drop(columns='target')), columns=df.drop(columns='target').columns)

# Add the target column back
df_imputed['target'] = df['target']

# Check if there are any missing values left
print(f"Missing values after imputation:\n{df_imputed.isnull().sum()}")

# 2. Encode categorical variables (if any)
# Check for categorical columns
categorical_cols = df_imputed.select_dtypes(include=['object']).columns
print(f"Categorical columns: {categorical_cols}")

# If categorical columns exist, use LabelEncoder or OneHotEncoder
# Example: Encoding with LabelEncoder
label_encoder = LabelEncoder()
for col in categorical_cols:
    df_imputed[col] = label_encoder.fit_transform(df_imputed[col])

# 3. Standardize numerical features
# Extract the numerical features (excluding the target column)
numerical_cols = df_imputed.select_dtypes(include=[np.number]).columns
numerical_cols = numerical_cols.drop('target')  # Exclude target

# Standardize numerical features
scaler = StandardScaler()
df_imputed[numerical_cols] = scaler.fit_transform(df_imputed[numerical_cols])

# 4. Split the dataset into training and test sets
X = df_imputed.drop(columns='target')
y = df_imputed['target']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shape of training and testing sets
print(f"Training data shape: {X_train.shape}, Testing data shape: {X_test.shape}")


Missing values after imputation:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64
Categorical columns: Index([], dtype='object')
Training data shape: (242, 13), Testing data shape: (61, 13)


In [62]:
# Checking for missing values after imputation
print(f"Missing values after imputation:\n{df_imputed.isnull().sum()}")

# Check the first few rows of the preprocessed data
print(f"\nPreprocessed data preview:\n{df_imputed.head()}")

# Check the statistical summary of the dataset
print(f"\nStatistical Summary of the data:\n{df_imputed.describe()}")

Missing values after imputation:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

Preprocessed data preview:
        age       sex        cp  trestbps      chol       fbs   restecg  \
0  0.948726  0.686202 -2.251775  0.757525 -0.264900  2.394438  1.016684   
1  1.392002  0.686202  0.877985  1.611220  0.760415 -0.417635  1.016684   
2  1.392002  0.686202  0.877985 -0.665300 -0.342283 -0.417635  1.016684   
3 -1.932564  0.686202 -0.165268 -0.096170  0.063974 -0.417635 -0.996749   
4 -1.489288 -1.457296 -1.208521 -0.096170 -0.825922 -0.417635  1.016684   

    thalach     exang   oldpeak     slope        ca      thal  target  
0  0.017197 -0.696631  1.087338  2.274579 -0.723095  0.655818       0  
1 -1.821905  1.435481  0.397182  0.649113  2.503851 -0.898522       2  
2 -0.902354  1.435481  1.346147  0.649113  1.428203  1.173931  

In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Logistic Regression model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

Accuracy: 0.5410
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.90      0.85        29
           1       0.38      0.25      0.30        12
           2       0.22      0.22      0.22         9
           3       0.17      0.29      0.21         7
           4       0.00      0.00      0.00         4

    accuracy                           0.54        61
   macro avg       0.32      0.33      0.32        61
weighted avg       0.51      0.54      0.52        61



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
print(y_train.value_counts())
print(y_test.value_counts())

target
0    135
1     43
3     28
2     27
4      9
Name: count, dtype: int64
target
0    29
1    12
2     9
3     7
4     4
Name: count, dtype: int64


In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Fit the model on the training data
log_reg.fit(X_train, y_train)

# Make predictions
log_reg_preds = log_reg.predict(X_test)

# Evaluate the model
print("Logistic Regression Evaluation:")
print(classification_report(y_test, log_reg_preds))
print(confusion_matrix(y_test, log_reg_preds))


Logistic Regression Evaluation:
              precision    recall  f1-score   support

           0       0.81      0.90      0.85        29
           1       0.38      0.25      0.30        12
           2       0.22      0.22      0.22         9
           3       0.17      0.29      0.21         7
           4       0.00      0.00      0.00         4

    accuracy                           0.54        61
   macro avg       0.32      0.33      0.32        61
weighted avg       0.51      0.54      0.52        61

[[26  1  2  0  0]
 [ 3  3  3  3  0]
 [ 2  1  2  4  0]
 [ 1  2  2  2  0]
 [ 0  1  0  3  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [76]:
log_reg = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')

In [78]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load data (assuming 'df' is the loaded dataset)
# df = pd.read_csv('your_dataset.csv')  # Make sure to load your dataset

# 1. Handle missing values (if any)
# SimpleImputer with strategy 'mean' to fill missing values for numerical columns
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df.drop(columns='target')), columns=df.drop(columns='target').columns)

# Add the target column back
df_imputed['target'] = df['target']

# Check if there are any missing values left
print(f"Missing values after imputation:\n{df_imputed.isnull().sum()}")

# 2. Encode categorical variables (if any)
# Check for categorical columns
categorical_cols = df_imputed.select_dtypes(include=['object']).columns
print(f"Categorical columns: {categorical_cols}")

# If categorical columns exist, use LabelEncoder or OneHotEncoder
# Example: Encoding with LabelEncoder
label_encoder = LabelEncoder()
for col in categorical_cols:
    df_imputed[col] = label_encoder.fit_transform(df_imputed[col])

# 3. Standardize numerical features
# Extract the numerical features (excluding the target column)
numerical_cols = df_imputed.select_dtypes(include=[np.number]).columns
numerical_cols = numerical_cols.drop('target')  # Exclude target

# Standardize numerical features
scaler = StandardScaler()
df_imputed[numerical_cols] = scaler.fit_transform(df_imputed[numerical_cols])

# 4. Split the dataset into training and test sets
X = df_imputed.drop(columns='target')
y = df_imputed['target']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shape of training and testing sets
print(f"Training data shape: {X_train.shape}, Testing data shape: {X_test.shape}")

# 5. Logistic Regression Model with class weighting
log_reg = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')

# Train the model
log_reg.fit(X_train, y_train)

# 6. Model Evaluation
y_pred = log_reg.predict(X_test)

# Output evaluation metrics
print("Logistic Regression Evaluation:")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Missing values after imputation:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64
Categorical columns: Index([], dtype='object')
Training data shape: (242, 13), Testing data shape: (61, 13)
Logistic Regression Evaluation:
              precision    recall  f1-score   support

           0       0.88      0.76      0.81        29
           1       0.15      0.17      0.16        12
           2       0.14      0.11      0.12         9
           3       0.17      0.29      0.21         7
           4       0.25      0.25      0.25         4

    accuracy                           0.46        61
   macro avg       0.32      0.31      0.31        61
weighted avg       0.51      0.46      0.48        61

[[22  5  2  0  0]
 [ 3  2  2  4  1]
 [ 0  3  1  3  2]
 [ 0  3  2  2  0]
 [ 0  0  0  3  1]]


In [80]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Initialize the model
rf_clf = RandomForestClassifier(random_state=42, class_weight='balanced')

# Train the model on the training data
rf_clf.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_clf.predict(X_test)

# Evaluate the Random Forest model
print("Random Forest Evaluation:")
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))

# Optional: Cross-validation score for model robustness
cv_scores = cross_val_score(rf_clf, X, y, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy scores: {cv_scores}")
print(f"Mean cross-validation accuracy: {cv_scores.mean()}")


Random Forest Evaluation:
              precision    recall  f1-score   support

           0       0.74      0.97      0.84        29
           1       0.00      0.00      0.00        12
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         4

    accuracy                           0.46        61
   macro avg       0.15      0.19      0.17        61
weighted avg       0.35      0.46      0.40        61

[[28  0  1  0  0]
 [ 6  0  4  2  0]
 [ 3  2  0  4  0]
 [ 1  5  1  0  0]
 [ 0  2  1  1  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cross-validation accuracy scores: [0.60655738 0.60655738 0.55737705 0.56666667 0.55      ]
Mean cross-validation accuracy: 0.577431693989071


In [82]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the data
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Train the model on the resampled data
rf_clf.fit(X_train_res, y_train_res)

# Evaluate the Random Forest model on the test data
y_pred_rf_res = rf_clf.predict(X_test)
print("Random Forest with SMOTE Evaluation:")
print(classification_report(y_test, y_pred_rf_res))
print(confusion_matrix(y_test, y_pred_rf_res))


Random Forest with SMOTE Evaluation:
              precision    recall  f1-score   support

           0       0.82      0.93      0.87        29
           1       0.12      0.08      0.10        12
           2       0.11      0.11      0.11         9
           3       0.20      0.29      0.24         7
           4       0.00      0.00      0.00         4

    accuracy                           0.51        61
   macro avg       0.25      0.28      0.26        61
weighted avg       0.45      0.51      0.48        61

[[27  1  1  0  0]
 [ 5  1  3  3  0]
 [ 1  3  1  3  1]
 [ 0  1  4  2  0]
 [ 0  2  0  2  0]]


In [91]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_res, y_train_res)

# Print the best parameters and score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")


Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation score: 0.9199999999999999


In [102]:
# Using the best parameters
rf_clf_best = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42)

# Train the model
rf_clf_best.fit(X_train_res, y_train_res)

# Evaluate the model on the test set
y_pred_best = rf_clf_best.predict(X_test)
print("Random Forest with Best Parameters Evaluation:")
print(classification_report(y_test, y_pred_best))
print(confusion_matrix(y_test, y_pred_best))


Random Forest with Best Parameters Evaluation:
              precision    recall  f1-score   support

           0       0.82      0.93      0.87        29
           1       0.10      0.08      0.09        12
           2       0.22      0.22      0.22         9
           3       0.25      0.29      0.27         7
           4       0.00      0.00      0.00         4

    accuracy                           0.52        61
   macro avg       0.28      0.30      0.29        61
weighted avg       0.47      0.52      0.50        61

[[27  1  1  0  0]
 [ 5  1  3  2  1]
 [ 1  4  2  2  0]
 [ 0  2  3  2  0]
 [ 0  2  0  2  0]]


In [104]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(rf_clf_best, X_train_res, y_train_res, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation accuracy: {cv_scores.mean()}")


Cross-validation scores: [0.86666667 0.91111111 0.91111111 0.97037037 0.94074074]
Mean cross-validation accuracy: 0.9199999999999999


In [106]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Create the Random Forest model with class_weight='balanced'
rf_clf_best_weighted = RandomForestClassifier(n_estimators=200, max_depth=None,
                                              min_samples_split=2, min_samples_leaf=1,
                                              class_weight='balanced', random_state=42)

# Train the model on the resampled data (train set)
rf_clf_best_weighted.fit(X_train_res, y_train_res)

# Make predictions on the test set
y_pred_weighted = rf_clf_best_weighted.predict(X_test)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred_weighted))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_weighted))


Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.93      0.87        29
           1       0.10      0.08      0.09        12
           2       0.22      0.22      0.22         9
           3       0.25      0.29      0.27         7
           4       0.00      0.00      0.00         4

    accuracy                           0.52        61
   macro avg       0.28      0.30      0.29        61
weighted avg       0.47      0.52      0.50        61

Confusion Matrix:
 [[27  1  1  0  0]
 [ 5  1  3  2  1]
 [ 1  4  2  2  0]
 [ 0  2  3  2  0]
 [ 0  2  0  2  0]]


In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', None]  # Include balanced class weight
}

# Perform grid search
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train_res, y_train_res)

# Best parameters from GridSearchCV
print("Best parameters:", grid_search.best_params_)

# Train with the best found parameters
best_rf_clf = grid_search.best_estimator_
best_rf_clf.fit(X_train_res, y_train_res)

# Evaluate on the test set
y_pred_best = best_rf_clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred_best))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))