In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the heart dataset CSV and discard exact duplicate rows in one step.
# The surviving rows keep their original index labels (the index is not reset).
dt = pd.read_csv('/content/heart (1).csv').drop_duplicates()

In [None]:
# Drop sparsely populated columns: a column is kept only when its non-null
# count reaches (number of rows - 20% of the rows).
n_rows = len(dt)
threshold = n_rows * 0.2
dt = dt.dropna(axis='columns', thresh=n_rows - threshold)

In [None]:
# Replace every object-typed column with the integer codes of its categorical
# representation (pandas assigns code -1 to missing values).
object_columns = [name for name in dt.columns if dt[name].dtype == 'object']
for name in object_columns:
  dt[name] = dt[name].astype('category').cat.codes

In [None]:
# Summarise column dtypes and non-null counts after de-duplication,
# sparse-column removal and categorical encoding.
dt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 302 entries, 0 to 878
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       302 non-null    int64  
 1   sex       302 non-null    int64  
 2   cp        302 non-null    int64  
 3   trestbps  302 non-null    int64  
 4   chol      302 non-null    int64  
 5   fbs       302 non-null    int64  
 6   restecg   302 non-null    int64  
 7   thalach   302 non-null    int64  
 8   exang     302 non-null    int64  
 9   oldpeak   302 non-null    float64
 10  slope     302 non-null    int64  
 11  ca        302 non-null    int64  
 12  thal      302 non-null    int64  
 13  target    302 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 35.4 KB


In [None]:
# Feature filter: discard every column whose absolute correlation with
# 'target' is below the cut-off. 'target' correlates perfectly with itself,
# so it is never selected for removal.
corr_matrix = dt.corr()
target_corr = corr_matrix['target']
threshold = 0.1
weakly_correlated = target_corr.abs() < threshold
features_to_drop = target_corr.loc[weakly_correlated].index.tolist()
dt = dt.drop(columns=features_to_drop)
dt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 302 entries, 0 to 878
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       302 non-null    int64  
 1   sex       302 non-null    int64  
 2   cp        302 non-null    int64  
 3   trestbps  302 non-null    int64  
 4   restecg   302 non-null    int64  
 5   thalach   302 non-null    int64  
 6   exang     302 non-null    int64  
 7   oldpeak   302 non-null    float64
 8   slope     302 non-null    int64  
 9   ca        302 non-null    int64  
 10  thal      302 non-null    int64  
 11  target    302 non-null    int64  
dtypes: float64(1), int64(11)
memory usage: 30.7 KB


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# List every column whose standard deviation is non-zero, i.e. every column
# that is not constant. Note that this check also reports 'target'.
varying_columns = [name for name in dt.columns if dt[name].std() != 0]
for column in varying_columns:
  print(f"Column '{column}' might require scaling.")

Column 'age' might require scaling.
Column 'sex' might require scaling.
Column 'cp' might require scaling.
Column 'trestbps' might require scaling.
Column 'restecg' might require scaling.
Column 'thalach' might require scaling.
Column 'exang' might require scaling.
Column 'oldpeak' might require scaling.
Column 'slope' might require scaling.
Column 'ca' might require scaling.
Column 'thal' might require scaling.
Column 'target' might require scaling.


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# Separate the label from the features, then rescale each non-constant
# feature with a scaler picked from its skewness.
X = dt.drop(columns='target')
y = dt['target']
dtorg = dt  # handle on the frame that still contains 'target'
dt = X      # alias, not a copy: the assignments below modify X in place
# NOTE(review): the scalers are fit on all rows before the train/test split in
# a later cell, so test-set statistics influence the scaling - consider
# fitting on the training split only.
for column in dt.columns:
  # Constant columns are left untouched.
  if dt[column].std() == 0:
    continue
  skewness = dt[column].skew()
  if abs(skewness) > 1:
    # Strong skew in either direction is treated as a sign of outliers.
    scaler = RobustScaler()
    print(f"Column '{column}' has outliers, using RobustScaler.")
  elif -1 <= skewness <= 1:
    scaler = StandardScaler()
    print(f"Column '{column}' is normally distributed, using StandardScaler.")
  else:
    # Only reached when neither comparison holds (e.g. skewness is NaN).
    scaler = MinMaxScaler()
    print(f"Column '{column}' has a bounded range, using MinMaxScaler.")
  dt[column] = scaler.fit_transform(dt[[column]])


Column 'age' is normally distributed, using StandardScaler.
Column 'sex' is normally distributed, using StandardScaler.
Column 'cp' is normally distributed, using StandardScaler.
Column 'trestbps' is normally distributed, using StandardScaler.
Column 'restecg' is normally distributed, using StandardScaler.
Column 'thalach' is normally distributed, using StandardScaler.
Column 'exang' is normally distributed, using StandardScaler.
Column 'oldpeak' has outliers, using RobustScaler.
Column 'slope' is normally distributed, using StandardScaler.
Column 'ca' has outliers, using RobustScaler.
Column 'thal' is normally distributed, using StandardScaler.


In [None]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

n_features = X.shape[1]

# PCA branch: taken when there are more than 10 features and the feature
# correlation matrix is rank-deficient.
if n_features > 10 and np.linalg.matrix_rank(X.corr()) < n_features:
    print("PCA might be beneficial for dimensionality reduction.")

    # Fit an unrestricted PCA on the training split to read off how much
    # variance each successive component explains.
    pca = PCA().fit(X_train)
    explained_variance_ratio = pca.explained_variance_ratio_
    cumulative_variance_ratio = explained_variance_ratio.cumsum()

    # Position of the first cumulative value reaching 95%, converted to a count.
    n_components = np.argmax(cumulative_variance_ratio >= 0.95) + 1

    print(f"Optimal number of PCA components: {n_components}")

    # Refit with the chosen size; the test split is only transformed.
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # From here on the splits hold PCA components instead of the original features.
    X_train, X_test = pd.DataFrame(X_train_pca), pd.DataFrame(X_test_pca)

# LDA branch: taken when the label has at least two classes and the rarest
# class has more than 10 rows.
elif len(y.unique()) > 1 and y.value_counts().min() > 10:
    print("LDA might be beneficial for dimensionality reduction and feature extraction.")
    # LDA is asked for (number of classes - 1) components, capped at the feature count.
    lda = LDA(n_components=min(len(y.unique()) - 1, n_features))
    X_train_lda = lda.fit_transform(X_train, y_train)
    X_test_lda = lda.transform(X_test)

    # From here on the splits hold LDA components instead of the original features.
    X_train, X_test = pd.DataFrame(X_train_lda), pd.DataFrame(X_test_lda)

else:
  print("PCA or LDA is not necessary or not applicable based on the current data.")


LDA might be beneficial for dimensionality reduction and feature extraction.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report


# Baseline comparison of several classifiers on the held-out split.
#
# Candidate algorithms and when they tend to be a reasonable choice:
# 1. Logistic Regression: a good baseline for binary classification.
#    Suitable for linearly separable data or when interpretability is desired.
# 2. Decision Tree: handles non-linear relationships and interactions.
#    Prone to overfitting if not pruned or otherwise constrained.
# 3. Random Forest: an ensemble of decision trees.
#    Less prone to overfitting than a single tree.
# 4. Support Vector Machine (SVM): effective when classes are separated by a clear margin.
#    Can be computationally expensive and sensitive to hyperparameter tuning.
# 5. K-Nearest Neighbors (KNN): classifies by similarity to nearby training points.
#    Simple, but prediction cost grows with the size of the training set.
# 6. Naive Bayes: assumes the features are conditionally independent given the class.
#    The Gaussian variant used here models continuous features.

# Start with a single Random Forest. The tree-based estimators draw random
# numbers while fitting, so they are seeded (same seed as the train/test
# split) to make the printed scores repeatable across runs.
clf = RandomForestClassifier(random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

# Compare all candidates with their default hyperparameters.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}

# Dedicated loop names keep `clf`, `y_pred` and `accuracy` pointing at the
# Random Forest example above instead of being overwritten by the last model.
for name, model in classifiers.items():
    model.fit(X_train, y_train)
    model_accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f"{name}: Accuracy = {model_accuracy}")


Accuracy: 0.6885245901639344
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.69      0.70        32
           1       0.67      0.69      0.68        29

    accuracy                           0.69        61
   macro avg       0.69      0.69      0.69        61
weighted avg       0.69      0.69      0.69        61

Logistic Regression: Accuracy = 0.819672131147541
Decision Tree: Accuracy = 0.6885245901639344
Random Forest: Accuracy = 0.6885245901639344
SVM: Accuracy = 0.8360655737704918
KNN: Accuracy = 0.7868852459016393
Naive Bayes: Accuracy = 0.8032786885245902


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Fit every candidate on the training split, score it on the test split and
# keep the one with the highest accuracy.
# Requires X_train, X_test, y_train, y_test from the earlier split cell.

# Candidates with default hyperparameters. The tree-based estimators are
# seeded so the ranking (and therefore the selected model) does not change
# from one run to the next.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}

# Map classifier name -> accuracy on the test split.
# NOTE(review): choosing the winner by test-set accuracy makes the reported
# score optimistic; consider selecting via cross-validation on the training
# split and reserving the test split for the final estimate.
results = {}
for name, model in classifiers.items():
  model.fit(X_train, y_train)
  results[name] = accuracy_score(y_test, model.predict(X_test))

# Highest accuracy wins; on a tie the classifier listed first is kept.
best_classifier = max(results, key=results.get)
best_accuracy = results[best_classifier]

# Print the best classifier and its accuracy
print(f"The best classifier is: {best_classifier}")
print(f"With accuracy: {best_accuracy}")

# Fitted estimator used by the following cells.
best_clf = classifiers[best_classifier]


The best classifier is: SVM
With accuracy: 0.8360655737704918


In [None]:
# Feature importance:
import pandas as pd
# Report how strongly each input column drives the selected model.
# Tree-based models expose `feature_importances_`; logistic regression exposes
# signed coefficients (`coef_[0]`, the single coefficient row of a binary
# problem). Other estimators are reported as unsupported.
if isinstance(best_clf, RandomForestClassifier):
  importance_label = "Random Forest"
  importance_values = best_clf.feature_importances_
elif isinstance(best_clf, DecisionTreeClassifier):
  importance_label = "Decision Tree"
  importance_values = best_clf.feature_importances_
elif isinstance(best_clf, LogisticRegression):
  importance_label = "Logistic Regression - Coefficients"
  importance_values = best_clf.coef_[0]
else:
  importance_label = None
  importance_values = None

if importance_values is None:
  print("Feature importance is not readily available for this classifier.")
else:
  feature_importances = pd.DataFrame({'feature': X_train.columns, 'importance': importance_values})
  # Rank by magnitude so that strongly negative coefficients are listed as
  # important instead of sinking to the bottom; the sign is kept in the table.
  # Tree importances are non-negative, so their ordering is unaffected.
  ranking = feature_importances['importance'].abs().sort_values(ascending=False).index
  feature_importances = feature_importances.loc[ranking]
  print(f"Feature Importance ({importance_label}):")
  print(feature_importances)


Feature importance is not readily available for this classifier.


In [None]:
from sklearn.model_selection import GridSearchCV

# Tune the selected model with an exhaustive grid search.
# Each estimator type gets a grid containing only parameters it accepts:
# GridSearchCV rejects unknown parameter names (e.g. `n_estimators` on a
# single decision tree), and every branch must define `param_grid` so the
# search below never hits an undefined name.
if isinstance(best_clf, RandomForestClassifier):
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
elif isinstance(best_clf, DecisionTreeClassifier):
    # Same tree-shape parameters as the forest, without the ensemble size.
    param_grid = {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
elif isinstance(best_clf, SVC):
    param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }
elif isinstance(best_clf, LogisticRegression):
    param_grid = {
        'C': [0.01, 0.1, 1, 10]
    }
elif isinstance(best_clf, KNeighborsClassifier):
    param_grid = {
        'n_neighbors': [3, 5, 7, 9, 11],
        'weights': ['uniform', 'distance']
    }
elif isinstance(best_clf, GaussianNB):
    param_grid = {
        'var_smoothing': [1e-9, 1e-8, 1e-7]
    }
else:
    # Unknown estimator type: an empty grid evaluates the current settings only.
    param_grid = {}

# 5-fold cross-validated search on the training split, scored by accuracy.
grid_search = GridSearchCV(estimator=best_clf, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated accuracy on the training split,
# not the accuracy on the held-out test split.
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Evaluate the best configuration found by the search on the test split.
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with tuned model: {accuracy}")

Best parameters found:  {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
Best accuracy found:  0.8713435374149661
Accuracy with tuned model: 0.8360655737704918
