In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Load the HeberMan's Heart Disease dataset from the CSV file
habermans_data = pd.read_csv('haberman1.csv')

# Assuming 'survival_status' is the target column, and other columns are features
X = habermans_data.drop('survival_status', axis=1)
y = habermans_data['survival_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a KNN classifier with n_neighbors=35
neigh = KNeighborsClassifier(n_neighbors=35)

# Fit the classifier to the training data
neigh.fit(X_train, y_train)

# Make predictions on the test set
Y_pred_test = neigh.predict(X_test)

# Generate a classification report for the test set
report_test = classification_report(y_test, Y_pred_test)
print("Classification Report for Test Set:\n", report_test)

# Make predictions on the training set
Y_pred_train = neigh.predict(X_train)

# Generate a classification report for the training set
report_train = classification_report(y_train, Y_pred_train)
print("Classification Report for Training Set:\n", report_train)



Classification Report for Test Set:
               precision    recall  f1-score   support

           1       0.80      0.96      0.87        47
           2       0.60      0.21      0.32        14

    accuracy                           0.79        61
   macro avg       0.70      0.59      0.59        61
weighted avg       0.76      0.79      0.75        61

Classification Report for Training Set:
               precision    recall  f1-score   support

           1       0.77      0.97      0.85       177
           2       0.71      0.22      0.34        67

    accuracy                           0.76       244
   macro avg       0.74      0.59      0.60       244
weighted avg       0.75      0.76      0.71       244



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'survival_status' is the target column, and other columns are features
X = habermans_data.drop('survival_status', axis=1)
y = habermans_data['survival_status']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a KNN classifier with n_neighbors=35
neigh = KNeighborsClassifier(n_neighbors=35)

# Fit the classifier to the training data
neigh.fit(X_train, y_train)

# Make predictions on the test set
pred_Y_train = neigh.predict(X_train)
pred_Y_test = neigh.predict(X_test)

# Ensure Y_test and pred_Y_train have the same shape
print("Shape of Y_test:", y_test.shape)
print("Shape of pred_Y_train:", pred_Y_train.shape)

# Print unique values in Y_test and pred_Y_train to check for any discrepancies
print("Unique values in Y_test:", np.unique(y_test))
print("Unique values in pred_Y_train:", np.unique(pred_Y_train))

# Generate classification report for the test set
target_names = ['2', '1']
report_test = classification_report(y_test, pred_Y_test, target_names=target_names)
print("Classification Report for Test Set:\n", report_test)

# Generate classification report for the training set
report_train = classification_report(y_train, pred_Y_train, target_names=target_names)
print("Classification Report for Training Set:\n", report_train)


Shape of Y_test: (61,)
Shape of pred_Y_train: (244,)
Unique values in Y_test: [1 2]
Unique values in pred_Y_train: [1 2]
Classification Report for Test Set:
               precision    recall  f1-score   support

           2       0.80      0.96      0.87        47
           1       0.60      0.21      0.32        14

    accuracy                           0.79        61
   macro avg       0.70      0.59      0.59        61
weighted avg       0.76      0.79      0.75        61

Classification Report for Training Set:
               precision    recall  f1-score   support

           2       0.77      0.97      0.85       177
           1       0.71      0.22      0.34        67

    accuracy                           0.76       244
   macro avg       0.74      0.59      0.60       244
weighted avg       0.75      0.76      0.71       244



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [10]:
y_test.value_counts()

1    47
2    14
Name: survival_status, dtype: int64

In [11]:
y_train.value_counts()

1    177
2     67
Name: survival_status, dtype: int64

In [15]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred_Y_test)

array([[45,  2],
       [11,  3]], dtype=int64)

In [19]:
df=pd.read_csv('haberman1.csv')
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()
scaled_data=scaler.fit_transform(df)

In [20]:
scaled_data

array([[-2.10053274, -0.2613877 , -0.14414708, -0.60133779],
       [-2.10053274,  0.66204762, -0.561535  , -0.60133779],
       [-2.00730479, -1.18482303, -0.28327639, -0.60133779],
       ...,
       [ 2.281181  ,  0.66204762, -0.14414708, -0.60133779],
       [ 2.37440895,  0.66204762, -0.4224057 ,  1.66295884],
       [ 2.84054871, -1.4926348 , -0.28327639,  1.66295884]])

In [21]:
scaler

StandardScaler()

In [22]:
scaled_data=scaler.fit_transform([[10,20,100],[10,80,100]])


In [23]:
scaled_data

array([[ 0., -1.,  0.],
       [ 0.,  1.,  0.]])

In [24]:
from sklearn.preprocessing import StandardScaler
data=[[0,0],[0,0],[1,1],[5,5]]
scaler= StandardScaler()
print(scaler.fit_transform(data))

[[-0.72760688 -0.72760688]
 [-0.72760688 -0.72760688]
 [-0.24253563 -0.24253563]
 [ 1.69774938  1.69774938]]
