Data Overview

In [19]:
import pandas as pd

# Load the dataset
data = pd.read_csv('/home/lili/Downloads/Telegram Desktop/data mining/tamrin 3/cleaned_data.csv')

# Display the first few rows of the data
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,1.051434,0,1,1,2,1,2.058363,1.110872,1,1
1,0,0.78607,0,0,1,3,0,2.058363,-0.082921,2,1
2,1,1.62639,0,1,1,2,0,0.148256,0.535043,2,1
3,0,0.255342,0,0,1,2,1,2.058363,0.80189,3,1
4,0,1.582163,1,0,1,3,0,2.058363,-0.658751,2,1


Data Splitting for Training and Testing

In [20]:
from sklearn.model_selection import train_test_split

# Separate features and target variable
X = data.drop('stroke', axis=1)
y = data['stroke']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Balancing Classes Using SMOTE

In [21]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

Model Implementation: K-Nearest Neighbors (KNN) with euclidean

In [22]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize a list to store results
knn_results = []

# Test different values for n_neighbors and metrics
for n in [3, 5, 7]:
        knn = KNeighborsClassifier(n_neighbors=n, metric='euclidean', weights='distance')
        knn.fit(X_train, y_train)
        y_knn_pred = knn.predict(X_test)
        
        # Evaluate the model
        accuracy = accuracy_score(y_test, y_knn_pred)
        report = classification_report(y_test, y_knn_pred, output_dict=True)
        confusion = confusion_matrix(y_test, y_knn_pred)
        
        knn_results.append({
            "n_neighbors": n,
            "metric": 'euclidean',
            "accuracy": accuracy,
            "precision_stroke": report['1']['precision'],
            "recall_stroke": report['1']['recall'],
            "f1_score_stroke": report['1']['f1-score'],
            "confusion_matrix": confusion
        })

# Print the results
import pandas as pd
knn_df = pd.DataFrame(knn_results)
print(knn_df)


   n_neighbors     metric  accuracy  precision_stroke  recall_stroke  \
0            3  euclidean  0.931507          0.250000       0.064516   
1            5  euclidean  0.935421          0.250000       0.032258   
2            7  euclidean  0.938356          0.333333       0.016129   

   f1_score_stroke      confusion_matrix  
0         0.102564  [[948, 12], [58, 4]]  
1         0.057143   [[954, 6], [60, 2]]  
2         0.030769   [[958, 2], [61, 1]]  


Model Implementation: K-Nearest Neighbors (KNN) with manhattan

In [23]:
knn_results = []

# Test different values for n_neighbors and metrics
for n in [3, 5, 7]:
        knn = KNeighborsClassifier(n_neighbors=n, metric='manhattan', weights='distance')
        knn.fit(X_train, y_train)
        y_knn_pred = knn.predict(X_test)
        
        # Evaluate the model
        accuracy = accuracy_score(y_test, y_knn_pred)
        report = classification_report(y_test, y_knn_pred, output_dict=True)
        confusion = confusion_matrix(y_test, y_knn_pred)
        knn_results.append({
            "n_neighbors": n,
            "metric": 'manhattan',
            "accuracy": accuracy,
            "precision_stroke": report['1']['precision'],
            "recall_stroke": report['1']['recall'],
            "f1_score_stroke": report['1']['f1-score'],
            "confusion_matrix": confusion
        })

# Print the results
import pandas as pd
knn_df = pd.DataFrame(knn_results)
print(knn_df)

   n_neighbors     metric  accuracy  precision_stroke  recall_stroke  \
0            3  manhattan  0.932485          0.266667       0.064516   
1            5  manhattan  0.937378          0.375000       0.048387   
2            7  manhattan  0.937378          0.250000       0.016129   

   f1_score_stroke      confusion_matrix  
0         0.103896  [[949, 11], [58, 4]]  
1         0.085714   [[955, 5], [59, 3]]  
2         0.030303   [[957, 3], [61, 1]]  


Model Implementation: K-Nearest Neighbors (KNN) with Smote

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize a list to store results
knn_results = []

# Test different values for n_neighbors and metrics
for n in [3, 5, 7]:
        knn = KNeighborsClassifier(n_neighbors=n, metric='euclidean', weights='distance')
        knn.fit(X_resampled, y_resampled)
        y_knn_pred = knn.predict(X_test)
        
        # Evaluate the model
        accuracy = accuracy_score(y_test, y_knn_pred)
        report = classification_report(y_test, y_knn_pred, output_dict=True)
        confusion = confusion_matrix(y_test, y_knn_pred)
        
        knn_results.append({
            "n_neighbors": n,
            "metric": 'euclidean',
            "accuracy": accuracy,
            "precision_stroke": report['1']['precision'],
            "recall_stroke": report['1']['recall'],
            "f1_score_stroke": report['1']['f1-score'],
            "confusion_matrix": confusion
        })

# Print the results
import pandas as pd
knn_df = pd.DataFrame(knn_results)
print(knn_df)

   n_neighbors     metric  accuracy  precision_stroke  recall_stroke  \
0            3  euclidean  0.813112          0.122807       0.338710   
1            5  euclidean  0.802348          0.135417       0.419355   
2            7  euclidean  0.794521          0.130000       0.419355   

   f1_score_stroke        confusion_matrix  
0         0.180258  [[810, 150], [41, 21]]  
1         0.204724  [[794, 166], [36, 26]]  
2         0.198473  [[786, 174], [36, 26]]  


Model Implementation: K-Nearest Neighbors (KNN) with Smote

In [25]:
knn_results = []

# Test different values for n_neighbors and metrics
for n in [3, 5, 7]:
        knn = KNeighborsClassifier(n_neighbors=n, metric='manhattan', weights='distance')
        knn.fit(X_resampled, y_resampled)
        y_knn_pred = knn.predict(X_test)
        
        # Evaluate the model
        accuracy = accuracy_score(y_test, y_knn_pred)
        report = classification_report(y_test, y_knn_pred, output_dict=True)
        confusion = confusion_matrix(y_test, y_knn_pred)
        knn_results.append({
            "n_neighbors": n,
            "metric": 'manhattan',
            "accuracy": accuracy,
            "precision_stroke": report['1']['precision'],
            "recall_stroke": report['1']['recall'],
            "f1_score_stroke": report['1']['f1-score'],
            "confusion_matrix": confusion
        })

# Print the results
import pandas as pd
knn_df = pd.DataFrame(knn_results)
print(knn_df)

   n_neighbors     metric  accuracy  precision_stroke  recall_stroke  \
0            3  manhattan  0.816047          0.125000       0.338710   
1            5  manhattan  0.797456          0.139303       0.451613   
2            7  manhattan  0.789628          0.130435       0.435484   

   f1_score_stroke        confusion_matrix  
0         0.182609  [[813, 147], [41, 21]]  
1         0.212928  [[787, 173], [34, 28]]  
2         0.200743  [[780, 180], [35, 27]]  
