In [1]:
#Q1
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# Step 1: Read the diabetes CSV into a DataFrame
diabetes_csv = 'diabetes.csv'  # path is relative to the notebook's working directory
df = pd.read_csv(diabetes_csv)
df  # bare last expression so the notebook renders the frame as the cell output

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
# Step 2: Explore the dataset
# Take the leading rows once, then print them as plain text.
df_preview = df.head()
print(df_preview)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [6]:
# Step 3: Preprocess the data
# In these measurement columns a zero is treated as "value not recorded"
# rather than a real reading, so it is turned into NaN first.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in zero_as_missing_cols:
    df[col] = df[col].replace(0, np.nan)

# Impute every missing value with the median of its own column.
# NOTE(review): the medians are computed on the full frame, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# imputed values - confirm whether that is acceptable for this exercise.
column_medians = df.median()
df = df.fillna(column_medians)

In [7]:
# Step 4: Separate the label column from the feature columns
target_col = 'Outcome'
y = df[target_col]                  # Labels
X = df.drop(columns=[target_col])   # Features: every remaining column

In [8]:
# Step 5: Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
split_options = {'test_size': 0.3, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [9]:
# Step 6: Standardize the features
# The scaler learns its statistics from the training rows only and is then
# applied unchanged to both partitions.
# NOTE: X_train / X_test are overwritten with their scaled versions, so running
# this cell a second time without re-running the split would scale already
# scaled data.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Step 7: Configure a cross-validated search for the best K
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 31)}  # candidate K values 1..30 inclusive
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,                # 5 cross-validation folds per candidate
    scoring='accuracy',  # candidates are ranked by accuracy
)

In [11]:
# Run the search over every candidate K on the training partition
grid_search.fit(X=X_train, y=y_train)

In [12]:
# Pull the winning K out of the search result
best_params = grid_search.best_params_
best_k = best_params['n_neighbors']
print(f'Optimal K value: {best_k}')

Optimal K value: 11


In [13]:
# Step 8: Fit a fresh KNN classifier that uses the K selected by the grid search
knn_optimal = KNeighborsClassifier(n_neighbors=best_k)
knn_optimal.fit(X=X_train, y=y_train)

In [14]:
# Step 9: Predict a label for every held-out (already scaled) test row
y_pred = knn_optimal.predict(X=X_test)

In [15]:
# Step 10: Evaluate the model on the held-out test set
# Compute the three summaries first, then print them in a fixed order.
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print(f'Accuracy: {test_accuracy:.4f}')
print('Confusion Matrix:')
print(test_confusion)
print('Classification Report:')
print(test_report)

Accuracy: 0.7229
Confusion Matrix:
[[119  32]
 [ 32  48]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.79      0.79       151
           1       0.60      0.60      0.60        80

    accuracy                           0.72       231
   macro avg       0.69      0.69      0.69       231
weighted avg       0.72      0.72      0.72       231



In [17]:
# Example new patient, keyed by feature name so that every value is visibly
# paired with the column it belongs to. The column names and their order follow
# the feature columns shown by df.head() (all columns except 'Outcome').
# A positional list is easy to get wrong here: BMI must hold the body-mass
# index (31.6), DiabetesPedigreeFunction the pedigree score (0.349), and Age
# the age in years.
# NOTE(review): 72 is assumed to be the patient's age - confirm against the
# intended example.
# A DataFrame (rather than a bare array) also carries the feature names the
# scaler saw when it was fitted on the training DataFrame.
new_patient = pd.DataFrame([{
    'Pregnancies': 2,
    'Glucose': 89,
    'BloodPressure': 58,
    'SkinThickness': 33,
    'Insulin': 50,
    'BMI': 31.6,
    'DiabetesPedigreeFunction': 0.349,
    'Age': 72,
}])

In [18]:
# Apply the scaler fitted on the training data to the new patient's features.
# NOTE: new_patient is overwritten with its scaled version, so re-running this
# cell on its own would apply the scaling a second time.
new_patient = scaler.transform(X=new_patient)



In [19]:
# Predict the class of the (scaled) new patient and report it
# 0 = Not Diabetic, 1 = Diabetic
new_prediction = knn_optimal.predict(new_patient)
predicted_outcome = new_prediction[0]  # only one row was submitted
print(f'Predicted Outcome for the new patient: {predicted_outcome}')

Predicted Outcome for the new patient: 0


In [None]:
#Q2
# Import necessary libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
# 1. Load the dataset
# Point groceries_csv at your grocery dataset file if it lives elsewhere.
groceries_csv = 'groceries.csv'
groceries_data = pd.read_csv(groceries_csv)

In [None]:
# Show a heading followed by the leading rows of the grocery data
groceries_preview = groceries_data.head()
print("First few rows of the dataset:", groceries_preview, sep="\n")

In [None]:
# 2. Preprocess the Data
# Build a member x item matrix: one row per Member_number, one column per
# itemDescription. Each cell first holds how many (non-null) Date entries the
# member has for that item; combinations that never occur are filled with 0.
purchase_counts = (
    groceries_data
    .groupby(['Member_number', 'itemDescription'])['Date']
    .count()
    .unstack(fill_value=0)
)
# Collapse counts to a boolean "bought at least once" indicator with a single
# vectorized comparison. This replaces the element-wise DataFrame.applymap
# call (deprecated in recent pandas) and yields the boolean frame that
# mlxtend's apriori works with most efficiently.
basket = purchase_counts > 0

In [None]:
# 3. Apply the Apriori Algorithm
# Keep only itemsets that appear in at least 25% of the baskets; column names
# (item descriptions) are reported instead of column indices.
apriori_options = {'min_support': 0.25, 'use_colnames': True}
frequent_itemsets = apriori(basket, **apriori_options)

In [None]:
# 4. Display the Results
# Heading and table are emitted on separate lines by a single print call.
print("\nFrequent Itemsets with support >= 0.25:", frequent_itemsets, sep="\n")

In [None]:
# 5. Generate Association Rules (optional)
# Derive rules from the frequent itemsets, keeping those whose lift is at
# least 1.
rule_options = {'metric': "lift", 'min_threshold': 1}
rules = association_rules(frequent_itemsets, **rule_options)
print("\nAssociation Rules:", rules, sep="\n")