APPLIED DATA SCIENCE 1

CLUSTERING AND FITTING

NAME: SHAIK IMTHIYAZ

STUDENT ID: 23108363

In [None]:
import os
# Cap OpenMP at 3 threads, set here before the numeric libraries below are imported.
# This is the workaround scikit-learn's KMeans warning suggests for its memory leak
# on Windows with MKL (the variable limits OpenMP threads, not MKL itself).
os.environ["OMP_NUM_THREADS"] = "3"
#importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

In [None]:

# Reading the diabetes dataset.
# The CSV location can be overridden through the DIABETES_CSV environment variable
# so the notebook also runs on machines other than the author's; the original
# absolute path is kept as the default, so existing runs are unaffected.
DATA_PATH = os.environ.get(
    "DIABETES_CSV",
    r'C:\Users\imthi\Desktop\DataScience_Assignments\diabetes.csv',
)
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Print the frame summary (column dtypes and non-null counts) to spot missing values
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

In [None]:
# Preprocessing: separate features and target variable
X = data.drop(columns='Outcome')
y = data['Outcome']

# Standardizing all features --> useful for K-means.
# Clustering is unsupervised and uses every row, so this scaler is fitted on the full data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split for line fitting (regression).
# The raw features are split first and a separate scaler is fitted on the training
# rows only, so the test rows' mean/variance do not leak into the training data.
# The split still selects the same rows as before (same row count and random_state).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

In [None]:
# ----------------- Function to create the histogram -----------------
def plot_histogram(data, column, title="Histogram", xlabel="Value", ylabel="Frequency"):
    """
    Draws a 20-bin histogram of a single column and displays it.

    Args:
    - data: object indexable by column name (called here with a DataFrame)
    - column: name of the column whose values are binned
    - title: figure title
    - xlabel: label for the x-axis
    - ylabel: label for the y-axis
    """
    bar_style = {"bins": 20, "color": 'skyblue', "edgecolor": 'black'}

    plt.figure(figsize=(8, 6))
    values = data[column]
    plt.hist(values, **bar_style)

    # Labels and grid are independent of each other; only show() must come last
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True)
    plt.show()
# Histogram of the Glucose column to inspect how glucose levels are distributed
plot_histogram(data=data, column='Glucose', title="Glucose Level Distribution")

In [None]:
# ----------------- Function to create scatter plot with line fit -----------------
def plot_scatter_with_line(X, y, title="Scatter Plot with Line Fit",
                           xlabel="Feature", ylabel="Outcome"):
    """
    Fits a linear regression of y on X and plots the points with the fitted line.

    Args:
    - X: the single feature as a 2-D array of shape (n_samples, 1)
    - y: target values, one per row of X
    - title: figure title
    - xlabel: x-axis label (default keeps the previously hard-coded "Feature")
    - ylabel: y-axis label (default keeps the previously hard-coded "Outcome")
    """
    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)

    plt.figure(figsize=(8, 6))
    plt.scatter(X, y, color='blue', label='Data points')
    plt.plot(X, y_pred, color='red', label='Linear fit')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()

# Plotting Scatter plot with line fit: Glucose vs Outcome
# Column 1 of the standardized matrix is reshaped to the 2-D (n_samples, 1) input
# the regression expects; the x label now names the plotted feature.
plot_scatter_with_line(
    X_scaled[:, 1].reshape(-1, 1),
    y,
    title="Glucose vs Outcome with Line Fit",
    xlabel="Glucose (standardized)",
)


In [None]:
# ----------------- K-means clustering -----------------
def kmeans_clustering(X, n_clusters=2):
    """
    Applies K-means clustering and returns the predicted labels.

    Args:
    - X: feature data for clustering
    - n_clusters: number of clusters for K-means

    Returns:
    - Predicted cluster labels, one per row of X
    """
    # Forward the caller's n_clusters; it was previously ignored in favour of a literal 2.
    # A single initialisation with a fixed seed keeps the labels reproducible.
    kmeans = KMeans(n_clusters=n_clusters, n_init=1, random_state=42)
    return kmeans.fit_predict(X)

In [None]:
# ----------------- Function to create confusion matrix -----------------
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    """
    Computes the confusion matrix of two labelings and shows it as an annotated heatmap.

    Args:
    - y_true: reference labels (plotted on the vertical axis)
    - y_pred: labels to compare against the reference (horizontal axis)
    - title: figure title
    """
    counts = confusion_matrix(y_true, y_pred)
    heatmap_style = dict(
        annot=True,               # write the count inside every cell
        fmt='d',                  # counts are integers
        cmap='Blues',
        cbar=False,
        annot_kws={"size": 16},
    )

    plt.figure(figsize=(8, 6))
    sns.heatmap(counts, **heatmap_style)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(title)
    plt.show()
# Plotting Confusion Matrix: Perform K-means clustering and compare with actual Outcome
y_pred_kmeans = kmeans_clustering(X_scaled, n_clusters=2)
# K-means cluster ids are arbitrary: cluster 0 need not correspond to Outcome 0.
# When the two labelings agree on fewer than half of the rows, swap the two ids so
# the matrix diagonal holds the matches (assumes Outcome is coded 0/1 -- TODO confirm).
if np.mean(y_pred_kmeans == np.asarray(y)) < 0.5:
    y_pred_kmeans = 1 - y_pred_kmeans
plot_confusion_matrix(y, y_pred_kmeans, title="Confusion Matrix for K-means Clustering")

In [None]:
# ----------------- Function to create elbow plot for K-means -----------------
def plot_elbow(X, max_k=10, title="Elbow Plot", n_init=10):
    """
    Fits K-means for k = 1..max_k and plots the inertia of each fit against k.

    Args:
    - X: feature data for clustering
    - max_k: largest number of clusters to try (inclusive)
    - title: figure title
    - n_init: number of K-means initialisations per k; the best run is kept
    """
    cluster_counts = range(1, max_k + 1)

    # n_init is passed explicitly because scikit-learn changed its default
    # (10 -> 'auto'), which otherwise makes the curve depend on the installed
    # version and triggers a FutureWarning on the releases in between.
    distortions = [
        KMeans(n_clusters=k, n_init=n_init, random_state=42).fit(X).inertia_
        for k in cluster_counts
    ]

    plt.figure(figsize=(8, 6))
    plt.plot(cluster_counts, distortions, marker='o', color='b')
    plt.title(title)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Distortion')
    plt.grid(True)
    plt.show()
# Elbow plot over k = 1..10 on the standardized features, used to choose the cluster count
plot_elbow(X=X_scaled, max_k=10, title="K-means Elbow Plot")