In [1]:
# 1. Write a Python program to train an SVM Classifier on the Iris dataset and evaluate accuracy

# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Fetch the Iris data and separate the feature matrix from the class labels
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Build a linear-kernel SVM (other kernels worth trying: 'poly', 'rbf')
# and fit it on the training portion
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

# Predict the held-out labels and score them against the ground truth
y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the fraction of correctly classified test samples
print(f"Accuracy of the SVM classifier: {accuracy:.2f}")

Accuracy of the SVM classifier: 1.00


In [2]:
# 2. Write a Python program to train two SVM classifiers with Linear and RBF kernels on the Wine dataset, then
# compare their accuracies

# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Fetch the Wine data: chemical measurements (X) and cultivar labels (y)
wine = datasets.load_wine()
X, y = wine.data, wine.target

# Keep 30% of the samples aside for testing (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# --- Linear kernel: fit, predict, score ---
svm_linear = SVC(kernel='linear', random_state=42)
svm_linear.fit(X_train, y_train)
y_pred_linear = svm_linear.predict(X_test)
accuracy_linear = accuracy_score(y_test, y_pred_linear)

# --- RBF kernel: same split, same procedure ---
svm_rbf = SVC(kernel='rbf', random_state=42)
svm_rbf.fit(X_train, y_train)
y_pred_rbf = svm_rbf.predict(X_test)
accuracy_rbf = accuracy_score(y_test, y_pred_rbf)

# Show both scores side by side
print(f"Accuracy of SVM with Linear Kernel: {accuracy_linear:.2f}")
print(f"Accuracy of SVM with RBF Kernel: {accuracy_rbf:.2f}")

Accuracy of SVM with Linear Kernel: 0.98
Accuracy of SVM with RBF Kernel: 0.76


In [None]:
# 3.  Write a Python program to train an SVM Regressor (SVR) on a housing dataset and evaluate it using Mean
# Squared Error (MSE)

# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Extra helpers needed only by this cell: SVR is not scale invariant, so the
# features are standardised inside a pipeline before the regressor sees them.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load a housing dataset (California Housing is downloaded on first use).
# If you have your own dataset, load it here instead.
try:
    housing = datasets.fetch_california_housing()
    X = housing.data
    y = housing.target
except (AttributeError, OSError):
    # AttributeError: the loader is missing from this scikit-learn build.
    # OSError: the download or cache access failed (urllib's URLError and
    # HTTPError are OSError subclasses), e.g. when running offline.
    print("California Housing dataset not available. Using make_regression as an example.")
    from sklearn.datasets import make_regression
    X, y = make_regression(n_samples=200, n_features=10, noise=10, random_state=42)


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create an SVR regressor
# You can experiment with different kernels (e.g., 'linear', 'poly', 'rbf') and parameters (e.g., C, epsilon)
# The scaler is fitted on the training data only (inside the pipeline), so no
# test-set statistics leak into training. Without scaling, the linear SVR is
# very slow to converge on features whose magnitudes differ by orders of magnitude.
svr_regressor = make_pipeline(StandardScaler(), SVR(kernel='linear'))  # You can change the kernel here

# Train the regressor (scaling + SVR)
svr_regressor.fit(X_train, y_train)

# Make predictions on the test set (the pipeline applies the same scaling)
y_pred = svr_regressor.predict(X_test)

# Evaluate the regressor using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Print the Mean Squared Error
print(f"Mean Squared Error (MSE) of the SVR regressor: {mse:.2f}")

In [None]:
# 4.  Write a Python program to train an SVM Classifier with a Polynomial Kernel and visualize the decision
# boundary

# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Use Iris restricted to its first two features (sepal length / sepal width)
# so the decision surface can be drawn in a plane
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# Drop class 2 to get a binary problem, which is easier to read visually
is_first_two_classes = y != 2
X = X[is_first_two_classes]
y = y[is_first_two_classes]

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Polynomial-kernel SVM:
#   degree - degree of the polynomial kernel
#   C      - regularization parameter
svm_poly = SVC(kernel='poly', degree=3, C=1.0, random_state=42)
svm_poly.fit(X_train, y_train)

# Build a dense grid covering the data range with a margin of 1 on each side
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, 0.02),
    np.arange(y_min, y_max, 0.02),
)

# Classify every grid point, then fold the flat predictions back into grid shape
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
Z = svm_poly.predict(grid_points).reshape(xx.shape)

# Filled contours show the predicted class regions; training points are drawn on top
fig, ax = plt.subplots()
ax.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.coolwarm, edgecolors='k')
ax.set_xlabel('Sepal length')
ax.set_ylabel('Sepal width')
ax.set_title('SVM with Polynomial Kernel Decision Boundary')
plt.show()

In [None]:
# 5. Write a Python program to train a Gaussian Naïve Bayes classifier on the Breast Cancer dataset and
# evaluate accuracy

# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Fetch the Breast Cancer data: tumour measurements (X) and diagnosis labels (y)
breast_cancer = datasets.load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

# Reserve 30% of the samples for testing (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Gaussian Naive Bayes with default settings, fitted on the training portion
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predict the held-out labels and measure how many are correct
y_pred = gnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the test accuracy
print(f"Accuracy of the Gaussian Naive Bayes classifier: {accuracy:.2f}")

In [None]:
# 6. Write a Python program to train a Multinomial Naïve Bayes classifier for text classification using the 20
# Newsgroups dataset

# Import necessary libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Restrict 20 Newsgroups to four categories to keep the run fast;
# the dataset ships with its own train/test partition, so no manual split is needed.
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
newsgroups_train = fetch_20newsgroups(
    subset='train', categories=categories, shuffle=True, random_state=42
)
newsgroups_test = fetch_20newsgroups(
    subset='test', categories=categories, shuffle=True, random_state=42
)

# Raw documents and their integer category labels
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Turn the text into TF-IDF (Term Frequency-Inverse Document Frequency) vectors.
# The vocabulary and IDF weights are learned from the training documents only
# and then reused unchanged for the test documents.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Multinomial Naive Bayes: a standard choice for count/frequency text features
multinomial_nb = MultinomialNB()
multinomial_nb.fit(X_train_tfidf, y_train)

# Predict the test documents and compute the overall accuracy
y_pred = multinomial_nb.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Multinomial Naive Bayes classifier: {accuracy:.2f}")

# Per-category precision / recall / F1
print("\nClassification Report:")
print(
    classification_report(
        y_test, y_pred, target_names=newsgroups_test.target_names
    )
)

In [None]:
# 7. Write a Python program to train an SVM Classifier with different C values and compare the decision
# boundaries visually

# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Use Iris restricted to its first two features so the boundary can be drawn in 2-D
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# Keep only classes 0 and 1 for a simpler, binary picture
is_first_two_classes = y != 2
X = X[is_first_two_classes]
y = y[is_first_two_classes]

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Regularization strengths to compare (small C = softer margin, large C = harder margin)
c_values = [0.1, 1, 10, 100]

# The evaluation grid is the same for every C, so build it once up front:
# data range padded by 1 on each side, sampled every 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, 0.02),
    np.arange(y_min, y_max, 0.02),
)

# One figure per C value
for c in c_values:
    # RBF kernel is used because its boundary visibly changes shape with C
    svm_classifier = SVC(kernel='rbf', C=c, random_state=42)
    svm_classifier.fit(X_train, y_train)

    # Classify every grid point and fold the predictions back into grid shape
    grid_points = np.column_stack([xx.ravel(), yy.ravel()])
    Z = svm_classifier.predict(grid_points).reshape(xx.shape)

    # Predicted regions as filled contours, training points on top
    fig, ax = plt.subplots()
    ax.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.coolwarm, edgecolors='k')
    ax.set_xlabel('Sepal length')
    ax.set_ylabel('Sepal width')
    ax.set_title(f'SVM with RBF Kernel (C={c}) Decision Boundary')
    plt.show()

In [None]:
# 8.  Write a Python program to train a Bernoulli Naïve Bayes classifier for binary classification on a dataset with
# binary features

# Import necessary libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

# Create a synthetic dataset with binary features
# In a real-world scenario, you would load your binary dataset here.
# For demonstration, we'll create a simple dataset.
# Let's imagine features represent whether a customer bought certain products (1) or not (0).
# The target variable is whether they made a purchase on a specific day (1) or not (0).

# Dedicated, seeded random generator: the synthetic data (and therefore every
# number printed below) is identical on each run, and the global NumPy RNG
# state is left untouched.
rng = np.random.default_rng(42)

# Number of samples
n_samples = 1000
# Number of binary features
n_features = 10

# Generate random binary features (0 or 1); the upper bound 2 is exclusive
X = rng.integers(0, 2, size=(n_samples, n_features))

# Create a synthetic binary target variable based on a simple rule:
# the target is 1 whenever features 0 and 3 are both 1 ...
# (integer labels so the classification report shows classes 0/1, not 0.0/1.0)
y = np.zeros(n_samples, dtype=int)
y[(X[:, 0] == 1) & (X[:, 3] == 1)] = 1
# ... plus roughly 10% randomly chosen samples forced to 1 as label noise
y[rng.random(n_samples) < 0.1] = 1

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a Bernoulli Naive Bayes classifier
# BernoulliNB is suitable for binary features. It assumes features are independent boolean variables.
bnb = BernoulliNB()

# Train the classifier
bnb.fit(X_train, y_train)

# Make predictions on the test set
y_pred = bnb.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Bernoulli Naive Bayes classifier: {accuracy:.2f}")

# Print a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# 9.  Write a Python program to apply feature scaling before training an SVM model and compare results with
# unscaled data


# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Iris serves as the example dataset
iris = datasets.load_iris()
X, y = iris.data, iris.target

# 70/30 train/test split with a fixed seed; both models below share this split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# --- Baseline: SVM on the raw (unscaled) features ---

# The RBF kernel depends on distances between samples, so it is sensitive to feature scale
svm_unscaled = SVC(kernel='rbf', random_state=42)
svm_unscaled.fit(X_train, y_train)

# Score the baseline on the raw test features
y_pred_unscaled = svm_unscaled.predict(X_test)
accuracy_unscaled = accuracy_score(y_test, y_pred_unscaled)
print(f"Accuracy of SVM on unscaled data: {accuracy_unscaled:.2f}")

# --- Standardise the features ---

# Learn mean/std from the training data only, then apply the same transform
# to the test data so no test-set statistics leak into training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Same SVM configuration, now on the standardised features ---

svm_scaled = SVC(kernel='rbf', random_state=42)
svm_scaled.fit(X_train_scaled, y_train)

# Score the scaled model on the scaled test features
y_pred_scaled = svm_scaled.predict(X_test_scaled)
accuracy_scaled = accuracy_score(y_test, y_pred_scaled)
print(f"Accuracy of SVM on scaled data: {accuracy_scaled:.2f}")

In [None]:
# 10.  Write a Python program to train a Gaussian Naïve Bayes model and compare the predictions before and
# after Laplace Smoothing

# Import necessary libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Note: Laplace (additive) smoothing applies to count-based Naive Bayes models.
# GaussianNB's counterpart is *variance smoothing* ('var_smoothing'): a fraction
# of the largest feature variance is added to every per-class variance so that
# (near-)zero variances do not make the Gaussian likelihood degenerate.


def _class_variances(model):
    """Return the fitted per-class feature variances of a GaussianNB model.

    scikit-learn renamed the attribute from ``sigma_`` to ``var_`` in 1.0 and
    removed ``sigma_`` in 1.2, so prefer ``var_`` and fall back to ``sigma_``
    only on older releases.
    """
    return model.var_ if hasattr(model, "var_") else model.sigma_


# Create a tiny synthetic dataset with two well-separated classes.
# With only three training rows, per-class variances are estimated from very
# few samples and can be (close to) zero before smoothing is applied.
X = np.array([[1.0, 2.0, 3.0],
              [1.1, 2.1, 3.2],
              [1.2, 2.2, 3.1],
              [5.0, 6.0, 7.0],
              [5.1, 6.2, 7.1],
              [5.3, 6.1, 7.3]])

y = np.array([0, 0, 0, 1, 1, 1]) # Two classes

# Split the dataset into training and testing sets.
# stratify=y guarantees both classes appear in the 3-sample training set, so
# the per-class parameters indexed below (row 0 and row 1) always exist.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42, stratify=y
)

print("Training data:\n", X_train)
print("Training labels:", y_train)
print("Test data:\n", X_test)
print("Test labels:", y_test)

# --- Train Gaussian Naive Bayes with the default smoothing ---
# GaussianNB() already uses var_smoothing=1e-9, so this is the baseline.

print("\n--- Gaussian Naive Bayes with default variance smoothing (var_smoothing=1e-9) ---")
gnb_default = GaussianNB()
gnb_default.fit(X_train, y_train)

# Print estimated parameters (mean and variance) for each class
variances_default = _class_variances(gnb_default)
print("Mean (feature 0, Class 0):", gnb_default.theta_[0, 0])
print("Variance (feature 0, Class 0):", variances_default[0, 0])
print("Mean (feature 0, Class 1):", gnb_default.theta_[1, 0])
print("Variance (feature 0, Class 1):", variances_default[1, 0])


y_pred_default = gnb_default.predict(X_test)
accuracy_default = accuracy_score(y_test, y_pred_default)
print(f"Accuracy (default smoothing): {accuracy_default:.2f}")
print("Classification Report (default smoothing):\n", classification_report(y_test, y_pred_default))


# --- Train Gaussian Naive Bayes with stronger variance smoothing ---
# The value must differ from the default (1e-9); otherwise both models are
# identical and there is nothing to compare.

print("\n--- Gaussian Naive Bayes with stronger variance smoothing (var_smoothing=1e-2) ---")
gnb_smoothed = GaussianNB(var_smoothing=1e-2) # Adds 1% of the largest feature variance to every variance
gnb_smoothed.fit(X_train, y_train)

# Print estimated parameters with smoothing
variances_smoothed = _class_variances(gnb_smoothed)
print("Mean (feature 0, Class 0):", gnb_smoothed.theta_[0, 0])
print("Variance (feature 0, Class 0) with smoothing:", variances_smoothed[0, 0])
print("Mean (feature 0, Class 1):", gnb_smoothed.theta_[1, 0])
print("Variance (feature 0, Class 1) with smoothing:", variances_smoothed[1, 0])


y_pred_smoothed = gnb_smoothed.predict(X_test)
accuracy_smoothed = accuracy_score(y_test, y_pred_smoothed)
print(f"Accuracy (with variance smoothing): {accuracy_smoothed:.2f}")
print("Classification Report (with variance smoothing):\n", classification_report(y_test, y_pred_smoothed))

# Compare predictions
print("\nComparison of predictions:")
print("Test labels:", y_test)
print("Predictions (default):", y_pred_default)
print("Predictions (smoothed):", y_pred_smoothed)

In [None]:
# 11.  Write a Python program to train an SVM Classifier and use GridSearchCV to tune the hyperparameters (C,
# gamma, kernel)

# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Fetch the Iris data and separate features from labels
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 30% as a final test set; the grid search only ever sees the training part
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Search space: every combination of the values below is evaluated
param_grid = {
    'C': [0.1, 1, 10, 100],          # Regularization parameter
    'gamma': [1, 0.1, 0.01, 0.001],  # Kernel coefficient for 'rbf', 'poly', 'sigmoid'
    'kernel': ['rbf', 'linear']      # Type of kernel
}

# Untuned base estimator whose hyperparameters the search will set
svm = SVC()

# Exhaustive grid search:
#   estimator  - the model to tune
#   param_grid - the hyperparameter combinations to try
#   cv         - number of cross-validation folds
#   scoring    - metric used to rank the combinations
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)

# Run the cross-validated search on the training data
grid_search.fit(X_train, y_train)

# Winning hyperparameter combination
print("Best parameters found by GridSearchCV:")
print(grid_search.best_params_)

# Mean cross-validated accuracy of that combination
print(f"Best cross-validation accuracy: {grid_search.best_score_:.2f}")

# The search exposes the winning model as best_estimator_
best_svm_model = grid_search.best_estimator_

# Final check on the untouched test set
y_pred = best_svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the held-out accuracy
print(f"Accuracy of the best SVM model on the test set: {accuracy:.2f}")

In [None]:
# 12.  Write a Python program to train an SVM Classifier on an imbalanced dataset and apply class weighting and
# check whether it improves accuracy

# Import necessary libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Create a synthetic imbalanced dataset
# Let's create a dataset with two classes, where one class is significantly larger than the other.

# Dedicated, seeded random generator: the synthetic data and its shuffle are
# identical on every run, so the two models below are compared on the same
# data each time and the printed metrics are reproducible.
rng = np.random.default_rng(42)

n_samples_total = 1000
n_samples_majority = int(n_samples_total * 0.9) # 90% majority class
n_samples_minority = n_samples_total - n_samples_majority # 10% minority class

# Generate synthetic features (uniformly distributed):
# majority class covers the square [0, 5) x [0, 5),
# minority class covers [3, 5) x [3, 5), i.e. it overlaps a corner of the majority region
X_majority = rng.random((n_samples_majority, 2)) * 5
X_minority = rng.random((n_samples_minority, 2)) * 2 + 3 # Shift minority to overlap

# Create target labels (integers, so reports show classes 0/1 rather than 0.0/1.0)
y_majority = np.zeros(n_samples_majority, dtype=int)
y_minority = np.ones(n_samples_minority, dtype=int)

# Combine majority and minority classes
X = np.vstack((X_majority, X_minority))
y = np.hstack((y_majority, y_minority))

# Shuffle the dataset (same permutation applied to features and labels)
indices = rng.permutation(n_samples_total)
X = X[indices]
y = y[indices]

print(f"Dataset shape: {X.shape}")
print(f"Number of samples in majority class (0): {np.sum(y == 0)}")
print(f"Number of samples in minority class (1): {np.sum(y == 1)}")

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Using stratify=y is important for imbalanced datasets to maintain the class distribution in train/test splits

print(f"\nTraining set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Number of samples in training majority class (0): {np.sum(y_train == 0)}")
print(f"Number of samples in training minority class (1): {np.sum(y_train == 1)}")

# --- Train SVM without class weighting ---

print("\n--- Training SVM without class weighting ---")
svm_no_weight = SVC(kernel='linear', random_state=42)
svm_no_weight.fit(X_train, y_train)
y_pred_no_weight = svm_no_weight.predict(X_test)

accuracy_no_weight = accuracy_score(y_test, y_pred_no_weight)
print(f"Accuracy (no weighting): {accuracy_no_weight:.2f}")
print("Classification Report (no weighting):\n", classification_report(y_test, y_pred_no_weight))

# --- Train SVM with class weighting ---

print("\n--- Training SVM with class weighting ---")
# The 'balanced' mode automatically adjusts weights inversely proportional to class frequencies
svm_with_weight = SVC(kernel='linear', class_weight='balanced', random_state=42)
svm_with_weight.fit(X_train, y_train)
y_pred_with_weight = svm_with_weight.predict(X_test)

accuracy_with_weight = accuracy_score(y_test, y_pred_with_weight)
print(f"Accuracy (with weighting): {accuracy_with_weight:.2f}")
print("Classification Report (with weighting):\n", classification_report(y_test, y_pred_with_weight))

# You can visually compare the classification reports to see the effect of class weighting on
# precision, recall, and F1-score for the minority class.

In [None]:
# 13. Write a Python program to implement a Naïve Bayes classifier for spam detection using email data

# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer # or TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Toy corpus: ten short emails, alternating ham and spam
emails = [
    'Hey, how are you doing?',
    'Buy cheap viagra now',
    'Meet me for lunch tomorrow',
    'Get your free prize!',
    'Just checking in',
    'Claim your reward!',
    'Hi, let\'s catch up soon',
    'Spam email with lots of keywords like free money now',
    'Project update meeting at 2 PM',
    'Earn money online fast'
]

# One label per email, in the same order: 0 = ham, 1 = spam
labels = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    emails, labels, test_size=0.3, random_state=42
)

# Bag-of-words features: each email becomes a vector of token counts.
# The vocabulary is learned from the training emails only.
# (TfidfVectorizer is a drop-in alternative if TF-IDF weights are preferred.)
vectorizer = CountVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# Multinomial Naive Bayes models discrete counts, which matches word-count features
multinomial_nb = MultinomialNB()
multinomial_nb.fit(X_train_vectors, y_train)

# Predict the held-out emails and compute the overall accuracy
y_pred = multinomial_nb.predict(X_test_vectors)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Naive Bayes spam classifier: {accuracy:.2f}")

# Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))

# --- Classify unseen emails with the fitted vectorizer and model ---

# First example: spam-like wording
new_email = ["Claim your free gift card now!"]
new_email_vector = vectorizer.transform(new_email)
prediction = multinomial_nb.predict(new_email_vector)

print(f"\nNew email: '{new_email[0]}'")
print(f"Prediction: {'Spam' if prediction[0] == 1 else 'Ham'}")

# Second example: ordinary follow-up message
new_email_2 = ["Hey, just wanted to follow up on the meeting."]
new_email_vector_2 = vectorizer.transform(new_email_2)
prediction_2 = multinomial_nb.predict(new_email_vector_2)

print(f"\nNew email: '{new_email_2[0]}'")
print(f"Prediction: {'Spam' if prediction_2[0] == 1 else 'Ham'}")


In [None]:
# 14.  Write a Python program to train an SVM Classifier and a Naïve Bayes Classifier on the same dataset and
# compare their accuracy

# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB  # Using GaussianNB as an example
from sklearn.metrics import accuracy_score, classification_report

# Iris serves as the shared dataset for both models
data = datasets.load_iris()
X, y = data.data, data.target

# One 70/30 split (fixed seed) reused by both classifiers so the comparison is fair
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# --- Support Vector Machine ---

print("--- Training SVM Classifier ---")
# Linear kernel here; other kernels can be substituted
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)

# Predict the held-out samples and score the SVM
y_pred_svm = svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy of the SVM classifier: {accuracy_svm:.2f}")
print(
    "Classification Report (SVM):\n",
    classification_report(y_test, y_pred_svm, target_names=data.target_names),
)

# --- Naive Bayes ---

print("\n--- Training Naive Bayes Classifier ---")
# GaussianNB fits continuous features such as these; MultinomialNB would be
# the usual pick for text/count data
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(X_train, y_train)

# Predict the same held-out samples and score Naive Bayes
y_pred_nb = naive_bayes_classifier.predict(X_test)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy of the Naive Bayes classifier: {accuracy_nb:.2f}")
print(
    "Classification Report (Naive Bayes):\n",
    classification_report(y_test, y_pred_nb, target_names=data.target_names),
)

# --- Side-by-side summary ---

print("\n--- Comparing Accuracies ---")
print(f"SVM Classifier Accuracy: {accuracy_svm:.2f}")
print(f"Naive Bayes Classifier Accuracy: {accuracy_nb:.2f}")


In [None]:
# 15.  Write a Python program to perform feature selection before training a Naïve Bayes classifier and compare
# results

# Import necessary libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectKBest, chi2  # Import SelectKBest and chi2
from sklearn.metrics import accuracy_score, classification_report

# Four-category subset of 20 Newsgroups, using the dataset's own train/test partition
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
newsgroups_train = fetch_20newsgroups(
    subset='train', categories=categories, shuffle=True, random_state=42
)
newsgroups_test = fetch_20newsgroups(
    subset='test', categories=categories, shuffle=True, random_state=42
)

# Raw documents and their integer category labels
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# TF-IDF features; vocabulary and IDF weights come from the training documents only
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"Shape of TF-IDF vectors before feature selection: {X_train_tfidf.shape}")

# --- Baseline: Multinomial Naive Bayes on the full vocabulary ---

print("\n--- Training Multinomial Naive Bayes without Feature Selection ---")
multinomial_nb_no_fs = MultinomialNB()
multinomial_nb_no_fs.fit(X_train_tfidf, y_train)
y_pred_no_fs = multinomial_nb_no_fs.predict(X_test_tfidf)
accuracy_no_fs = accuracy_score(y_test, y_pred_no_fs)
print(f"Accuracy without Feature Selection: {accuracy_no_fs:.2f}")
print(
    "Classification Report (without Feature Selection):\n",
    classification_report(y_test, y_pred_no_fs, target_names=newsgroups_test.target_names),
)

# --- Chi-squared feature selection ---

# Keep the k terms with the highest chi-squared score against the labels.
# chi2 requires non-negative inputs, which TF-IDF values satisfy.
# k is a tunable choice; 1000 is just an example value.
k_best_features = 1000

selector = SelectKBest(chi2, k=k_best_features)

# Scores are computed on the training data only; the chosen columns are then
# picked out of both the training and the test matrices
X_train_selected = selector.fit_transform(X_train_tfidf, y_train)
X_test_selected = selector.transform(X_test_tfidf)

print(f"\nShape of TF-IDF vectors after feature selection: {X_train_selected.shape}")

# --- Multinomial Naive Bayes on the reduced vocabulary ---

print("\n--- Training Multinomial Naive Bayes with Feature Selection ---")
multinomial_nb_with_fs = MultinomialNB()
multinomial_nb_with_fs.fit(X_train_selected, y_train)
y_pred_with_fs = multinomial_nb_with_fs.predict(X_test_selected)
accuracy_with_fs = accuracy_score(y_test, y_pred_with_fs)
print(f"Accuracy with Feature Selection: {accuracy_with_fs:.2f}")
print(
    "Classification Report (with Feature Selection):\n",
    classification_report(y_test, y_pred_with_fs, target_names=newsgroups_test.target_names),
)

# --- Side-by-side summary ---

print("\n--- Comparing Accuracies ---")
print(f"Accuracy without Feature Selection: {accuracy_no_fs:.2f}")
print(f"Accuracy with Feature Selection: {accuracy_with_fs:.2f}")

In [None]:
# 16. Write a Python program to train an SVM Classifier using One-vs-Rest (OvR) and One-vs-One (OvO)
# strategies on the Wine dataset and compare their accuracy

# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import accuracy_score, classification_report

# Wine is a three-class problem, so a multiclass strategy is actually exercised
wine = datasets.load_wine()
X, y = wine.data, wine.target

# One 70/30 split (fixed seed) shared by both strategies
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# --- One-vs-Rest: one binary SVM per class (that class against all others) ---

print("--- Training SVM using One-vs-Rest (OvR) strategy ---")
# Binary base learner; other kernels can be substituted
base_svm_ovr = SVC(kernel='linear', random_state=42)

# Wrap the base learner in the OvR meta-estimator and fit it
ovr_classifier = OneVsRestClassifier(base_svm_ovr)
ovr_classifier.fit(X_train, y_train)

# Predict the held-out samples and score the OvR model
y_pred_ovr = ovr_classifier.predict(X_test)
accuracy_ovr = accuracy_score(y_test, y_pred_ovr)
print(f"Accuracy of SVM with OvR strategy: {accuracy_ovr:.2f}")
print(
    "Classification Report (OvR):\n",
    classification_report(y_test, y_pred_ovr, target_names=wine.target_names),
)

# --- One-vs-One: one binary SVM per pair of classes ---

print("\n--- Training SVM using One-vs-One (OvO) strategy ---")
# Same base configuration as above so only the multiclass strategy differs
base_svm_ovo = SVC(kernel='linear', random_state=42)

# Wrap the base learner in the OvO meta-estimator and fit it
ovo_classifier = OneVsOneClassifier(base_svm_ovo)
ovo_classifier.fit(X_train, y_train)

# Predict the held-out samples and score the OvO model
y_pred_ovo = ovo_classifier.predict(X_test)
accuracy_ovo = accuracy_score(y_test, y_pred_ovo)
print(f"Accuracy of SVM with OvO strategy: {accuracy_ovo:.2f}")
print(
    "Classification Report (OvO):\n",
    classification_report(y_test, y_pred_ovo, target_names=wine.target_names),
)

# --- Side-by-side summary ---

print("\n--- Comparing Accuracies ---")
print(f"SVM with OvR Accuracy: {accuracy_ovr:.2f}")
print(f"SVM with OvO Accuracy: {accuracy_ovo:.2f}")

In [None]:
# 17. Write a Python program to train an SVM Classifier using Linear, Polynomial, and RBF kernels on the Breast
# Cancer dataset and compare their accuracy

# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Breast Cancer dataset: feature matrix and binary target
breast_cancer = datasets.load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


def _fit_and_report_kernel(model, label, banner):
    """Fit ``model`` on the training split and print its test-set metrics.

    Parameters
    ----------
    model : an unfitted SVC instance
    label : kernel description used inside the printed messages
    banner : section header printed before training starts

    Returns
    -------
    (fitted model, test-set predictions, test-set accuracy)
    """
    print(banner)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    print(f"Accuracy ({label}): {score:.2f}")
    print(
        f"Classification Report ({label}):\n",
        classification_report(y_test, predictions, target_names=breast_cancer.target_names),
    )
    return model, predictions, score


# --- Linear kernel ---
svm_linear, y_pred_linear, accuracy_linear = _fit_and_report_kernel(
    SVC(kernel='linear', random_state=42),
    "Linear Kernel",
    "--- Training SVM with Linear Kernel ---",
)

# --- Polynomial kernel (tune 'degree' and 'C' to experiment) ---
svm_poly, y_pred_poly, accuracy_poly = _fit_and_report_kernel(
    SVC(kernel='poly', degree=3, C=1.0, random_state=42),
    "Polynomial Kernel",
    "\n--- Training SVM with Polynomial Kernel ---",
)

# --- RBF kernel (tune 'gamma' and 'C' to experiment) ---
svm_rbf, y_pred_rbf, accuracy_rbf = _fit_and_report_kernel(
    SVC(kernel='rbf', gamma='scale', C=1.0, random_state=42),
    "RBF Kernel",
    "\n--- Training SVM with RBF Kernel ---",
)

# --- Side-by-side accuracy summary ---
print("\n--- Comparing Accuracies ---")
for kernel_label, kernel_accuracy in (
    ("Linear Kernel", accuracy_linear),
    ("Polynomial Kernel", accuracy_poly),
    ("RBF Kernel", accuracy_rbf),
):
    print(f"Accuracy ({kernel_label}): {kernel_accuracy:.2f}")

In [None]:
# 18. Write a Python program to train an SVM Classifier using Stratified K-Fold Cross-Validation and compute the
# average accuracy

# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
import numpy as np

# Iris features and class labels
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Cross-validation splitter: 5 stratified folds, shuffled with a fixed seed
# so that the fold assignment is reproducible between runs.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Model under evaluation: linear-kernel SVM (other kernels can be substituted)
svm_classifier = SVC(kernel='linear', random_state=42)

# cross_val_score fits a fresh copy of the estimator on each training fold
# and returns one accuracy value per held-out fold.
scores = cross_val_score(
    estimator=svm_classifier,
    X=X,
    y=y,
    scoring='accuracy',
    cv=skf,
)

# Per-fold results
print(f"Accuracy scores for each fold: {scores}")

# Mean accuracy over all folds
average_accuracy = scores.mean()
print(f"Average accuracy: {average_accuracy:.2f}")

In [None]:
# 19. Write a Python program to train a Naïve Bayes classifier using different prior probabilities and compare
# performance

# Import necessary libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Create a synthetic imbalanced dataset with simple non-negative features.
# The first five rows are class 0 (majority); the last two are class 1 (minority).
X = np.array([
    [1, 0, 0], # Class 0
    [1, 1, 0], # Class 0
    [1, 0, 1], # Class 0
    [1, 0, 0], # Class 0
    [1, 1, 0], # Class 0
    [0, 1, 1], # Class 1
    [0, 1, 1]  # Class 1
])
y = np.array([0, 0, 0, 0, 0, 1, 1])

print(f"Dataset shape: {X.shape}")
print(f"Class distribution (0: Majority, 1: Minority): {np.bincount(y)}")

# Split the dataset into training and testing sets.
# stratify=y keeps both classes represented on each side of the split,
# which matters for a dataset this small and this imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

print(f"\nTraining set class distribution: {np.bincount(y_train)}")
print(f"Test set class distribution: {np.bincount(y_test)}")

# --- Train Multinomial Naive Bayes with default priors ---

print("\n--- Training Multinomial Naive Bayes with Default Priors ---")
# With no class_prior given, the class priors are estimated from y_train.
multinomial_nb_default = MultinomialNB()

# Train the classifier
multinomial_nb_default.fit(X_train, y_train)

# class_log_prior_ stores log-probabilities; exponentiate to show plain probabilities
print("Learned priors (default):", np.exp(multinomial_nb_default.class_log_prior_))

# Make predictions on the test set
y_pred_default = multinomial_nb_default.predict(X_test)

# Evaluate the classifier
accuracy_default = accuracy_score(y_test, y_pred_default)
print(f"Accuracy (default priors): {accuracy_default:.2f}")
print("Classification Report (default priors):\n", classification_report(y_test, y_pred_default))

# --- Train Multinomial Naive Bayes with custom priors ---

print("\n--- Training Multinomial Naive Bayes with Custom Priors ---")
# Define custom prior probabilities.
# They should sum to 1 and follow the order of the class labels: [prior_class_0, prior_class_1].
custom_priors = np.array([0.3, 0.7]) # Example: assigning higher prior to the minority class

# MultinomialNB accepts user-supplied priors through `class_prior`.
# (The `priors` keyword exists only on GaussianNB; passing it here raises TypeError.)
multinomial_nb_custom = MultinomialNB(class_prior=custom_priors)

# Train the classifier.
# Because class_prior is given, the priors are taken as-is instead of being learned from y_train;
# only the per-class feature likelihoods are estimated from the data.
multinomial_nb_custom.fit(X_train, y_train)

# Print the priors actually used by the model
print("Used priors (custom):", np.exp(multinomial_nb_custom.class_log_prior_))

# Make predictions on the test set
y_pred_custom = multinomial_nb_custom.predict(X_test)

# Evaluate the classifier
accuracy_custom = accuracy_score(y_test, y_pred_custom)
print(f"Accuracy (custom priors): {accuracy_custom:.2f}")
print("Classification Report (custom priors):\n", classification_report(y_test, y_pred_custom))

# --- Compare Predictions ---
print("\nComparison of predictions:")
print("Test labels:", y_test)
print("Predictions (default priors):", y_pred_default)
print("Predictions (custom priors):", y_pred_custom)

In [None]:
# 20. Write a Python program to perform Recursive Feature Elimination (RFE) before training an SVM Classifier and
# compare accuracy

# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Load the Iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Shape of data before RFE: {X_train.shape}")

# --- Train SVM on all features (baseline) ---

print("\n--- Training SVM on all features ---")
svm_all_features = SVC(kernel='linear', random_state=42)
svm_all_features.fit(X_train, y_train)
y_pred_all = svm_all_features.predict(X_test)
accuracy_all = accuracy_score(y_test, y_pred_all)
print(f"Accuracy with all features: {accuracy_all:.2f}")
print("Classification Report (all features):\n", classification_report(y_test, y_pred_all, target_names=iris.target_names))

# --- Perform Recursive Feature Elimination (RFE) ---

# RFE ranks features using the base estimator's coefficients or feature importances,
# so the estimator must expose one of those; a linear-kernel SVC does.
estimator = SVC(kernel='linear')

# Keep the 2 highest-ranked features (example value; adjust as needed)
rfe = RFE(estimator=estimator, n_features_to_select=2)

# Fit RFE on the training data only, so the test set does not influence feature selection
rfe.fit(X_train, y_train)

# Print the ranking of features (1 being the most important).
# Sort on the rank itself: sorting the (name, rank) tuples directly would
# order them alphabetically by feature name instead of by importance.
print("\nFeature ranking by RFE:")
ranked_features = sorted(zip(feature_names, rfe.ranking_), key=lambda pair: pair[1])
for name, rank in ranked_features:
    print(f"  {name}: Rank {rank}")

# Print the selected features (True means selected, False means eliminated)
print("\nSelected features by RFE:")
print(rfe.support_)
selected_feature_indices = np.where(rfe.support_)[0]
print("Selected feature indices:", selected_feature_indices)
print("Selected feature names:", [feature_names[i] for i in selected_feature_indices])

# Reduce both splits to the selected feature columns
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

print(f"\nShape of data after RFE: {X_train_rfe.shape}")

# --- Train SVM on selected features ---

print("\n--- Training SVM on selected features ---")
# Same kernel and seed as the baseline so that only the feature set differs
svm_rfe_features = SVC(kernel='linear', random_state=42)
svm_rfe_features.fit(X_train_rfe, y_train)
y_pred_rfe = svm_rfe_features.predict(X_test_rfe)
accuracy_rfe = accuracy_score(y_test, y_pred_rfe)
print(f"Accuracy with RFE selected features: {accuracy_rfe:.2f}")
print("Classification Report (RFE selected features):\n", classification_report(y_test, y_pred_rfe, target_names=iris.target_names))

# --- Compare Accuracies ---

print("\n--- Comparing Accuracies ---")
print(f"Accuracy with all features: {accuracy_all:.2f}")
print(f"Accuracy with RFE selected features: {accuracy_rfe:.2f}")


In [None]:
# 21. Write a Python program to train an SVM Classifier and evaluate its performance using Precision, Recall, and
# F1-Score instead of accuracy

# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Load the Breast Cancer dataset
breast_cancer = datasets.load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target # 0 for malignant, 1 for benign

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create an SVM classifier
svm_classifier = SVC(kernel='linear', random_state=42) # Using linear kernel as an example

# Train the classifier
svm_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_classifier.predict(X_test)

# --- Evaluate using Precision, Recall, and F1-Score ---

# Calculate Precision
# Precision is the ratio of correctly predicted positive observations to the total predicted positive observations.
# A high precision relates to a low false positive rate.
# For binary classification, you need to specify the positive label if it's not the default (1).
precision = precision_score(y_test, y_pred, pos_label=1) # Assuming 1 is the positive class (benign)

# Calculate Recall (Sensitivity)
# Recall is the ratio of correctly predicted positive observations to the all observations in actual class - yes.
# A high recall relates to a low false negative rate.
recall = recall_score(y_test, y_pred, pos_label=1)

# Calculate F1-Score
# The F1-score is the harmonic mean of Precision and Recall. [1]
# It is a weighted average of the precision and recall, where an F1-score reaches its best value at 1 and worst at 0.
f1 = f1_score(y_test, y_pred, pos_label=1)

# Print the calculated metrics
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# --- Alternatively, use classification_report for all metrics ---

print("\nClassification Report:")
# The classification_report function provides precision, recall, F1-score, and support for each class.
print(classification_report(y_test, y_pred, target_names=breast_cancer.target_names))

In [None]:
# 22.  Write a Python program to train a Naïve Bayes Classifier and evaluate its performance using Log Loss
# (Cross-Entropy Loss)

# Import necessary libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss, accuracy_score # Import log_loss

# Load a four-category subset of the 20 Newsgroups corpus (smaller = faster to run)
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
newsgroups_train = fetch_20newsgroups(
    subset='train', categories=categories, shuffle=True, random_state=42
)
newsgroups_test = fetch_20newsgroups(
    subset='test', categories=categories, shuffle=True, random_state=42
)

# Raw documents and their category labels
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# TF-IDF features: the vocabulary is learned from the training documents only
# and then reused to encode the test documents.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit a Multinomial Naive Bayes classifier on the TF-IDF features
multinomial_nb = MultinomialNB()
multinomial_nb.fit(X_train_tfidf, y_train)

# --- Evaluate using Log Loss ---
# log_loss scores the predicted class probabilities (predict_proba output),
# not the hard class predictions.
y_pred_proba = multinomial_nb.predict_proba(X_test_tfidf)
logloss = log_loss(y_test, y_pred_proba)
print(f"Log Loss (Cross-Entropy) of the Naive Bayes classifier: {logloss:.4f}")

# --- Also print accuracy for comparison ---
# Accuracy, in contrast, is computed from the predicted class labels.
y_pred = multinomial_nb.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Naive Bayes classifier: {accuracy:.2f}")

In [None]:
# 23. Write a Python program to train an SVM Classifier and visualize the Confusion Matrix using seaborn

# Import necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Iris dataset: three classes, convenient for a small confusion matrix
iris = datasets.load_iris()
X, y = iris.data, iris.target
target_names = iris.target_names  # class names used as tick labels

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a linear-kernel SVM and predict the held-out labels
svm_classifier = SVC(kernel='linear', random_state=42)
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

# --- Visualize the Confusion Matrix ---

# Rows are actual classes, columns are predicted classes:
# cm[i, j] counts samples of actual class i that were predicted as class j.
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

# Seaborn heatmap: integer annotations in each cell, 'Blues' colormap,
# class names on both axes.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
    ax=heatmap_ax,
)
heatmap_ax.set_xlabel('Predicted Label')
heatmap_ax.set_ylabel('True Label')
heatmap_ax.set_title('Confusion Matrix for SVM Classifier')
plt.show()

# --- Alternatively, use ConfusionMatrixDisplay from sklearn ---
# Renders the same matrix without needing seaborn.

print("\n--- Using ConfusionMatrixDisplay ---")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)

fig, ax = plt.subplots(figsize=(8, 6))
cmd.plot(ax=ax, cmap='Blues', values_format='d')

ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for SVM Classifier')
plt.show()

In [None]:
# 24.  Write a Python program to train an SVM Regressor (SVR) and evaluate its performance using Mean Absolute
# Error (MAE) instead of MSE

# Import necessary libraries
from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error # Import mean_absolute_error

# Load a housing dataset (using California Housing dataset as an example).
# If you have your own dataset, load it here.
try:
    housing = datasets.fetch_california_housing()
    X = housing.data
    y = housing.target
except (AttributeError, OSError):
    # AttributeError: the loader is missing from the installed scikit-learn.
    # OSError (which includes urllib's URLError/HTTPError): the dataset is downloaded
    # on first use, so an offline machine or an unwritable cache directory ends up here.
    print("California Housing dataset not available. Using make_regression as an example.")
    X, y = make_regression(n_samples=200, n_features=10, noise=10, random_state=42)


# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create an SVR regressor.
# SVR is sensitive to feature scale, and the housing features span very different
# ranges; fitting a linear SVR on the raw values converges extremely slowly.
# The pipeline standardizes the features using statistics learned from the
# training split only, then fits the SVR.
# You can experiment with different kernels and parameters.
svr_regressor = make_pipeline(StandardScaler(), SVR(kernel='linear'))

# Train the regressor (scaler and SVR are fitted together)
svr_regressor.fit(X_train, y_train)

# Make predictions on the test set (the fitted scaler is applied automatically)
y_pred = svr_regressor.predict(X_test)

# --- Evaluate the regressor using Mean Absolute Error (MAE) ---

# MAE is the average of the absolute differences between the actual and predicted values,
# expressed in the same units as the target.
mae = mean_absolute_error(y_test, y_pred)

# Print the Mean Absolute Error
print(f"Mean Absolute Error (MAE) of the SVR regressor: {mae:.2f}")

# --- You can also calculate MSE for comparison if you want ---
# from sklearn.metrics import mean_squared_error
# mse = mean_squared_error(y_test, y_pred)
# print(f"Mean Squared Error (MSE) of the SVR regressor: {mse:.2f}")

In [None]:
# 25. Write a Python program to train a Naïve Bayes classifier and evaluate its performance using the ROC-AUC
# score

# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, auc, roc_auc_score # Import ROC/AUC functions
import matplotlib.pyplot as plt

# Load the Breast Cancer dataset (a binary classification dataset)
breast_cancer = datasets.load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target # 0 for malignant, 1 for benign

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train a Gaussian Naive Bayes classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# --- Evaluate using ROC-AUC Score ---

# Get the probability estimates for the positive class (class 1)
# roc_curve and roc_auc_score require the probability of the positive class
# The output of predict_proba is a 2D array: [[prob_class_0, prob_class_1], ...]
y_pred_proba = gnb.predict_proba(X_test)[:, 1] # Select probabilities of the positive class (index 1)

# Calculate the ROC curve points
# fpr: False Positive Rate
# tpr: True Positive Rate (Recall)
# thresholds: Thresholds used to compute fpr and tpr
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Calculate the Area Under the ROC Curve (AUC)
roc_auc = auc(fpr, tpr)

# Alternatively, use roc_auc_score for a single score calculation
roc_auc_single = roc_auc_score(y_test, y_pred_proba)


# Print the ROC-AUC score
print(f"ROC-AUC Score: {roc_auc_single:.2f}")

# --- Visualize the ROC curve (optional) ---

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random (AUC = 0.50)') # Baseline random classifier
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Recall)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

In [None]:
# 26. Write a Python program to train an SVM Classifier and visualize the Precision-Recall Curve

# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_curve, average_precision_score # Import PR curve functions
import matplotlib.pyplot as plt

# Load the Breast Cancer dataset (a binary classification dataset)
breast_cancer = datasets.load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target # 0 for malignant, 1 for benign

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train an SVM classifier
# Note: For precision_recall_curve, the classifier needs to output probability estimates or decision function values.
# SVC with probability=True can output probabilities, but decision_function is often preferred for PR curves.
svm_classifier = SVC(kernel='linear', random_state=42, probability=True) # Set probability=True to get probabilities
# Or use decision_function:
# svm_classifier = SVC(kernel='linear', random_state=42)

svm_classifier.fit(X_train, y_train)

# --- Visualize the Precision-Recall Curve ---

# Get the probability estimates of the positive class (class 1)
# precision_recall_curve and average_precision_score can use probabilities or decision function values.
# Using probabilities (requires probability=True in SVC):
y_scores = svm_classifier.predict_proba(X_test)[:, 1]
# Or using decision_function (often preferred for PR curves with SVM):
# y_scores = svm_classifier.decision_function(X_test)


# Calculate the Precision-Recall curve points
# precision, recall, thresholds = precision_recall_curve(y_test, y_scores) [1]
precision, recall, _ = precision_recall_curve(y_test, y_scores) # _ is for thresholds

# Calculate the Average Precision (AP) score
# AP is the area under the Precision-Recall curve.
average_precision = average_precision_score(y_test, y_scores)

# Print the Average Precision score
print(f"Average Precision (AP) Score: {average_precision:.2f}")

# --- Visualize the Precision-Recall curve ---

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='darkorange', lw=2, label=f'Precision-Recall curve (AP = {average_precision:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.grid(True)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.show()