 Script Assignment 1: Spam Classifier

We start by importing the required libraries; the same cell then loads the data, trains three classifiers, and evaluates them.

In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the SpamAssassin dataset from a CSV file in the working directory
spam_df = pd.read_csv('spam_dataset.csv')

# Preview the first rows of the raw data (before any cleaning)
print(spam_df.head())

# Drop every row that contains a missing value. The name is rebound instead
# of using inplace=True, so the step stays chainable and re-running it does
# not depend on a frame that was mutated behind the reader's back.
spam_df = spam_df.dropna()



# Features are the raw e-mail text; the target is the label column
X, y = spam_df['text'], spam_df['target']

# Hold out 20% of the rows for testing; the seed fixes the split across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts. The vocabulary is learned from the training texts only
# and then re-applied to the test texts, so nothing from the test set leaks
# into the fitted vectorizer.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Logistic Regression classifier (a linear model for classification, not
# linear regression). max_iter is set explicitly to 1000 to give the solver
# more room to converge on the sparse count features.
linear_classifier = LogisticRegression(max_iter=1000)
linear_classifier.fit(X_train_vec, y_train)
linear_predictions = linear_classifier.predict(X_test_vec)

# Decision Tree classifier. random_state is pinned (same seed as the
# train/test split) so that re-running the notebook yields the same tree.
tree_classifier = DecisionTreeClassifier(random_state=42)
tree_classifier.fit(X_train_vec, y_train)
tree_predictions = tree_classifier.predict(X_test_vec)

# Random Forest classifier. The forest is built with randomness, so the seed
# is pinned here as well to keep the reported metrics reproducible.
forest_classifier = RandomForestClassifier(random_state=42)
forest_classifier.fit(X_train_vec, y_train)
forest_predictions = forest_classifier.predict(X_test_vec)

# Evaluate the models
def evaluate_model(y_true, y_pred, classifier_name):
    """Print a classification report, confusion matrix and accuracy score.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the classifier.
        classifier_name: Display name used in the printed headings.

    Returns:
        None. All results are written to standard output.
    """
    # Compute every metric up front, then emit them in a fixed order.
    report = classification_report(y_true, y_pred)
    matrix = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)

    print(f"Classification Report for {classifier_name}:")
    print(report)
    print(f"Confusion Matrix for {classifier_name}:")
    print(matrix)
    print(f"Accuracy Score for {classifier_name}: {accuracy}")
    # Blank lines separate this report from the next one.
    print("\n")

# Report held-out performance for every fitted model, in training order.
# NOTE(review): the first label reads "Linear Regression" although the model
# is a LogisticRegression; it is kept verbatim so the code still matches the
# recorded output. Consider renaming it when the notebook is next re-run.
for predictions, classifier_name in (
    (linear_predictions, "Linear Regression"),
    (tree_predictions, "Decision Tree"),
    (forest_predictions, "Random Forest"),
):
    evaluate_model(y_test, predictions, classifier_name)


                                                text  target
0  From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...       0
1  From gort44@excite.com Mon Jun 24 17:54:21 200...       1
2  From fork-admin@xent.com Mon Jul 29 11:39:57 2...       1
3  From dcm123@btamail.net.cn Mon Jun 24 17:49:23...       1
4  From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...       0
Classification Report for Linear Regression:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       779
           1       1.00      0.99      0.99       381

    accuracy                           1.00      1160
   macro avg       1.00      0.99      1.00      1160
weighted avg       1.00      1.00      1.00      1160

Confusion Matrix for Linear Regression:
[[778   1]
 [  4 377]]
Accuracy Score for Linear Regression: 0.9956896551724138


Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99 

 Script Assignment 2: Handwritten Digits Classifier

In [1]:
#Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Load scikit-learn's bundled handwritten-digits dataset
# (this is the small built-in digits set, not the MNIST dataset)
digits = datasets.load_digits()

# Hold out 20% of the samples for testing; the seed fixes the split across runs
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.2,
    random_state=42,
)

# Fit three classifiers on the same training split, each with its default
# hyper-parameters. fit() returns the estimator itself, so construction and
# training are chained into one statement per model.

# K-nearest Neighbors (KNN)
knn_classifier = KNeighborsClassifier().fit(X_train, y_train)

# Support Vector Machine (SVM)
svm_classifier = SVC().fit(X_train, y_train)

# Gaussian Naive Bayes
nb_classifier = GaussianNB().fit(X_train, y_train)

# Evaluate the models
# Predict the held-out test samples with each fitted classifier
knn_predictions = knn_classifier.predict(X_test)
svm_predictions = svm_classifier.predict(X_test)
nb_predictions = nb_classifier.predict(X_test)

# Print the test-set accuracy of every model, in the order they were trained
for heading, predictions in (
    ("KNN Accuracy:", knn_predictions),
    ("SVM Accuracy:", svm_predictions),
    ("Naive Bayes Accuracy:", nb_predictions),
):
    print(heading, metrics.accuracy_score(y_test, predictions))



KNN Accuracy: 0.9861111111111112
SVM Accuracy: 0.9861111111111112
Naive Bayes Accuracy: 0.8472222222222222
