# Import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from scipy.sparse import issparse
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides any warnings raised while the
# estimators are fitted — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Read data and prepare the dataset

In [2]:
# Load the tab-separated corpus; the file has no header row, so name the columns here
data_path = '../../../../data/classification/SMSSpamCollection'
data = pd.read_csv(data_path, sep='\t', header=None, names=['label', 'text'])

# Encode the string labels numerically: 'ham' -> 0, 'spam' -> 1
data = data.assign(label=data['label'].map({'ham': 0, 'spam': 1}))

# Split the dataset

In [3]:
# Hold out 20% of the messages for testing (80/20 split); the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# Classification models

In [4]:
# List of classification models as (display name, estimator) pairs.
# Estimators that expose a random_state parameter are seeded with the same
# value used for the train/test split so that repeated runs give the same scores.
models = [
    ("Nearest Neighbors", KNeighborsClassifier()),
    ("Linear SVM", LinearSVC(random_state=42)),
    ("RBF SVM", SVC(gamma='scale')),
    ("Gaussian Process", GaussianProcessClassifier(random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Neural Net", MLPClassifier(max_iter=1000, random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("Naive Bayes", MultinomialNB()),
    ("QDA", QuadraticDiscriminantAnalysis())
]

# Train and evaluate each vectorizer and model

**Function to train and evaluate models**

In [5]:
def train_and_evaluate_models(vectorizer, models, X_train, X_test, y_train, y_test):
    """
    Fit every model on the vectorized training data and score it on the test data.

    The vectorizer is fitted once on the training data and reused for all models,
    since its output does not depend on which model is being trained.

    Parameters:
    - vectorizer: Object providing fit_transform(X) and transform(X); it is fitted in place
    - models: Iterable of (name, model) pairs; each model is fitted in place with
      fit(X, y) and evaluated with score(X, y)
    - X_train: The training dataset (passed to vectorizer.fit_transform)
    - X_test: The testing dataset (passed to vectorizer.transform)
    - y_train: The labels corresponding to the training dataset
    - y_test: The labels corresponding to the testing dataset

    Returns:
    - DataFrame with the columns 'Model' and 'Score', one row per model in input
      order; 'Score' is whatever the model's score method returns. The columns
      are present even when no models are given.
    """
    # Vectorize once up front: the result is identical for every model, so doing
    # this inside the loop would only repeat the same work
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    # Convert sparse matrices to dense arrays so every model receives the same
    # dense input (some of the compared models may not accept sparse data);
    # each matrix is checked on its own
    if issparse(X_train_vectorized):
        X_train_vectorized = X_train_vectorized.toarray()
    if issparse(X_test_vectorized):
        X_test_vectorized = X_test_vectorized.toarray()

    # Train each model and record its score on the test set
    score_data = []
    for model_name, model in models:
        model.fit(X_train_vectorized, y_train)
        score_data.append({
            'Model': model_name,
            'Score': model.score(X_test_vectorized, y_test)
        })

    # Explicit columns keep the result shape stable for an empty model list
    return pd.DataFrame(score_data, columns=['Model', 'Score'])

## CountVectorizer

In [6]:
# Bag-of-words counts as features: score every model and show the table
result_count_df = train_and_evaluate_models(
    vectorizer=CountVectorizer(),
    models=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
print(result_count_df)

               Model     Score
0  Nearest Neighbors  0.925561
1         Linear SVM  0.990135
2            RBF SVM  0.986547
3   Gaussian Process  0.947085
4      Decision Tree  0.973991
5      Random Forest  0.978475
6         Neural Net  0.990135
7           AdaBoost  0.971300
8        Naive Bayes  0.991928
9                QDA  0.781166


## TfidfVectorizer

In [7]:
# TF-IDF weights as features: score every model and show the table
result_tfidf_df = train_and_evaluate_models(
    vectorizer=TfidfVectorizer(),
    models=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
print(result_tfidf_df)

               Model     Score
0  Nearest Neighbors  0.919283
1         Linear SVM  0.991928
2            RBF SVM  0.989238
3   Gaussian Process  0.947982
4      Decision Tree  0.970404
5      Random Forest  0.982960
6         Neural Net  0.991031
7           AdaBoost  0.978475
8        Naive Bayes  0.966816
9                QDA  0.597309


# Result observations

- **Most models achieved high and stable performance (except QDA) using both CountVectorizer (Naive Bayes highest) and TF-IDF (Linear SVM highest)**
- **QDA showed clear dependency on CountVectorizer and performed less effectively with TF-IDF**
- **Choosing the Linear SVM model**:
    - **High performance**: Provides accurate spam detection while minimizing errors.
    - **Stability**: Maintains good effectiveness with both text representation methods, less dependent on data processing.
    - **Computational efficiency**: Simple and easy to implement, saving computational resources.
    - **Interpretability**: Easy to understand how it works, supporting adjustments and improvements to the model.