In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import spacy
import re
import joblib

# Load SpaCy model
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Step 1: Load the dataset
def load_data():
    url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
    df = pd.read_csv(url)
    return df

In [24]:
# Step 2: Preprocess the URLs using SpaCy
def preprocess_url(url):
    # Convert to lowercase and remove protocol
    url = url.lower().replace('http://', '').replace('https://', '')
    
    # Split by punctuation
    tokens = re.split(r'[./?=&-]', url)
    
    # Join tokens for SpaCy processing
    text = ' '.join([token for token in tokens if token])
    
    # Process with SpaCy
    doc = nlp(text)
    
    # Lemmatize and remove stopwords
    processed_tokens = [token.lemma_ for token in doc if not token.is_stop and token.text.strip()]
    
    return ' '.join(processed_tokens)

def prepare_data(df):
    # Apply preprocessing
    df['processed_url'] = df['url'].apply(preprocess_url)
    
    # Convert labels to binary (True=1, False=0)
    df['is_spam'] = df['is_spam'].astype(int)
    
    # Split features and target
    X = df['processed_url']
    y = df['is_spam']
    
    # Convert text to numerical features using TF-IDF
    vectorizer = TfidfVectorizer(max_features=5000)
    X_vectorized = vectorizer.fit_transform(X)
    
    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_vectorized, y, test_size=0.2, random_state=42
    )
    
    return X_train, X_test, y_train, y_test, vectorizer

In [25]:
# Step 3: Build and train initial SVM
def train_initial_svm(X_train, X_test, y_train, y_test):
    # Initialize SVM with default parameters
    svm = SVC(random_state=42)
    
    # Train the model
    svm.fit(X_train, y_train)
    
    # Make predictions
    y_pred = svm.predict(X_test)
    
    # Print initial results
    print("Initial SVM Results:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    return svm

In [26]:
# Step 4: Optimize SVM with GridSearchCV
def optimize_svm(X_train, X_test, y_train, y_test):
    # Define parameter grid
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto', 0.1, 0.01]
    }
    
    # Initialize SVM
    svm = SVC(random_state=42)
    
    # Perform grid search
    grid_search = GridSearchCV(
        estimator=svm,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    
    # Fit grid search
    grid_search.fit(X_train, y_train)
    
    # Get best model
    best_svm = grid_search.best_estimator_
    
    # Make predictions with optimized model
    y_pred = best_svm.predict(X_test)
    
    # Print optimized results
    print("\nOptimized SVM Results:")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Cross-validation Score: {grid_search.best_score_:.4f}")
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    return best_svm

In [27]:
# Step 5: Save the model and vectorizer
def save_model(model, vectorizer, model_path='svm_spam_detector_spacy.pkl', 
              vectorizer_path='tfidf_vectorizer_spacy.pkl'):
    joblib.dump(model, model_path)
    joblib.dump(vectorizer, vectorizer_path)
    print(f"\nModel saved to {model_path}")
    print(f"Vectorizer saved to {vectorizer_path}")

# Main execution
def main():
    # Load data
    df = load_data()
    
    # Prepare data
    X_train, X_test, y_train, y_test, vectorizer = prepare_data(df)
    
    # Train initial SVM
    initial_svm = train_initial_svm(X_train, X_test, y_train, y_test)
    
    # Optimize SVM
    optimized_svm = optimize_svm(X_train, X_test, y_train, y_test)
    
    # Save model and vectorizer
    save_model(optimized_svm, vectorizer)

if __name__ == "__main__":
    main()

Initial SVM Results:
Accuracy: 0.9550

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       455
           1       0.99      0.82      0.90       145

    accuracy                           0.95       600
   macro avg       0.97      0.91      0.93       600
weighted avg       0.96      0.95      0.95       600


Optimized SVM Results:
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Cross-validation Score: 0.9654
Test Accuracy: 0.9667

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       455
           1       0.96      0.90      0.93       145

    accuracy                           0.97       600
   macro avg       0.96      0.95      0.95       600
weighted avg       0.97      0.97      0.97       600


Model saved to svm_spam_detector_spacy.pkl
Vectorizer saved to tfidf_vectorizer_spacy.pkl
