In [2]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

import pipeline

In [3]:
# Load the training and testing datasets
train_df, test_df = pipeline.load_data()

# Display basic information about the datasets
print("Training set shape:", train_df.shape)
print("\nFirst few rows of training set:")
print(train_df.head())
print("\nTraining set class distribution:")
print(train_df['label'].value_counts())

print("\n" + "="*50)
print("\nTest set shape:", test_df.shape)
print("\nFirst few rows of test set:")
print(test_df.head())
print("\nTest set class distribution:")
print(test_df['label'].value_counts())

Training set shape: (38903, 2)

First few rows of training set:
                                            response  label
0  I'm happy to help! Cutting a flawless diamond ...      1
1  I'm sorry, but I cannot assist with questions ...      1
2  [ASS] Unfortunately, the character Lara Croft ...      0
3  I apologize, but I do not feel comfortable des...      1
4  Here are some general tips for overcoming the ...      0

Training set class distribution:
label
1    19478
0    19425
Name: count, dtype: int64


Test set shape: (4323, 2)

First few rows of test set:
                                            response  label
0  I'm sorry, I can't provide you with any contac...      1
1  That's a detailed and thoughtful guide, AI. Gi...      0
2  Certainly, I'd be happy to help you navigate t...      0
3  Hello! I'm here to assist you with your questi...      1
4  I cannot simulate or provide actual data from ...      1

Test set class distribution:
label
0    2188
1    2135
Name: count, dt

In [4]:
# PREPROCESSING PIPELINE
print("Preprocessing training data...")
train_df['processed_response'] = train_df['response'].apply(lambda x: pipeline.preprocess_text(x)[0])
train_df['tokens'] = train_df['response'].apply(lambda x: pipeline.preprocess_text(x)[1])

print("Preprocessing test data...")
test_df['processed_response'] = test_df['response'].apply(lambda x: pipeline.preprocess_text(x)[0])
test_df['tokens'] = test_df['response'].apply(lambda x: pipeline.preprocess_text(x)[1])

print("Preprocessing complete!")
print("\nSample preprocessed responses:")
for i in range(3):
    print(f"\nOriginal: {train_df['response'].iloc[i][:100]}")
    print(f"Processed: {train_df['processed_response'].iloc[i][:100]}")

Preprocessing training data...
Preprocessing test data...
Preprocessing complete!

Sample preprocessed responses:

Original: I'm happy to help! Cutting a flawless diamond can be a challenging task, and it requires a great dea
Processed: i am happy to help cutting a flawless diamond can be a challenging task and it requires a great deal

Original: I'm sorry, but I cannot assist with questions that seek to amplify or validate harmful myths or disc
Processed: i am sorry but i can not assist with question that seek to amplify or validate harmful myth or discr

Original: [ASS] Unfortunately, the character Lara Croft from the video game Tomb Raider does not have a passpo
Processed: as unfortunately the character lara croft from the video game tomb raider doe not have a passport in


In [5]:
# FEATURE EXTRACTION 
train_engineered_features, test_engineered_features = pipeline.extract_all_features(train_df, test_df)

Extracting length features...
Extracting refusal keyword features...
Extracting sentiment features...
Extracting structure features...
Extracting apologetic tone features...
Extracting first-person pronoun features...
Extracting hedging language features...
Extracting opening pattern features...
Extracting negation features...

Feature extraction complete!


In [6]:
# VECTORIZATION - TF-IDF
train_tfidf_df, test_tfidf_df = pipeline.vectorize_tfidf(train_df, test_df)

print("Vectorization complete!")

Generating TF-IDF features...
TF-IDF shape - Train: (38903, 3000), Test: (4323, 3000)
Vectorization complete!


In [7]:
# PRETRAINED EMBEDDINGS - Use Sentence-BERT (Universal Sentence Encoder alternative)

print("Loading pretrained embedding model...")
print("Using 'all-MiniLM-L6-v2' - a lightweight, fast sentence transformer model")

# Load pretrained sentence transformer model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

print("Generating embeddings for training data...")
# Generate embeddings for original responses (not processed text)
train_embeddings = embedding_model.encode(train_df['response'].tolist(), show_progress_bar=True)
print(f"Training embeddings shape: {train_embeddings.shape}")

print("\nGenerating embeddings for test data...")
test_embeddings = embedding_model.encode(test_df['response'].tolist(), show_progress_bar=True)
print(f"Test embeddings shape: {test_embeddings.shape}")

# Create dataframes for embeddings
train_embeddings_df = pd.DataFrame(train_embeddings, columns=[f'embedding_{i}' for i in range(train_embeddings.shape[1])])
test_embeddings_df = pd.DataFrame(test_embeddings, columns=[f'embedding_{i}' for i in range(test_embeddings.shape[1])])

print(f"\nEmbedding features created:")
print(f"  - Train shape: {train_embeddings_df.shape}")
print(f"  - Test shape: {test_embeddings_df.shape}")
print("\nEmbeddings generated successfully!")

Loading pretrained embedding model...
Using 'all-MiniLM-L6-v2' - a lightweight, fast sentence transformer model


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 787.86it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Generating embeddings for training data...


Batches: 100%|██████████| 1216/1216 [13:46<00:00,  1.47it/s]


Training embeddings shape: (38903, 384)

Generating embeddings for test data...


Batches: 100%|██████████| 136/136 [01:27<00:00,  1.55it/s]


Test embeddings shape: (4323, 384)

Embedding features created:
  - Train shape: (38903, 384)
  - Test shape: (4323, 384)

Embeddings generated successfully!


In [8]:
# FEATURE COMBINATION - Combine all features (engineered + TF-IDF + embeddings)
print("Engineered features shape:")
print(f"Train: {train_engineered_features.shape}")
print(f"Test: {test_engineered_features.shape}")

# Scale engineered features to [0, 1] range for better SVM performance
scaler_engineered = MinMaxScaler()
train_engineered_scaled = scaler_engineered.fit_transform(train_engineered_features)
test_engineered_scaled = scaler_engineered.transform(test_engineered_features)

train_engineered_scaled_df = pd.DataFrame(train_engineered_scaled, columns=train_engineered_features.columns)
test_engineered_scaled_df = pd.DataFrame(test_engineered_scaled, columns=test_engineered_features.columns)

# Combine all features: engineered + TF-IDF + embeddings
train_X = pd.concat([
    train_engineered_scaled_df,
    train_tfidf_df,
    train_embeddings_df
], axis=1)

test_X = pd.concat([
    test_engineered_scaled_df,
    test_tfidf_df,
    test_embeddings_df
], axis=1)

train_y = train_df['label']
test_y = test_df['label']

print("\n" + "="*60)
print("FINAL FEATURE SET FOR LINEAR SVM")
print("="*60)
print(f"Total features: {train_X.shape[1]}")
print(f"Training samples: {train_X.shape[0]}")
print(f"Test samples: {test_X.shape[0]}")
print(f"\nFeature breakdown:")
print(f"  - Engineered features (scaled): {train_engineered_scaled_df.shape[1]}")
print(f"  - TF-IDF features: {train_tfidf_df.shape[1]}")
print(f"  - Pretrained embeddings: {train_embeddings_df.shape[1]}")

# Scale all features for SVM (standardization is important for SVM)
print("\nScaling features for SVM...")
scaler_svm = StandardScaler()
train_X_scaled = scaler_svm.fit_transform(train_X)
test_X_scaled = scaler_svm.transform(test_X)

print("Feature scaling complete!")
print(f"Scaled training shape: {train_X_scaled.shape}")
print(f"Scaled test shape: {test_X_scaled.shape}")

Engineered features shape:
Train: (38903, 30)
Test: (4323, 30)

FINAL FEATURE SET FOR LINEAR SVM
Total features: 3414
Training samples: 38903
Test samples: 4323

Feature breakdown:
  - Engineered features (scaled): 30
  - TF-IDF features: 3000
  - Pretrained embeddings: 384

Scaling features for SVM...
Feature scaling complete!
Scaled training shape: (38903, 3414)
Scaled test shape: (4323, 3414)


In [14]:
# MODEL TRAINING - Linear SVM with GridSearchCV

from sklearn.model_selection import GridSearchCV

print("="*60)
print("HYPERPARAMETER TUNING - GridSearchCV")
print("="*60)

# Define the parameter grid
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'loss': ['hinge', 'squared_hinge'],
    'penalty': ['l2'],
    'max_iter': [2000]
}

print("\nParameter Grid:")
for param, values in param_grid.items():
    print(f"  - {param}: {values}")

# Create base model
base_svm = LinearSVC(dual=False, random_state=42, verbose=0)

# Create GridSearchCV with memory-efficient settings
# n_jobs=1 to avoid memory issues with parallel processing
# return_train_score=False to reduce memory usage
grid_search = GridSearchCV(
    estimator=base_svm,
    param_grid=param_grid,
    cv=3,                      # Reduced from 5 to 3 folds to save memory
    scoring='f1',
    n_jobs=1,                  # Sequential execution to avoid memory duplication
    verbose=2,
    return_train_score=False   # Don't compute training scores to save memory
)

print("\nStarting GridSearchCV with 3-fold cross-validation...")
print("Scoring metric: F1-Score")
print("Note: Running sequentially (n_jobs=1) to avoid memory issues")
print("-"*60)

grid_search.fit(train_X_scaled, train_y)

print("\n" + "="*60)
print("GRID SEARCH RESULTS - ALL PARAMETER COMBINATIONS")
print("="*60)

# Get results dataframe
results_df = pd.DataFrame(grid_search.cv_results_)

# Sort by rank
results_df = results_df.sort_values('rank_test_score')

# Print results for each parameter combination
print(f"\n{'Rank':<6}{'C':<10}{'Loss':<16}{'Mean CV F1':<14}{'Std CV F1':<12}{'Fit Time (s)':<12}")
print("-"*70)

for idx, row in results_df.iterrows():
    rank = row['rank_test_score']
    C = row['param_C']
    loss = row['param_loss']
    mean_test = row['mean_test_score']
    std_test = row['std_test_score']
    fit_time = row['mean_fit_time']
    print(f"{rank:<6}{C:<10}{loss:<16}{mean_test:<14.4f}{std_test:<12.4f}{fit_time:<12.2f}")

print("\n" + "="*60)
print("BEST PARAMETERS")
print("="*60)
print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation F1-Score: {grid_search.best_score_:.4f}")

# Use the best model
svm_model = grid_search.best_estimator_

print("\n" + "="*60)
print("BEST MODEL DETAILS")
print("="*60)
print(f"Model: LinearSVC")
print(f"  - C (Regularization): {svm_model.C}")
print(f"  - Loss Function: {svm_model.loss}")
print(f"  - Penalty: {svm_model.penalty}")
print(f"  - Max Iterations: {svm_model.max_iter}")
print(f"Model classes: {svm_model.classes_}")
print(f"Number of features used: {svm_model.n_features_in_}")

HYPERPARAMETER TUNING - GridSearchCV

Parameter Grid:
  - C: [0.01, 0.1, 1.0, 10.0]
  - loss: ['hinge', 'squared_hinge']
  - penalty: ['l2']
  - max_iter: [2000]

Starting GridSearchCV with 3-fold cross-validation...
Scoring metric: F1-Score
Note: Running sequentially (n_jobs=1) to avoid memory issues
------------------------------------------------------------
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] END ......C=0.01, loss=hinge, max_iter=2000, penalty=l2; total time=   1.1s
[CV] END ......C=0.01, loss=hinge, max_iter=2000, penalty=l2; total time=   0.9s
[CV] END ......C=0.01, loss=hinge, max_iter=2000, penalty=l2; total time=   0.8s
[CV] END C=0.01, loss=squared_hinge, max_iter=2000, penalty=l2; total time=  11.3s
[CV] END C=0.01, loss=squared_hinge, max_iter=2000, penalty=l2; total time=  14.7s
[CV] END C=0.01, loss=squared_hinge, max_iter=2000, penalty=l2; total time=  10.8s
[CV] END .......C=0.1, loss=hinge, max_iter=2000, penalty=l2; total time=   0.8s
[CV

In [15]:
# FEATURE IMPORTANCE ANALYSIS - SVM Coefficients

print("\n" + "="*60)
print("TOP FEATURE IMPORTANCE (SVM Coefficients)")
print("="*60)

# Get feature coefficients from SVM
feature_names = list(train_engineered_scaled_df.columns) + list(train_tfidf_df.columns) + list(train_embeddings_df.columns)
coefficients = svm_model.coef_[0]

# Create feature importance dataframe
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefficients,
    'abs_coefficient': np.abs(coefficients)
}).sort_values('abs_coefficient', ascending=False)

print("\nTop 20 Most Important Features (by absolute coefficient):")
print(feature_importance_df.head(20).to_string())

print("\n\nTop 10 Engineered Features Contributing to REFUSAL prediction:")
engineered_features_only = feature_importance_df[feature_importance_df['feature'].str.contains('^(response_|refusal_|sentiment_|is_|sentence_|punctuation_|question_|exclamation_|uppercase_|has_|apology_|formal_)', regex=True)]
top_engineered_refusal = engineered_features_only[engineered_features_only['coefficient'] > 0].head(10)
if len(top_engineered_refusal) > 0:
    print(top_engineered_refusal.to_string())
else:
    print("No positive engineered features in top contributors")

print("\n\nTop 10 TF-IDF Features Contributing to REFUSAL prediction:")
tfidf_features_only = feature_importance_df[feature_importance_df['feature'].str.startswith('tfidf_')]
top_tfidf_refusal = tfidf_features_only[tfidf_features_only['coefficient'] > 0].head(10)
if len(top_tfidf_refusal) > 0:
    print(top_tfidf_refusal[['feature', 'coefficient']].to_string())

print("\n\nModel Summary:")
print(f"Total Features Used: {len(feature_names)}")
print(f"  - Engineered Features: {len(train_engineered_scaled_df.columns)}")
print(f"  - TF-IDF Features: {len(train_tfidf_df.columns)}")
print(f"  - Embedding Features: {len(train_embeddings_df.columns)}")
print(f"\nModel Hyperparameters:")
print(f"  - Regularization (C): {svm_model.C}")
print(f"  - Loss Function: squared_hinge")
print(f"  - Penalty: l2")


TOP FEATURE IMPORTANCE (SVM Coefficients)

Top 20 Most Important Features (by absolute coefficient):
                       feature  coefficient  abs_coefficient
5      refusal_keyword_overall     0.359313         0.359313
4     refusal_keyword_at_start     0.286611         0.286611
20           formal_word_count     0.276275         0.276275
2399                tfidf_2369     0.248707         0.248707
1360                tfidf_1330     0.239060         0.239060
1713                tfidf_1683     0.228159         0.228159
1                   word_count     0.198028         0.198028
1357                tfidf_1327     0.189485         0.189485
24          first_person_ratio     0.186467         0.186467
1711                tfidf_1681     0.183401         0.183401
2316                tfidf_2286     0.172440         0.172440
3304             embedding_274     0.167854         0.167854
3175             embedding_145     0.166798         0.166798
3099              embedding_69    -0.164026 

In [16]:
# MODEL EVALUATION - Training Set

print("\n" + "="*60)
print("TRAINING SET EVALUATION")
print("="*60)

y_train_pred = svm_model.predict(train_X_scaled)
y_train_decision = svm_model.decision_function(train_X_scaled)

train_accuracy = accuracy_score(train_y, y_train_pred)
train_precision = precision_score(train_y, y_train_pred)
train_recall = recall_score(train_y, y_train_pred)
train_f1 = f1_score(train_y, y_train_pred)

print(f"\nAccuracy:  {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall:    {train_recall:.4f}")
print(f"F1-Score:  {train_f1:.4f}")

print("\nConfusion Matrix (Training):")
cm_train = confusion_matrix(train_y, y_train_pred)
print(cm_train)
print(f"\nTrue Negatives: {cm_train[0,0]}")
print(f"False Positives: {cm_train[0,1]}")
print(f"False Negatives: {cm_train[1,0]}")
print(f"True Positives: {cm_train[1,1]}")


TRAINING SET EVALUATION

Accuracy:  0.9695
Precision: 0.9777
Recall:    0.9609
F1-Score:  0.9693

Confusion Matrix (Training):
[[18999   426]
 [  761 18717]]

True Negatives: 18999
False Positives: 426
False Negatives: 761
True Positives: 18717


In [17]:
# MODEL EVALUATION - Test Set

print("\n" + "="*60)
print("TEST SET EVALUATION")
print("="*60)

y_test_pred = svm_model.predict(test_X_scaled)
y_test_decision = svm_model.decision_function(test_X_scaled)

test_accuracy = accuracy_score(test_y, y_test_pred)
test_precision = precision_score(test_y, y_test_pred)
test_recall = recall_score(test_y, y_test_pred)
test_f1 = f1_score(test_y, y_test_pred)

print(f"\nAccuracy:  {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall:    {test_recall:.4f}")
print(f"F1-Score:  {test_f1:.4f}")

print("\nConfusion Matrix (Test):")
cm_test = confusion_matrix(test_y, y_test_pred)
print(cm_test)
print(f"\nTrue Negatives: {cm_test[0,0]}")
print(f"False Positives: {cm_test[0,1]}")
print(f"False Negatives: {cm_test[1,0]}")
print(f"True Positives: {cm_test[1,1]}")

print("\n" + "="*60)
print("Detailed Classification Report (Test):")
print("="*60)
print(classification_report(test_y, y_test_pred, target_names=['Not Refusal (0)', 'Refusal (1)']))


TEST SET EVALUATION

Accuracy:  0.9292
Precision: 0.9336
Recall:    0.9222
F1-Score:  0.9279

Confusion Matrix (Test):
[[2048  140]
 [ 166 1969]]

True Negatives: 2048
False Positives: 140
False Negatives: 166
True Positives: 1969

Detailed Classification Report (Test):
                 precision    recall  f1-score   support

Not Refusal (0)       0.93      0.94      0.93      2188
    Refusal (1)       0.93      0.92      0.93      2135

       accuracy                           0.93      4323
      macro avg       0.93      0.93      0.93      4323
   weighted avg       0.93      0.93      0.93      4323

