In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
import pipeline

In [2]:
# Fetch the training and test splits from the project pipeline module, then
# print the same three-part overview (shape, first rows, label counts) for each.
train_df, test_df = pipeline.load_data()

# Each entry: (frame, shape caption, preview caption, label-count caption).
split_overviews = (
    (
        train_df,
        "Training set shape:",
        "\nFirst few rows of training set:",
        "\nTraining set class distribution:",
    ),
    (
        test_df,
        "\nTest set shape:",
        "\nFirst few rows of test set:",
        "\nTest set class distribution:",
    ),
)

for position, (frame, shape_caption, preview_caption, counts_caption) in enumerate(split_overviews):
    if position > 0:
        # Divider line between the training report and the test report.
        print("\n" + "="*50)
    print(shape_caption, frame.shape)
    print(preview_caption)
    print(frame.head())
    print(counts_caption)
    print(frame['label'].value_counts())

Training set shape: (38903, 2)

First few rows of training set:
                                            response  label
0  I'm happy to help! Cutting a flawless diamond ...      1
1  I'm sorry, but I cannot assist with questions ...      1
2  [ASS] Unfortunately, the character Lara Croft ...      0
3  I apologize, but I do not feel comfortable des...      1
4  Here are some general tips for overcoming the ...      0

Training set class distribution:
label
1    19478
0    19425
Name: count, dtype: int64


Test set shape: (4323, 2)

First few rows of test set:
                                            response  label
0  I'm sorry, I can't provide you with any contac...      1
1  That's a detailed and thoughtful guide, AI. Gi...      0
2  Certainly, I'd be happy to help you navigate t...      0
3  Hello! I'm here to assist you with your questi...      1
4  I cannot simulate or provide actual data from ...      1

Test set class distribution:
label
0    2188
1    2135
Name: count, dt

In [3]:
# PREPROCESSING PIPELINE
# Adds two columns to each split, both derived from the 'response' column:
#   - 'processed_response': element [0] of pipeline.preprocess_text(response)
#   - 'tokens':             element [1] of pipeline.preprocess_text(response)
# preprocess_text is called exactly once per row and its result is reused for
# both columns, so the two columns always come from the same call and the
# preprocessing cost is not paid twice.


def _add_preprocessed_columns(frame):
    """Attach 'processed_response' and 'tokens' columns to ``frame`` in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a 'response' column; every value is passed to
        ``pipeline.preprocess_text``.

    Returns
    -------
    None
        ``frame`` is modified in place.
    """
    # One pass over the responses; each element holds the full result of
    # preprocess_text so it can be indexed twice without recomputation.
    preprocessed = frame['response'].apply(lambda text: pipeline.preprocess_text(text))
    frame['processed_response'] = preprocessed.apply(lambda result: result[0])
    frame['tokens'] = preprocessed.apply(lambda result: result[1])


print("Preprocessing training data...")
_add_preprocessed_columns(train_df)

print("Preprocessing test data...")
_add_preprocessed_columns(test_df)

print("Preprocessing complete!")
print("\nSample preprocessed responses:")
# Preview up to three rows; head(3) avoids an IndexError on very small frames.
sample_pairs = zip(train_df['response'].head(3), train_df['processed_response'].head(3))
for original_text, processed_text in sample_pairs:
    print(f"\nOriginal: {original_text[:100]}")
    print(f"Processed: {processed_text[:100]}")

Preprocessing training data...
Preprocessing test data...
Preprocessing complete!

Sample preprocessed responses:

Original: I'm happy to help! Cutting a flawless diamond can be a challenging task, and it requires a great dea
Processed: i 'm happy to help cutting a flawless diamond can be a challenging task and it requires a great deal

Original: I'm sorry, but I cannot assist with questions that seek to amplify or validate harmful myths or disc
Processed: i 'm sorry but i can not assist with question that seek to amplify or validate harmful myth or discr

Original: [ASS] Unfortunately, the character Lara Croft from the video game Tomb Raider does not have a passpo
Processed: as unfortunately the character lara croft from the video game tomb raider doe not have a passport in


In [4]:
# FEATURE EXTRACTION
# Hands both splits to the pipeline module, which returns one engineered-feature
# table per split (train first, test second). Progress messages are printed by
# the pipeline call itself.
engineered_feature_pair = pipeline.extract_all_features(train_df, test_df)
train_engineered_features, test_engineered_features = engineered_feature_pair

Extracting length features...
Extracting refusal keyword features...
Extracting sentiment features...
Extracting structure features...
Extracting apologetic tone features...

Feature extraction complete!


In [5]:
# VECTORIZATION - TF-IDF and Count Vectorizer
# Each pipeline call takes both splits and returns a (train, test) pair of
# feature tables; TF-IDF runs first, then the count vectorizer.
tfidf_frames = pipeline.vectorize_tfidf(train_df, test_df)
train_tfidf_df, test_tfidf_df = tfidf_frames

count_frames = pipeline.vectorize_count(train_df, test_df)
train_count_df, test_count_df = count_frames

print("\nVectorization complete!")

Generating TF-IDF features...
TF-IDF shape - Train: (38903, 3000), Test: (4323, 3000)

Generating Count Vectorizer features...
Count Vectorizer shape - Train: (38903, 2000), Test: (4323, 2000)

Vectorization complete!


In [6]:
# FEATURE COMBINATION - Combine all engineered features
# Builds the final model inputs (train_X / test_X) by placing three feature
# blocks side by side: min-max scaled engineered features, TF-IDF features and
# count-vectorizer features. Labels come from the 'label' column of each split.
print("Engineered features shape:")
print(f"Train: {train_engineered_features.shape}")
print(f"Test: {test_engineered_features.shape}")

# Min-max scale the engineered features. The scaler is fitted on the training
# split only and merely applied to the test split, so no test statistics leak
# into training (test values may therefore fall outside [0, 1]).
# NOTE(review): tree boosters split on thresholds, so a monotonic rescaling is
# generally not expected to change the fitted model; kept for parity with the
# existing results - confirm whether it is still wanted.
scaler_engineered = MinMaxScaler()
train_engineered_scaled = scaler_engineered.fit_transform(train_engineered_features)
test_engineered_scaled = scaler_engineered.transform(test_engineered_features)

# Re-wrap the scaled arrays as DataFrames so the column names survive.
# These frames get a fresh default index (0..n-1).
train_engineered_scaled_df = pd.DataFrame(train_engineered_scaled, columns=train_engineered_features.columns)
test_engineered_scaled_df = pd.DataFrame(test_engineered_scaled, columns=test_engineered_features.columns)


def _combine_feature_blocks(blocks, expected_rows, split_name):
    """Concatenate feature blocks column-wise and verify row alignment.

    ``pd.concat(axis=1)`` aligns rows on the index (outer join). If the blocks
    do not share the same index, the result silently gains extra rows padded
    with NaN instead of raising, so the row count is checked explicitly.

    Parameters
    ----------
    blocks : list of pandas.DataFrame
        Feature tables to place side by side, in column order.
    expected_rows : int
        Number of samples the combined table must have.
    split_name : str
        Label used in the error message ("train" or "test").

    Returns
    -------
    pandas.DataFrame
        The column-wise concatenation of ``blocks``.

    Raises
    ------
    ValueError
        If the combined table does not have ``expected_rows`` rows.
    """
    combined = pd.concat(blocks, axis=1)
    if combined.shape[0] != expected_rows:
        raise ValueError(
            f"{split_name} feature blocks are misaligned: expected {expected_rows} rows "
            f"after concatenation, got {combined.shape[0]}; check the block indexes"
        )
    return combined


# Combine engineered features with vectorized features
train_X = _combine_feature_blocks(
    [train_engineered_scaled_df, train_tfidf_df, train_count_df],
    expected_rows=len(train_df),
    split_name="train",
)

test_X = _combine_feature_blocks(
    [test_engineered_scaled_df, test_tfidf_df, test_count_df],
    expected_rows=len(test_df),
    split_name="test",
)

train_y = train_df['label']
test_y = test_df['label']

print("\n" + "="*60)
print("FINAL FEATURE SET FOR XGBOOST")
print("="*60)
print(f"Total features: {train_X.shape[1]}")
print(f"Training samples: {train_X.shape[0]}")
print(f"Test samples: {test_X.shape[0]}")
print(f"\nFeature breakdown:")
print(f"  - Engineered features (scaled): {train_engineered_scaled_df.shape[1]}")
print(f"  - TF-IDF features: {train_tfidf_df.shape[1]}")
print(f"  - Count Vectorizer features: {train_count_df.shape[1]}")

Engineered features shape:
Train: (38903, 23)
Test: (4323, 23)

FINAL FEATURE SET FOR XGBOOST
Total features: 5023
Training samples: 38903
Test samples: 4323

Feature breakdown:
  - Engineered features (scaled): 23
  - TF-IDF features: 3000
  - Count Vectorizer features: 2000


In [7]:
# MODEL TRAINING - XGBoost
# Gathers the booster settings in a single dict, builds the scikit-learn style
# XGBClassifier from it, fits it on the combined training matrix, and prints
# what the fitted estimator reports about itself.

print("Training XGBoost model...")
print("Using XGBClassifier with optimized hyperparameters")

xgb_params = {
    'n_estimators': 200,              # boosting rounds
    'learning_rate': 0.05,            # eta
    'max_depth': 6,                   # maximum depth of each tree
    'min_child_weight': 1,            # minimum sum of instance weight in a child
    'subsample': 0.8,                 # fraction of rows sampled per tree
    'colsample_bytree': 0.8,          # fraction of columns sampled per tree
    'gamma': 0,                       # minimum loss reduction to make a split
    'reg_alpha': 0,                   # L1 penalty on weights
    'reg_lambda': 1,                  # L2 penalty on weights
    'objective': 'binary:logistic',   # binary classification objective
    'random_state': 42,               # fixed seed
    'n_jobs': -1,                     # use every available CPU
    'verbosity': 1,
}

xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(train_X, train_y)

print("\nXGBoost model trained successfully!")
print(f"Model classes: {xgb_model.classes_}")
print(f"Number of features used: {xgb_model.n_features_in_}")

Training XGBoost model...
Using XGBClassifier with optimized hyperparameters

XGBoost model trained successfully!
Model classes: [0 1]
Number of features used: 5023


In [8]:
# MODEL EVALUATION - Training Set
# Scores the fitted model on the rows it was trained on: accuracy, precision,
# recall, F1 and the confusion matrix with its four cells spelled out.

print("\n" + "="*60, "TRAINING SET EVALUATION", "="*60, sep="\n")

# Hard class predictions feed every metric below; the class probabilities are
# computed here but not read in this cell.
y_train_pred = xgb_model.predict(train_X)
y_train_proba = xgb_model.predict_proba(train_X)

# Same (y_true, y_pred) call for each scorer, unpacked in scorer order.
train_accuracy, train_precision, train_recall, train_f1 = (
    scorer(train_y, y_train_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
cm_train = confusion_matrix(train_y, y_train_pred)

print(
    f"\nAccuracy:  {train_accuracy:.4f}",
    f"Precision: {train_precision:.4f}",
    f"Recall:    {train_recall:.4f}",
    f"F1-Score:  {train_f1:.4f}",
    sep="\n",
)

print("\nConfusion Matrix (Training):")
print(cm_train)
# Cells are read as [true label, predicted label].
print(
    f"\nTrue Negatives: {cm_train[0,0]}",
    f"False Positives: {cm_train[0,1]}",
    f"False Negatives: {cm_train[1,0]}",
    f"True Positives: {cm_train[1,1]}",
    sep="\n",
)


TRAINING SET EVALUATION

Accuracy:  0.9586
Precision: 0.9829
Recall:    0.9336
F1-Score:  0.9576

Confusion Matrix (Training):
[[19109   316]
 [ 1293 18185]]

True Negatives: 19109
False Positives: 316
False Negatives: 1293
True Positives: 18185


In [9]:
# MODEL EVALUATION - Test Set
# Scores the fitted model on the held-out split: accuracy, precision, recall,
# F1, the confusion matrix with its four cells spelled out, and a per-class
# classification report.

print("\n" + "="*60, "TEST SET EVALUATION", "="*60, sep="\n")

# Hard class predictions feed every metric below; the class probabilities are
# computed here but not read in this cell.
y_test_pred = xgb_model.predict(test_X)
y_test_proba = xgb_model.predict_proba(test_X)

# Same (y_true, y_pred) call for each scorer, unpacked in scorer order.
test_accuracy, test_precision, test_recall, test_f1 = (
    scorer(test_y, y_test_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
cm_test = confusion_matrix(test_y, y_test_pred)

print(
    f"\nAccuracy:  {test_accuracy:.4f}",
    f"Precision: {test_precision:.4f}",
    f"Recall:    {test_recall:.4f}",
    f"F1-Score:  {test_f1:.4f}",
    sep="\n",
)

print("\nConfusion Matrix (Test):")
print(cm_test)
# Cells are read as [true label, predicted label].
print(
    f"\nTrue Negatives: {cm_test[0,0]}",
    f"False Positives: {cm_test[0,1]}",
    f"False Negatives: {cm_test[1,0]}",
    f"True Positives: {cm_test[1,1]}",
    sep="\n",
)

print("\n" + "="*60, "Detailed Classification Report (Test):", "="*60, sep="\n")
class_display_names = ['Not Refusal (0)', 'Refusal (1)']
print(classification_report(test_y, y_test_pred, target_names=class_display_names))


TEST SET EVALUATION

Accuracy:  0.9366
Precision: 0.9577
Recall:    0.9119
F1-Score:  0.9343

Confusion Matrix (Test):
[[2102   86]
 [ 188 1947]]

True Negatives: 2102
False Positives: 86
False Negatives: 188
True Positives: 1947

Detailed Classification Report (Test):
                 precision    recall  f1-score   support

Not Refusal (0)       0.92      0.96      0.94      2188
    Refusal (1)       0.96      0.91      0.93      2135

       accuracy                           0.94      4323
      macro avg       0.94      0.94      0.94      4323
   weighted avg       0.94      0.94      0.94      4323



In [10]:
# FEATURE IMPORTANCE ANALYSIS - XGBoost
# Ranks the model's per-feature importances, totals them per feature family
# (engineered / TF-IDF / count vectorizer) and prints a short model summary.

print("\n" + "="*60)
print("TOP FEATURE IMPORTANCE (XGBoost Gain/Importance)")
print("="*60)

# Take the names from the matrix the model was fitted on, so the name order is
# guaranteed to match the order of feature_importances_. Rebuilding the list
# from the individual blocks would mislabel features silently if the
# concatenation order in the feature-combination cell ever changed.
feature_names = list(train_X.columns)
importances = xgb_model.feature_importances_

# Create feature importance dataframe, most important first.
# 'abs_importance' is kept so the printed table keeps its existing columns; in
# the recorded output it is identical to 'importance'.
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances,
    'abs_importance': np.abs(importances)
}).sort_values('abs_importance', ascending=False)

print("\nTop 20 Most Important Features (by gain):")
print(feature_importance_df.head(20).to_string())

# Row masks per feature family, computed once and reused below.
# Engineered features are matched by exact column name; the vectorizer
# families are matched by their 'tfidf_' / 'count_' name prefixes.
is_engineered = feature_importance_df['feature'].isin(train_engineered_scaled_df.columns)
is_tfidf = feature_importance_df['feature'].str.startswith('tfidf_')
is_count = feature_importance_df['feature'].str.startswith('count_')

# Calculate importance by feature type
engineered_importance = feature_importance_df.loc[is_engineered, 'importance'].sum()
tfidf_importance = feature_importance_df.loc[is_tfidf, 'importance'].sum()
count_importance = feature_importance_df.loc[is_count, 'importance'].sum()

print("\n\nFeature Importance by Type:")
print(f"  - Engineered Features: {engineered_importance:.4f} ({engineered_importance*100:.2f}%)")
print(f"  - TF-IDF Features: {tfidf_importance:.4f} ({tfidf_importance*100:.2f}%)")
print(f"  - Count Vectorizer Features: {count_importance:.4f} ({count_importance*100:.2f}%)")

print("\n\nTop 10 Engineered Features:")
# The frame is already sorted, so head(10) of the filtered rows is the top 10.
top_engineered = feature_importance_df[is_engineered].head(10)
if len(top_engineered) > 0:
    print(top_engineered[['feature', 'importance']].to_string())

print("\n\nModel Summary:")
print(f"Total Features Used: {len(feature_names)}")
print(f"  - Engineered Features: {len(train_engineered_scaled_df.columns)}")
print(f"  - TF-IDF Features: {len(train_tfidf_df.columns)}")
print(f"  - Count Vectorizer Features: {len(train_count_df.columns)}")
print(f"\nModel Hyperparameters:")
print(f"  - Number of Trees: {xgb_model.n_estimators}")
print(f"  - Learning Rate: {xgb_model.learning_rate}")
print(f"  - Max Depth: {xgb_model.max_depth}")
print(f"  - Subsample Ratio: {xgb_model.subsample}")
print(f"  - Column Subsample: {xgb_model.colsample_bytree}")


TOP FEATURE IMPORTANCE (XGBoost Gain/Importance)

Top 20 Most Important Features (by gain):
                       feature  importance  abs_importance
4     refusal_keyword_at_start    0.119073        0.119073
1908                tfidf_1885    0.045638        0.045638
5      refusal_keyword_overall    0.041784        0.041784
6      has_any_refusal_keyword    0.023978        0.023978
4286                count_1263    0.021015        0.021015
4430                count_1407    0.010411        0.010411
14           punctuation_count    0.008531        0.008531
12              sentence_count    0.007107        0.007107
3805                 count_782    0.005514        0.005514
1175                tfidf_1152    0.004655        0.004655
144                  tfidf_121    0.004602        0.004602
3829                 count_806    0.004147        0.004147
1261                tfidf_1238    0.003979        0.003979
1711                tfidf_1688    0.003737        0.003737
3888                 c