In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings that carry real
# diagnostic value (e.g. scikit-learn's ConvergenceWarning during model
# fitting) - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
import pipeline

In [2]:
# Fetch the train/test splits from the project pipeline helper
train_df, test_df = pipeline.load_data()

# Training split overview: shape, a preview of the rows, and the label balance.
# The grouped print emits one item per line, followed by the divider that
# separates the two summaries.
print("Training set shape:", train_df.shape)
print(
    "\nFirst few rows of training set:",
    train_df.head(),
    "\nTraining set class distribution:",
    train_df['label'].value_counts(),
    "\n" + "="*50,
    sep="\n",
)

# Same overview for the held-out test split
print("\nTest set shape:", test_df.shape)
print(
    "\nFirst few rows of test set:",
    test_df.head(),
    "\nTest set class distribution:",
    test_df['label'].value_counts(),
    sep="\n",
)

Training set shape: (38903, 2)

First few rows of training set:
                                            response  label
0  I'm happy to help! Cutting a flawless diamond ...      1
1  I'm sorry, but I cannot assist with questions ...      1
2  [ASS] Unfortunately, the character Lara Croft ...      0
3  I apologize, but I do not feel comfortable des...      1
4  Here are some general tips for overcoming the ...      0

Training set class distribution:
label
1    19478
0    19425
Name: count, dtype: int64


Test set shape: (4323, 2)

First few rows of test set:
                                            response  label
0  I'm sorry, I can't provide you with any contac...      1
1  That's a detailed and thoughtful guide, AI. Gi...      0
2  Certainly, I'd be happy to help you navigate t...      0
3  Hello! I'm here to assist you with your questi...      1
4  I cannot simulate or provide actual data from ...      1

Test set class distribution:
label
0    2188
1    2135
Name: count, dtype: int64

In [None]:
# PREPROCESSING PIPELINE
def _add_preprocessed_columns(df):
    """Add 'processed_response' and 'tokens' columns to ``df`` in place.

    ``pipeline.preprocess_text`` is called exactly once per response; element
    [0] of its result becomes 'processed_response' and element [1] becomes
    'tokens'. Calling it separately for each column would double the
    preprocessing work for every row.

    NOTE(review): assumes ``preprocess_text`` is deterministic and free of side
    effects, so one call per row gives the same columns as two calls - confirm
    against the pipeline module.
    """
    preprocessed = df['response'].apply(pipeline.preprocess_text)
    df['processed_response'] = preprocessed.apply(lambda pair: pair[0])
    df['tokens'] = preprocessed.apply(lambda pair: pair[1])


print("Preprocessing training data...")
_add_preprocessed_columns(train_df)

print("Preprocessing test data...")
_add_preprocessed_columns(test_df)

print("Preprocessing complete!")
print("\nSample preprocessed responses:")
# Show up to three examples; the bound guards against frames with fewer rows
for i in range(min(3, len(train_df))):
    print(f"\nOriginal: {train_df['response'].iloc[i][:100]}")
    print(f"Processed: {train_df['processed_response'].iloc[i][:100]}")

Preprocessing training data...
Preprocessing test data...
Preprocessing complete!

Sample preprocessed responses:

Original: I'm happy to help! Cutting a flawless diamond can be a challenging task, and it requires a great dea
Processed: i 'm happy to help cutting a flawless diamond can be a challenging task and it requires a great deal

Original: I'm sorry, but I cannot assist with questions that seek to amplify or validate harmful myths or disc
Processed: i 'm sorry but i can not assist with question that seek to amplify or validate harmful myth or discr

Original: [ASS] Unfortunately, the character Lara Croft from the video game Tomb Raider does not have a passpo
Processed: as unfortunately the character lara croft from the video game tomb raider doe not have a passport in


Unnamed: 0,response,label,processed_response,tokens
0,I'm happy to help! Cutting a flawless diamond ...,1,i 'm happy to help cutting a flawless diamond ...,"[i, 'm, happy, to, help, cutting, a, flawless,..."
1,"I'm sorry, but I cannot assist with questions ...",1,i 'm sorry but i can not assist with question ...,"[i, 'm, sorry, but, i, can, not, assist, with,..."
2,"[ASS] Unfortunately, the character Lara Croft ...",0,as unfortunately the character lara croft from...,"[as, unfortunately, the, character, lara, crof..."
3,"I apologize, but I do not feel comfortable des...",1,i apologize but i do not feel comfortable desc...,"[i, apologize, but, i, do, not, feel, comforta..."
4,Here are some general tips for overcoming the ...,0,here are some general tip for overcoming the f...,"[here, are, some, general, tip, for, overcomin..."


In [4]:
# FEATURE EXTRACTION
# The pipeline helper returns one engineered-feature frame per split,
# train first and test second.
engineered_feature_frames = pipeline.extract_all_features(train_df, test_df)
train_engineered_features, test_engineered_features = engineered_feature_frames

Extracting length features...
Extracting refusal keyword features...
Extracting sentiment features...
Extracting structure features...
Extracting apologetic tone features...

Feature extraction complete!


In [5]:
# VECTORIZATION - TF-IDF and Count Vectorizer
# Each helper returns a (train, test) pair of feature frames; TF-IDF runs
# first so the progress messages keep their order.
tfidf_frames = pipeline.vectorize_tfidf(train_df, test_df)
count_frames = pipeline.vectorize_count(train_df, test_df)

train_tfidf_df, test_tfidf_df = tfidf_frames
train_count_df, test_count_df = count_frames

print("\nVectorization complete!")

Generating TF-IDF features...
TF-IDF shape - Train: (38903, 3000), Test: (4323, 3000)

Generating Count Vectorizer features...
Count Vectorizer shape - Train: (38903, 2000), Test: (4323, 2000)

Vectorization complete!


In [6]:
# FEATURE COMBINATION - Combine all engineered features
print(
    "Engineered features shape:",
    f"Train: {train_engineered_features.shape}",
    f"Test: {test_engineered_features.shape}",
    sep="\n",
)

# List the hand-crafted feature names
print("\nEngineered features:", list(train_engineered_features.columns), sep="\n")

# Stack the three feature families side by side (column-wise) for each split;
# the block order (engineered, TF-IDF, count) must match between train and test.
train_feature_blocks = [train_engineered_features, train_tfidf_df, train_count_df]
test_feature_blocks = [test_engineered_features, test_tfidf_df, test_count_df]
train_X = pd.concat(train_feature_blocks, axis="columns")
test_X = pd.concat(test_feature_blocks, axis="columns")

# Targets come straight from the label column of each split
train_y, test_y = train_df['label'], test_df['label']

# Summary of the assembled design matrices
print("\n" + "="*50, "FINAL FEATURE SET", "="*50, sep="\n")
print(
    f"Total features: {train_X.shape[1]}",
    f"Training samples: {train_X.shape[0]}",
    f"Test samples: {test_X.shape[0]}",
    f"\nFeature breakdown:",
    f"  - Engineered features: {train_engineered_features.shape[1]}",
    f"  - TF-IDF features: {train_tfidf_df.shape[1]}",
    f"  - Count Vectorizer features: {train_count_df.shape[1]}",
    sep="\n",
)

Engineered features shape:
Train: (38903, 23)
Test: (4323, 23)

Engineered features:
['response_length', 'word_count', 'avg_word_length', 'char_per_word', 'refusal_keyword_at_start', 'refusal_keyword_overall', 'has_any_refusal_keyword', 'sentiment_polarity', 'sentiment_subjectivity', 'is_negative_sentiment', 'is_neutral_sentiment', 'is_positive_sentiment', 'sentence_count', 'avg_sentence_length', 'punctuation_count', 'question_mark_count', 'exclamation_count', 'uppercase_ratio', 'has_multiple_sentences', 'apology_word_count', 'formal_word_count', 'is_apologetic', 'is_formal']

FINAL FEATURE SET
Total features: 5023
Training samples: 38903
Test samples: 4323

Feature breakdown:
  - Engineered features: 23
  - TF-IDF features: 3000
  - Count Vectorizer features: 2000


In [7]:
# FEATURE SCALING - Important for Logistic Regression

print("Scaling features...")

# Fit the scaler on the training matrix only, then apply those fitted
# statistics to both splits so no test-set information is used in fitting.
scaler = StandardScaler()
scaler.fit(train_X)
train_X_scaled = scaler.transform(train_X)
test_X_scaled = scaler.transform(test_X)

print(
    "Features scaled successfully!",
    f"Scaled training shape: {train_X_scaled.shape}",
    f"Scaled test shape: {test_X_scaled.shape}",
    sep="\n",
)

Scaling features...
Features scaled successfully!
Scaled training shape: (38903, 5023)
Scaled test shape: (4323, 5023)


In [8]:
# MODEL TRAINING

print("Training Logistic Regression model...")
log_reg_model = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1, solver='lbfgs')
log_reg_model.fit(train_X_scaled, train_y)

# Warnings are globally ignored in this notebook, so a solver that stops at the
# iteration cap would go unnoticed. Compare the iterations actually run with
# the configured cap and say so explicitly.
if np.max(log_reg_model.n_iter_) >= log_reg_model.max_iter:
    print(
        f"WARNING: solver hit the max_iter cap ({log_reg_model.max_iter}); "
        "the model may not have converged and its coefficients may be unreliable."
    )

print("Logistic Regression model trained successfully!")
print(f"Model classes: {log_reg_model.classes_}")
print(f"Model intercept: {log_reg_model.intercept_}")
print(f"Number of features used: {log_reg_model.n_features_in_}")
print(f"Solver iterations run: {log_reg_model.n_iter_}")

Training Logistic Regression model...
Logistic Regression model trained successfully!
Model classes: [0 1]
Model intercept: [6.58866232]
Number of features used: 5023


In [9]:
# FEATURE IMPORTANCE ANALYSIS

print(
    "\n" + "="*60,
    "TOP FEATURE IMPORTANCE (Logistic Regression Coefficients)",
    "="*60,
    sep="\n",
)

# Pair each column of the design matrix with its fitted coefficient
feature_names = train_X.columns.tolist()
coefficients = log_reg_model.coef_[0]

# Rank features by coefficient magnitude, largest first
feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})
    .assign(abs_coefficient=lambda frame: frame['coefficient'].abs())
    .sort_values('abs_coefficient', ascending=False)
)

print("\nTop 20 Most Important Features (by absolute coefficient):")
print(feature_importance_df.head(20).to_string())

# The frame is already ordered by magnitude, so filtering on sign and taking
# the head yields the strongest features for each direction.
pushes_towards_refusal = feature_importance_df['coefficient'].gt(0)
print("\n\nTop 10 Features Contributing to REFUSAL prediction (positive coefficients):")
top_refusal = feature_importance_df.loc[pushes_towards_refusal].head(10)
print(top_refusal.to_string())

pushes_away_from_refusal = feature_importance_df['coefficient'].lt(0)
print("\n\nTop 10 Features Contributing to NOT REFUSAL prediction (negative coefficients):")
top_not_refusal = feature_importance_df.loc[pushes_away_from_refusal].head(10)
print(top_not_refusal.to_string())


TOP FEATURE IMPORTANCE (Logistic Regression Coefficients)

Top 20 Most Important Features (by absolute coefficient):
                       feature  coefficient  abs_coefficient
20           formal_word_count     3.663747         3.663747
4     refusal_keyword_at_start     3.082610         3.082610
3886                 count_863     2.484431         2.484431
4553                count_1530     2.356712         2.356712
6      has_any_refusal_keyword     2.278657         2.278657
4146                count_1123     2.040851         2.040851
3804                 count_781     2.009436         2.009436
1706                tfidf_1683     1.939828         1.939828
80                    tfidf_57     1.742301         1.742301
4924                count_1901    -1.735808         1.735808
412                  tfidf_389    -1.733673         1.733673
4610                count_1587     1.724934         1.724934
1702                tfidf_1679     1.715393         1.715393
29                     tfidf

In [10]:
# MODEL EVALUATION - Training Set

print("\n" + "="*60, "TRAINING SET EVALUATION", "="*60, sep="\n")

# Hard predictions and class probabilities on the data the model was fitted on
y_train_pred = log_reg_model.predict(train_X_scaled)
y_train_proba = log_reg_model.predict_proba(train_X_scaled)

# The four headline metrics share the (y_true, y_pred) signature, so compute
# them in a single sweep.
train_accuracy, train_precision, train_recall, train_f1 = (
    metric(train_y, y_train_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"\nAccuracy:  {train_accuracy:.4f}",
    f"Precision: {train_precision:.4f}",
    f"Recall:    {train_recall:.4f}",
    f"F1-Score:  {train_f1:.4f}",
    sep="\n",
)

# Confusion matrix: rows index the true label, columns the predicted label
cm_train = confusion_matrix(train_y, y_train_pred)
print("\nConfusion Matrix (Training):", cm_train, sep="\n")
print(
    f"\nTrue Negatives: {cm_train[0,0]}",
    f"False Positives: {cm_train[0,1]}",
    f"False Negatives: {cm_train[1,0]}",
    f"True Positives: {cm_train[1,1]}",
    sep="\n",
)


TRAINING SET EVALUATION

Accuracy:  0.9973
Precision: 0.9984
Recall:    0.9961
F1-Score:  0.9973

Confusion Matrix (Training):
[[19394    31]
 [   75 19403]]

True Negatives: 19394
False Positives: 31
False Negatives: 75
True Positives: 19403


In [11]:
# MODEL EVALUATION - Test Set

print("\n" + "="*60, "TEST SET EVALUATION", "="*60, sep="\n")

# Hard predictions and class probabilities on the held-out split
y_test_pred = log_reg_model.predict(test_X_scaled)
y_test_proba = log_reg_model.predict_proba(test_X_scaled)

# The four headline metrics share the (y_true, y_pred) signature, so compute
# them in a single sweep.
test_accuracy, test_precision, test_recall, test_f1 = (
    metric(test_y, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"\nAccuracy:  {test_accuracy:.4f}",
    f"Precision: {test_precision:.4f}",
    f"Recall:    {test_recall:.4f}",
    f"F1-Score:  {test_f1:.4f}",
    sep="\n",
)

# Confusion matrix: rows index the true label, columns the predicted label
cm_test = confusion_matrix(test_y, y_test_pred)
print("\nConfusion Matrix (Test):", cm_test, sep="\n")
print(
    f"\nTrue Negatives: {cm_test[0,0]}",
    f"False Positives: {cm_test[0,1]}",
    f"False Negatives: {cm_test[1,0]}",
    f"True Positives: {cm_test[1,1]}",
    sep="\n",
)

# Per-class precision/recall/F1 with human-readable class names
per_class_report = classification_report(
    test_y, y_test_pred, target_names=['Not Refusal (0)', 'Refusal (1)']
)
print("\nDetailed Classification Report (Test):", per_class_report, sep="\n")


TEST SET EVALUATION

Accuracy:  0.9065
Precision: 0.8983
Recall:    0.9143
F1-Score:  0.9062

Confusion Matrix (Test):
[[1967  221]
 [ 183 1952]]

True Negatives: 1967
False Positives: 221
False Negatives: 183
True Positives: 1952

Detailed Classification Report (Test):
                 precision    recall  f1-score   support

Not Refusal (0)       0.91      0.90      0.91      2188
    Refusal (1)       0.90      0.91      0.91      2135

       accuracy                           0.91      4323
      macro avg       0.91      0.91      0.91      4323
   weighted avg       0.91      0.91      0.91      4323

