In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import math
import copy
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the labelled training set from the notebook's working directory.
data = pd.read_csv('train.csv')
# Preview a few random rows. The seed is fixed so that Restart & Run All shows
# the same rows every time instead of a fresh random draw.
data.sample(5, random_state=42)

Unnamed: 0,message_id,num_links,num_words,has_offer,sender_score,all_caps,is_spam
12128,12129,2,149,0,0.4551,0,0
17135,17136,1,37,0,0.928472,0,0
11219,11220,2,98,0,0.907456,0,0
15451,15452,2,95,0,0.830141,0,0
7350,7351,3,111,0,0.608864,0,0


**The `message_id` column is only a row identifier and won't help with classification, so we drop it.**

In [4]:
# Remove the identifier column. errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone is a no-op instead of a KeyError.
data = data.drop(columns='message_id', errors='ignore')

In [5]:
# Show the first five rows to confirm which columns remain.
data.head(n=5)

Unnamed: 0,num_links,num_words,has_offer,sender_score,all_caps,is_spam
0,3,98,1,0.718607,0,0
1,0,170,0,0.698901,1,0
2,0,38,0,0.620466,0,0
3,0,116,0,0.701755,0,0
4,3,89,1,0.583621,1,1


In [6]:
# Count NaN entries per column.
missing_per_column = data.isna().sum()
print(f"Missing values: {missing_per_column}")

Missing values: num_links       0
num_words       0
has_offer       0
sender_score    0
all_caps        0
is_spam         0
dtype: int64


In [7]:
# List the dtype pandas inferred for each column.
column_dtypes = data.dtypes
print(f"Datatypes: {column_dtypes}")

Datatypes: num_links         int64
num_words         int64
has_offer         int64
sender_score    float64
all_caps          int64
is_spam           int64
dtype: object


In [8]:
# Target vector: the is_spam label (1 = spam, 0 = not spam).
y = data['is_spam']
# Feature matrix: every remaining column, listed explicitly to pin the order.
X = data[[
    'num_links',
    'num_words',
    'has_offer',
    'sender_score',
    'all_caps',
]]

In [9]:
# StandardScaler is already imported in the notebook's import cell, so the
# duplicate import that used to open this cell is not needed.

# Only these columns are standardised; has_offer and all_caps are left as-is.
numerical_features = ['num_links', 'num_words', 'sender_score']
scaler = StandardScaler()

# X was selected out of `data`; take an independent copy before writing to it
# so the assignment below does not target a slice of the original frame.
X = X.copy()

# Fit on the raw values held in `data` (which this notebook never rescales)
# rather than on X itself. Re-running this cell therefore refits the scaler on
# the same raw numbers instead of on already-standardised values, which would
# leave `scaler` as a near-identity transform and break the later
# scaler.transform(...) of the test features.
X[numerical_features] = scaler.fit_transform(data[numerical_features])

# NOTE(review): the scaler is fitted on every labelled row, i.e. before the
# train/validation split performed further down, so validation rows contribute
# to the mean/std. Consider fitting on the training part only.
print(f"Scaled features sample: {X.head()}")

Scaled features sample:    num_links  num_words  has_offer  sender_score  all_caps
0   1.229832  -0.224189          1      0.129767         0
1  -1.227003   1.161143          0      0.025105         1
2  -1.227003  -1.378632          0     -0.391486         0
3  -1.227003   0.122144          0      0.040259         0
4   1.229832  -0.397355          1     -0.587176         1


In [10]:
# Count how many rows carry each label and report the majority/minority ratio.
spam_labels = data['is_spam']
spam_count = (spam_labels == 1).sum()
non_spam_count = (spam_labels == 0).sum()
imbalance_ratio = non_spam_count / spam_count
print(f"Class distribution: Non-Spam={non_spam_count}, Spam={spam_count}")
print(f"Imbalance ratio: {imbalance_ratio:.1f}:1")

Class distribution: Non-Spam=17354, Spam=1746
Imbalance ratio: 9.9:1


**The classes are clearly imbalanced: there are roughly ten non-spam messages for every spam message.**

In [11]:
# Inverse-frequency class weights:
#   weight[c] = n_samples / (n_classes * count[c])
# so the rarer class receives the larger weight.
total_samples = len(y)
class_counts = np.bincount(y)  # index 0 = not-spam count, index 1 = spam count
print(class_counts, "\n")
print(total_samples)

n_classes = class_counts.size
class_weights = total_samples / (n_classes * class_counts)

print(f"not-spam / spam: {class_weights}")

[17354  1746] 

19100
not-spam / spam: [0.55030541 5.4696449 ]


In [12]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions aligned between the two parts; the fixed seed makes the split
# repeatable.
split_parts = train_test_split(
    X.values, y, test_size=0.2, random_state=45, stratify=y
)
X_train, X_val = split_parts[0], split_parts[1]
# Convert the label containers to plain NumPy arrays.
y_train, y_val = np.array(split_parts[2]), np.array(split_parts[3])

# Report the shapes of both parts.
for heading, features, labels in (
    ("\nTraining set shape:", X_train, y_train),
    ("Validation set shape:", X_val, y_val),
):
    print(heading, features.shape, labels.shape)

# Report the per-class share in both parts.
for heading, labels in (
    ("Training class distribution:", y_train),
    ("Validation class distribution:", y_val),
):
    print(heading, np.bincount(labels) / len(labels))


Training set shape: (15280, 5) (15280,)
Validation set shape: (3820, 5) (3820,)
Training class distribution: [0.9085733 0.0914267]
Validation class distribution: [0.90863874 0.09136126]


In [13]:
# Oversample the training split with SMOTE; the validation split is not touched.
banner = "=" * 60
print("\n" + banner)
print("APPLYING SMOTE FOR DATA BALANCING")
print(banner)

# Class make-up of the training split before resampling.
counts_before = np.bincount(y_train)
print("Before SMOTE:")
print(f"Training set shape: {X_train.shape}")
print(f"Class distribution: {counts_before}")
print(f"Class percentages: {counts_before / len(y_train) * 100}")

# NOTE(review): SMOTE is run on all five columns, including the has_offer and
# all_caps flags. Confirm that synthetic rows with non-integer flag values are
# acceptable; imbalanced-learn's SMOTENC is the variant aimed at mixed
# categorical/continuous data.
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Class make-up after resampling, reused for both the report and the weights.
class_counts_smote = np.bincount(y_train_balanced)
print("\nAfter SMOTE:")
print(f"Training set shape: {X_train_balanced.shape}")
print(f"Class distribution: {class_counts_smote}")
print(f"Class percentages: {class_counts_smote / len(y_train_balanced) * 100}")

# Same inverse-frequency formula as before, now on the resampled labels.
class_weights_smote = len(y_train_balanced) / (class_counts_smote * len(class_counts_smote))

# Spam-to-not-spam weight ratio before resampling (from class_weights, computed
# on the full label vector in an earlier cell) versus after.
ratio_before = class_weights[1] / class_weights[0]
ratio_after = class_weights_smote[1] / class_weights_smote[0]

print(f"\nSMOTE class weights: {class_weights_smote}")
print(f"Reduction in weight imbalance: {ratio_before:.2f} → {ratio_after:.2f}")
print("✅ SMOTE BALANCING COMPLETE - NO PLOTS FOR SPEED!")


APPLYING SMOTE FOR DATA BALANCING
Before SMOTE:
Training set shape: (15280, 5)
Class distribution: [13883  1397]
Class percentages: [90.85732984  9.14267016]

After SMOTE:
Training set shape: (27766, 5)
Class distribution: [13883 13883]
Class percentages: [50. 50.]

SMOTE class weights: [1. 1.]
Reduction in weight imbalance: 9.94 → 1.00
✅ SMOTE BALANCING COMPLETE - NO PLOTS FOR SPEED!


**Let's begin with our logistic regression.**

In [14]:
# 🎯 SIMPLE LOGISTIC REGRESSION FROM SCRATCH
def sigmoid(z):
    """Logistic function 1 / (1 + e^-z), applied element-wise.

    The input is first limited to [-500, 500] so that the exponential
    cannot overflow for very large magnitudes.
    """
    bounded = np.clip(z, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def compute_cost(X, y, w, b, lambda_l1=0.0):
    """Mean binary cross-entropy plus an L1 (lasso) penalty on the weights.

    Parameters
    ----------
    X : feature matrix; ``np.dot(X, w)`` must yield one score per sample.
    y : 0/1 labels, one per sample.
    w : weight vector.
    b : bias term (scalar). It is not penalised.
    lambda_l1 : strength of the L1 penalty; 0.0 disables it.

    Returns
    -------
    The scalar cost ``cross_entropy + lambda_l1 * sum(|w|)``.
    """
    y_pred = sigmoid(np.dot(X, w) + b)

    # Keep probabilities strictly inside (0, 1) so neither log() sees 0.
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)

    # Cross-entropy averaged over the samples.
    cost = -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

    # L1 regularization (lasso). The penalty is not divided by the sample
    # count, which matches the lambda_l1 * sign(w) term in compute_gradients.
    l1_cost = lambda_l1 * np.sum(np.abs(w))

    return cost + l1_cost

def compute_gradients(X, y, w, b, lambda_l1=0.0):
    """Gradients of the L1-regularised cross-entropy cost.

    Returns a ``(dw, db)`` pair: ``dw`` is the gradient with respect to the
    weights (including the ``lambda_l1 * sign(w)`` penalty term) and ``db``
    is the gradient with respect to the bias.
    """
    inv_m = 1 / X.shape[0]

    # Prediction error, computed once and shared by both gradients.
    residual = sigmoid(np.dot(X, w) + b) - y

    weight_grad = inv_m * np.dot(X.T, residual) + lambda_l1 * np.sign(w)
    bias_grad = inv_m * np.sum(residual)
    return weight_grad, bias_grad

def gradient_descent(X, y, w, b, learning_rate=0.1, lambda_l1=0.0, num_iterations=1000):
    """Batch gradient descent for the L1-regularised logistic model.

    Parameters
    ----------
    X, y : training features and 0/1 labels.
    w, b : initial weights and bias. They are not modified in place; each
        update rebinds the local names to new values.
    learning_rate : step size applied to both gradients.
    lambda_l1 : L1 penalty strength passed on to the cost and gradients.
    num_iterations : number of update steps to run.

    Returns
    -------
    (w, b, costs) : the final weights, the final bias, and the cost sampled
        at iterations 0, 100, 200, ... (measured before that iteration's
        update).
    """
    costs = []

    for i in range(num_iterations):
        # The cost is only used for the history, so evaluate it only on the
        # iterations that are recorded instead of on every step.
        if i % 100 == 0:
            costs.append(compute_cost(X, y, w, b, lambda_l1))

        dw, db = compute_gradients(X, y, w, b, lambda_l1)

        # Step against the gradient.
        w = w - learning_rate * dw
        b = b - learning_rate * db

    return w, b, costs

def predict(X, w, b, threshold=0.5):
    """Score samples and turn the scores into 0/1 labels.

    Returns ``(labels, probabilities)``: a label is 1 wherever the predicted
    probability is greater than or equal to ``threshold``.
    """
    probabilities = sigmoid(np.dot(X, w) + b)
    labels = (probabilities >= threshold).astype(int)
    return labels, probabilities

def evaluate_model(y_true, y_pred):
    """Compute binary-classification metrics with 1 as the positive class.

    Parameters
    ----------
    y_true : array-like of ground-truth 0/1 labels.
    y_pred : array-like of predicted 0/1 labels, same length as ``y_true``.

    Both inputs are converted to flat NumPy arrays first, so lists, pandas
    Series and column vectors are all compared element by element. Entries
    that are neither 0 nor 1 are not counted in any confusion-matrix cell.

    Returns
    -------
    dict with ``accuracy``, ``precision``, ``recall``, ``f1_score`` and the
    confusion-matrix counts ``tp``, ``fp``, ``fn``, ``tn`` (plain ints).
    A metric whose denominator is zero (for example every metric on empty
    input) is reported as 0.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of labels.
    """
    # Without this, `list == 1` is a single False and every count becomes 0,
    # and an (n, 1) column compared with an (n,) vector broadcasts to (n, n).
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.size} and {y_pred.size}"
        )

    actual_pos, actual_neg = (y_true == 1), (y_true == 0)
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)

    tp = int(np.sum(actual_pos & pred_pos))
    fp = int(np.sum(actual_neg & pred_pos))
    fn = int(np.sum(actual_pos & pred_neg))
    tn = int(np.sum(actual_neg & pred_neg))

    # Guard accuracy the same way as the other ratios so empty input yields 0
    # rather than a 0/0 nan.
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {
        'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1_score': f1,
        'tp': tp, 'fp': fp, 'fn': fn, 'tn': tn
    }

# Printed once the definitions above have been executed in this cell.
print("✅ Logistic Regression Functions Ready!")

✅ Logistic Regression Functions Ready!


In [15]:
# 📊 SIMPLE EVALUATION HELPER
def print_results(metrics, title="Model Results"):
    """Write a metrics dict to stdout as a short text report.

    ``metrics`` must provide the keys ``accuracy``, ``precision``,
    ``recall``, ``f1_score`` and the integer counts ``tp``, ``fp``, ``fn``,
    ``tn``. Nothing is returned.
    """
    accuracy = metrics['accuracy']
    report_lines = [
        f"\n{title}",
        "=" * 50,
        f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)",
        f"Precision: {metrics['precision']:.4f}",
        f"Recall:    {metrics['recall']:.4f}",
        f"F1 Score:  {metrics['f1_score']:.4f}",
        "\nConfusion Matrix:",
        f"  TP: {metrics['tp']:4d} | FP: {metrics['fp']:4d}",
        f"  FN: {metrics['fn']:4d} | TN: {metrics['tn']:4d}",
    ]
    # One write, same text as printing the lines one by one.
    print("\n".join(report_lines))

In [60]:
# Train the from-scratch logistic regression on the SMOTE-balanced training data.
print("🎯 TRAINING LOGISTIC REGRESSION WITH OPTIMAL HYPERPARAMETERS")
print("="*65)

# Hyperparameters carried over from earlier experiments (not shown here).
LEARNING_RATE = 0.1        # gradient-descent step size
L1_REGULARIZATION = 0.001  # L1 penalty strength
THRESHOLD = 0.85           # probability cut-off for predicting spam
NUM_ITERATIONS = 2000      # number of gradient-descent steps

print("📊 Using optimized parameters:")
for setting_name, setting_value in (
    ("Learning Rate", LEARNING_RATE),
    ("L1 Regularization", L1_REGULARIZATION),
    ("Classification Threshold", THRESHOLD),
    ("Iterations", NUM_ITERATIONS),
):
    print(f"   {setting_name}: {setting_value}")

# Seed NumPy's global RNG so the initial weights are the same on every run.
np.random.seed(42)
n_features = X_train_balanced.shape[1]
# Start from small random weights (mean 0, std 0.01) and a zero bias.
w = np.random.normal(0, 0.01, n_features)
b = 0.0

print("\n🚀 Training on balanced data...")
print(f"   Training samples: {X_train_balanced.shape[0]}")
print(f"   Features: {X_train_balanced.shape[1]}")

# Fit the model; cost_history holds the cost sampled every 100 iterations.
w_trained, b_trained, cost_history = gradient_descent(
    X=X_train_balanced,
    y=y_train_balanced,
    w=w,
    b=b,
    learning_rate=LEARNING_RATE,
    lambda_l1=L1_REGULARIZATION,
    num_iterations=NUM_ITERATIONS,
)


🎯 TRAINING LOGISTIC REGRESSION WITH OPTIMAL HYPERPARAMETERS
📊 Using optimized parameters:
   Learning Rate: 0.1
   L1 Regularization: 0.001
   Classification Threshold: 0.85
   Iterations: 2000

🚀 Training on balanced data...
   Training samples: 27766
   Features: 5


In [61]:
# Score the trained model on the held-out validation split.
print("📊 EVALUATING TRAINED MODEL ON VALIDATION DATA")
print("="*50)

# Labels use the THRESHOLD cut-off; the raw probabilities are kept as well.
validation_output = predict(X_val, w_trained, b_trained, threshold=THRESHOLD)
y_pred_val, y_pred_proba_val = validation_output

# Confusion-matrix counts and the metrics derived from them.
val_metrics = evaluate_model(y_true=y_val, y_pred=y_pred_val)

# Text report.
print_results(val_metrics, title="🏆 VALIDATION SET PERFORMANCE")



📊 EVALUATING TRAINED MODEL ON VALIDATION DATA

🏆 VALIDATION SET PERFORMANCE
Accuracy:  0.9374 (93.74%)
Precision: 0.6884
Recall:    0.5759
F1 Score:  0.6271

Confusion Matrix:
  TP:  201 | FP:   91
  FN:  148 | TN: 3380


In [62]:
# Read the unlabelled test set and prepare it the same way as the training data.
print("LOADING TEST DATA FOR PREDICTIONS")
print("="*40)

test_data = pd.read_csv('test.csv')
print(f"Test data shape: {test_data.shape}")
print(f"Test data columns: {test_data.columns.tolist()}")

# The ids are kept aside; they are needed for the submission file.
test_message_ids = test_data['message_id'].values
lowest_id, highest_id = test_message_ids.min(), test_message_ids.max()
print(f"Message ID range: {lowest_id} to {highest_id}")

# Same feature columns, in the same order, as used for training.
X_test = test_data[[
    'num_links',
    'num_words',
    'has_offer',
    'sender_score',
    'all_caps',
]]

# Reuse the already-fitted scaler (transform only, no refit) on a copy so that
# X_test keeps the raw values.
X_test_scaled = X_test.copy()
X_test_scaled[numerical_features] = scaler.transform(X_test[numerical_features])

print(f"Test features shape: {X_test_scaled.shape}")
print("Test features sample:")
print(X_test_scaled.head())

LOADING TEST DATA FOR PREDICTIONS
Test data shape: (900, 6)
Test data columns: ['message_id', 'num_links', 'num_words', 'has_offer', 'sender_score', 'all_caps']
Message ID range: 20000 to 20899
Test features shape: (900, 5)
Test features sample:
   num_links  num_words  has_offer  sender_score  all_caps
0  -0.408058  -1.397873          0     -0.319948         0
1  -1.227003   1.238106          0     -0.618016         0
2  -1.227003   0.122144          0     -1.583159         0
3  -1.227003   0.045181          0     -0.253475         0
4   0.410887  -0.339633          1      1.344932         1


In [63]:
# Predict labels for the test set with the trained weights.
print("GENERATING PREDICTIONS ON TEST DATA")
print("="*40)

# Same decision threshold as used on the validation split.
y_test_pred, y_test_proba = predict(X_test_scaled.values, w_trained, b_trained, THRESHOLD)

print(f"Predictions generated for {len(y_test_pred)} samples")
print(f"Predicted spam count: {np.sum(y_test_pred)}")
print(f"Predicted non-spam count: {np.sum(1 - y_test_pred)}")
print(f"Spam percentage: {np.mean(y_test_pred)*100:.1f}%")

# Preview up to five predictions. Slicing (instead of indexing 0..4) keeps the
# cell working when the test file has fewer than five rows.
print(f"\nSample predictions:")
for message_id, label, spam_probability in zip(
    test_message_ids[:5], y_test_pred[:5], y_test_proba[:5]
):
    spam_status = "SPAM" if label == 1 else "NOT SPAM"
    # Confidence is the probability assigned to the predicted class.
    confidence = spam_probability if label == 1 else (1 - spam_probability)
    print(f"  Message {message_id}: {spam_status} (confidence: {confidence:.3f})")

GENERATING PREDICTIONS ON TEST DATA
Predictions generated for 900 samples
Predicted spam count: 76
Predicted non-spam count: 824
Spam percentage: 8.4%

Sample predictions:
  Message 20000: NOT SPAM (confidence: 0.977)
  Message 20001: NOT SPAM (confidence: 0.990)
  Message 20002: NOT SPAM (confidence: 0.983)
  Message 20003: NOT SPAM (confidence: 0.993)
  Message 20004: SPAM (confidence: 0.928)


In [64]:
# Assemble the predictions into the two-column submission file and save it.
print("CREATING SUBMISSION CSV FILE")
print("="*30)

# One row per test message: its id followed by the predicted label.
submission_df = pd.DataFrame({'message_id': test_message_ids}).assign(
    is_spam=y_test_pred
)

# Quick look at the result before writing it out.
print(f"Submission file shape: {submission_df.shape}")
print(f"Columns: {submission_df.columns.tolist()}")
for heading, preview in (
    ("First 5 rows:", submission_df.head()),
    ("Last 5 rows:", submission_df.tail()),
):
    print(f"\n{heading}")
    print(preview)

# Write without the DataFrame index so the file holds only the two columns.
submission_filename = 'submission.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"\n✅ Submission file saved as '{submission_filename}'")
print(f"File contains {len(submission_df)} predictions")
print("Ready for submission!")

CREATING SUBMISSION CSV FILE
Submission file shape: (900, 2)
Columns: ['message_id', 'is_spam']

First 5 rows:
   message_id  is_spam
0       20000        0
1       20001        0
2       20002        0
3       20003        0
4       20004        1

Last 5 rows:
     message_id  is_spam
895       20895        0
896       20896        0
897       20897        0
898       20898        0
899       20899        0

✅ Submission file saved as 'submission.csv'
File contains 900 predictions
Ready for submission!
