In [None]:
# Install dependencies
!pip install kagglehub nltk scikit-learn tensorflow pandas numpy matplotlib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

## Dataset Acquisition

Loading the sarcasm headlines dataset from Kaggle via `kagglehub`.

In [None]:
import kagglehub
import os

# Download dataset
dataset_path = kagglehub.dataset_download("shariphthapa/sarcasm-json-datasets")
print(f"Dataset downloaded to: {dataset_path}")

# Locate the JSON file: the known file names are tried at the top level of
# the download directory first, in order of preference.
json_candidates = [
    os.path.join(dataset_path, "Sarcasm.json"),
    os.path.join(dataset_path, "Sarcasm_Headlines_Dataset.json")
]

json_file = next((f for f in json_candidates if os.path.exists(f)), None)

if not json_file:
    # Fallback: look for the same file names anywhere under the download
    # directory, ignoring case, so a nested folder or a differently-cased
    # file name does not abort the notebook.
    wanted_names = [os.path.basename(f).lower() for f in json_candidates]
    found_files = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(dataset_path)
        for name in names
    )
    json_file = next(
        (
            path
            for wanted in wanted_names  # keeps the original preference order
            for path in found_files
            if os.path.basename(path).lower() == wanted
        ),
        None,
    )

    if not json_file:
        # Report what was actually downloaded so the failure is diagnosable.
        raise FileNotFoundError(
            f"Dataset JSON file not found under {dataset_path!r}; "
            f"files present: {found_files}"
        )

print(f"Loading: {json_file}")

In [None]:
# Read the dataset and take a first look at it
def _read_sarcasm_json(path):
    """Read `path` as JSON Lines; on ValueError, re-read it as a regular JSON document."""
    try:
        return pd.read_json(path, lines=True)
    except ValueError:
        return pd.read_json(path)


data = _read_sarcasm_json(json_file)

column_names = data.columns.tolist()
print(f"Dataset shape: {data.shape}")
print(f"\nColumns: {column_names}")
print(f"\nFirst 3 rows:")
data.head(3)

In [None]:
# Keep only the headline text and its label, then inspect the class balance
data = data.loc[:, ['headline', 'is_sarcastic']].copy()
sarcasm_labels = data['is_sarcastic']

print("Class distribution:")
print(sarcasm_labels.value_counts())
print(f"\nSarcastic ratio: {sarcasm_labels.mean():.2%}")

# Print the first headline of each class as an example
print("\nSample headlines:")
for label, label_name in ((0, "Literal"), (1, "Sarcastic")):
    sample = data.loc[sarcasm_labels == label, 'headline'].iloc[0]
    print(f"  [{label_name}] {sample}")

## Text Preprocessing

Cleaning and tokenizing headlines while removing stop words.

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string

# Make sure the NLTK resources used below are available locally.
# Each entry maps the download name to the path NLTK looks it up under.
nltk_resource_paths = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'stopwords': 'corpora/stopwords',
}

for resource, resource_path in nltk_resource_paths.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Not found locally: fetch it without progress output
        nltk.download(resource, quiet=True)

english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)
print(f"Loaded {len(stop_words)} English stop words")

In [None]:
def clean_text(text):
    """Turn a headline into a list of lowercase tokens without stop words.

    Every ASCII punctuation character is replaced by a space before the
    text is tokenized with `word_tokenize`; tokens found in `stop_words`
    (and empty tokens) are dropped.

    Returns an empty list when `text` is not a string.
    """
    if not isinstance(text, str):
        return []

    # Lowercase, then blank out each punctuation character
    punctuation_to_space = str.maketrans({ch: ' ' for ch in string.punctuation})
    normalized = text.lower().translate(punctuation_to_space)

    # Tokenize and keep only non-empty, non-stop-word tokens
    kept = []
    for token in word_tokenize(normalized):
        if not token or token in stop_words:
            continue
        kept.append(token)

    return kept

# Tokenize every headline
data['tokens'] = data['headline'].map(clean_text)

# Print the first two headlines before and after cleaning
headline_column = data['headline']
token_column = data['tokens']

print("Preprocessing examples:\n")
for idx in (0, 1):
    cleaned_preview = ' '.join(token_column.iloc[idx])
    print(f"Original: {headline_column.iloc[idx]}")
    print(f"Cleaned:  {cleaned_preview}\n")

## Vectorization with Tokenizer

Converting text to numerical sequences using vocabulary mapping.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenizer setup: cap the vocabulary and map unseen words to "<UNK>"
VOCAB_SIZE = 5000
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<UNK>")

# Re-join the cleaned tokens into strings and fit the tokenizer on them
text_corpus = data['tokens'].map(' '.join).tolist()
tokenizer.fit_on_texts(text_corpus)

word_index_items = list(tokenizer.word_index.items())
print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Using top {VOCAB_SIZE} words")
print(f"\nMost frequent words: {word_index_items[:15]}")

In [None]:
# Map each cleaned headline to its sequence of token IDs
sequences = tokenizer.texts_to_sequences(text_corpus)

# The padded length is the longest sequence, capped at 100
seq_lengths = list(map(len, sequences))
longest_sequence = max(seq_lengths)
MAX_LENGTH = longest_sequence if longest_sequence < 100 else 100

print(f"Sequence length statistics:")
print(f"  Mean: {np.mean(seq_lengths):.1f}")
print(f"  Max: {longest_sequence}")
print(f"  Using: {MAX_LENGTH}")

# Pad at the end of each sequence up to MAX_LENGTH
X_padded = pad_sequences(sequences, maxlen=MAX_LENGTH, padding='post')

print(f"\nPadded sequences shape: {X_padded.shape}")

## Baseline Model: Logistic Regression

Training a logistic regression directly on the padded token-index sequences, treating each position's vocabulary ID as a numeric feature.

In [None]:
# Features are the padded token IDs as floats; targets are the sarcasm labels
X = X_padded.astype(np.float32)
y = data['is_sarcastic'].to_numpy()

# Stratified 75/25 split with a fixed seed
split_options = {'test_size': 0.25, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

In [None]:
# Fit the baseline classifier on the raw token-ID features
baseline_params = dict(
    max_iter=3000,
    random_state=42,
    class_weight='balanced',
    solver='lbfgs',
)
baseline_model = LogisticRegression(**baseline_params)

print("Training baseline model...")
baseline_model.fit(X_train, y_train)

# Hard labels and the predicted probability of the sarcastic class
y_pred_baseline = baseline_model.predict(X_test)
baseline_class_proba = baseline_model.predict_proba(X_test)
y_proba_baseline = baseline_class_proba[:, 1]

# Held-out metrics
acc_baseline = accuracy_score(y_test, y_pred_baseline)
auc_baseline = roc_auc_score(y_test, y_proba_baseline)

print(f"\nBaseline Results:")
print(f"  Accuracy: {acc_baseline:.4f}")
print(f"  ROC AUC: {auc_baseline:.4f}")

In [None]:
# Confusion matrix and per-class report for the baseline model
baseline_confusion = confusion_matrix(y_test, y_pred_baseline)
baseline_report = classification_report(
    y_test,
    y_pred_baseline,
    target_names=['Not Sarcastic', 'Sarcastic'],
)

print("Confusion Matrix:")
print(baseline_confusion)

print("\nClassification Report:")
print(baseline_report)

## Enhanced Model: Embedding Layer

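Passing token IDs through a Keras `Embedding` layer and mean-pooling the resulting vectors into one fixed-size feature vector per headline. Note that the embedding layer is never trained in this notebook, so its vectors come from the layer's initial weights rather than from learned semantic relationships.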

In [None]:
from tensorflow.keras.initializers import RandomUniform
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.models import Sequential

# Embedding configuration
EMBEDDING_DIM = 64
VOCAB_INPUT_SIZE = VOCAB_SIZE + 1  # Account for padding
EMBEDDING_SEED = 42  # fixes the initial embedding weights across runs

# Create embedding layer.
# - The input shape is declared with an explicit `Input` instead of the
#   deprecated `input_length` argument of `Embedding`.
# - The layer is used with its initial weights in the cells shown here, so the
#   initializer is seeded to keep every downstream metric reproducible.
#   The range matches Keras' default "uniform" initializer (-0.05 to 0.05).
embedding_layer = Sequential([
    Input(shape=(MAX_LENGTH,), dtype='int32'),
    Embedding(
        input_dim=VOCAB_INPUT_SIZE,
        output_dim=EMBEDDING_DIM,
        embeddings_initializer=RandomUniform(
            minval=-0.05, maxval=0.05, seed=EMBEDDING_SEED
        ),
    ),
])

print(f"Embedding configuration:")
print(f"  Vocabulary: {VOCAB_INPUT_SIZE}")
print(f"  Dimensions: {EMBEDDING_DIM}")
print(f"  Sequence length: {MAX_LENGTH}")

In [None]:
# Look up an embedding vector for every token position of every headline
print("Computing embeddings for all sequences...")
embeddings = embedding_layer.predict(X_padded, verbose=0)
embedding_tensor_shape = embeddings.shape

print(f"Embedding tensor shape: {embedding_tensor_shape}")

# Average over the sequence axis to get one vector per headline.
# The mean runs over all positions of the padded sequence, so padded
# positions are included in the average.
X_embedded = np.mean(embeddings, axis=1)

print(f"Aggregated embeddings shape: {X_embedded.shape}")

In [None]:
# Same stratified 75/25 split and seed as the baseline, on the pooled embeddings
embedded_split = train_test_split(
    X_embedded,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)
X_train_emb, X_test_emb, y_train_emb, y_test_emb = embedded_split

# Logistic regression with the same settings as the baseline
embedding_params = dict(
    max_iter=3000,
    random_state=42,
    class_weight='balanced',
    solver='lbfgs',
)
embedding_model = LogisticRegression(**embedding_params)

print("Training embedding-based model...")
embedding_model.fit(X_train_emb, y_train_emb)

# Hard labels and the predicted probability of the sarcastic class
y_pred_emb = embedding_model.predict(X_test_emb)
embedding_class_proba = embedding_model.predict_proba(X_test_emb)
y_proba_emb = embedding_class_proba[:, 1]

# Held-out metrics
acc_emb = accuracy_score(y_test_emb, y_pred_emb)
auc_emb = roc_auc_score(y_test_emb, y_proba_emb)

print(f"\nEmbedding Model Results:")
print(f"  Accuracy: {acc_emb:.4f}")
print(f"  ROC AUC: {auc_emb:.4f}")

In [None]:
# Confusion matrix and per-class report for the embedding-based model
embedding_confusion = confusion_matrix(y_test_emb, y_pred_emb)
embedding_report = classification_report(
    y_test_emb,
    y_pred_emb,
    target_names=['Not Sarcastic', 'Sarcastic'],
)

print("Confusion Matrix:")
print(embedding_confusion)

print("\nClassification Report:")
print(embedding_report)

## Model Comparison

Analyzing performance differences between approaches.

In [None]:
# Side-by-side table of the two models' held-out metrics
comparison_rows = [
    ('Baseline (Token IDs)', acc_baseline, auc_baseline),
    ('Embedding-based', acc_emb, auc_emb),
]
comparison = pd.DataFrame(comparison_rows, columns=['Model', 'Accuracy', 'ROC AUC'])

print(comparison.to_string(index=False))

# Differences (embedding model minus baseline); accuracy in percentage points
acc_diff = (acc_emb - acc_baseline) * 100
auc_diff = auc_emb - auc_baseline

print(f"\nImprovement with embeddings:")
print(f"  Accuracy: {acc_diff:+.2f} percentage points")
print(f"  ROC AUC: {auc_diff:+.4f}")

In [None]:
# Two panels: accuracy bars on the left, ROC curves on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
accuracy_ax, roc_ax = axes

models = ['Baseline', 'Embedding']
accuracies = [acc_baseline, acc_emb]
colors = ['#3498db', '#e74c3c']

# Accuracy panel
accuracy_ax.bar(models, accuracies, color=colors, alpha=0.7, edgecolor='black')
accuracy_ax.set(
    ylabel='Accuracy',
    title='Model Accuracy Comparison',
    ylim=(0.5, 1.0),
)
accuracy_ax.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5, label='Random')
accuracy_ax.legend()

# Write each accuracy value just above its bar
for bar_position, accuracy in enumerate(accuracies):
    accuracy_ax.text(
        bar_position,
        accuracy + 0.01,
        f'{accuracy:.4f}',
        ha='center',
        fontweight='bold',
    )

# ROC panel
fpr_baseline, tpr_baseline, _ = roc_curve(y_test, y_proba_baseline)
fpr_emb, tpr_emb, _ = roc_curve(y_test_emb, y_proba_emb)

roc_series = [
    (fpr_baseline, tpr_baseline, f'Baseline (AUC={auc_baseline:.4f})', colors[0]),
    (fpr_emb, tpr_emb, f'Embedding (AUC={auc_emb:.4f})', colors[1]),
]
for false_pos, true_pos, curve_label, curve_color in roc_series:
    roc_ax.plot(false_pos, true_pos, label=curve_label, color=curve_color, linewidth=2)

# Diagonal reference line
roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.3, label='Random')
roc_ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curves',
)
roc_ax.legend()
roc_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Test on custom headlines
def predict_sarcasm(headline, use_embedding=True):
    """Classify a single headline with one of the two fitted models.

    The headline goes through the same steps as the training data:
    `clean_text`, the fitted `tokenizer`, and post-padding to `MAX_LENGTH`.

    Args:
        headline: Headline text to classify.
        use_embedding: When True, mean-pool the `embedding_layer` output and
            score it with `embedding_model`; otherwise score the padded
            token IDs with `baseline_model`.

    Returns:
        A `(pred, prob)` tuple: the predicted label and the predicted
        probability of the sarcastic class.
    """
    # Clean, convert to token IDs, and pad
    cleaned = ' '.join(clean_text(headline))
    token_ids = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(token_ids, maxlen=MAX_LENGTH, padding='post')

    # Choose the classifier and the feature representation it was fitted on
    if not use_embedding:
        classifier, features = baseline_model, padded
    else:
        classifier = embedding_model
        features = embedding_layer.predict(padded, verbose=0).mean(axis=1)

    prob = classifier.predict_proba(features)[0, 1]
    pred = classifier.predict(features)[0]

    return pred, prob

# Hand-written headlines to try against the embedding-based model
test_headlines = [
    "Local Man Wins Nobel Prize for Physics",
    "Area Man Knows All The Shortcuts That Will Save You Time",
    "Scientists Discover Cure for Major Disease",
    "Nation's Girlfriends Unveil New Economic Plan: Let's Just Stay In Tonight",
]

print("Testing custom headlines:\n")
for headline in test_headlines:
    pred, prob = predict_sarcasm(headline, use_embedding=True)
    if pred == 1:
        label = "SARCASTIC"
    else:
        label = "LITERAL"
    print(f"{label} ({prob:.2%}): {headline}")