# Part 1: Imports and Initialization

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse
import tensorflow as tf
from transformers import BertTokenizer, DistilBertTokenizer, TFDistilBertForSequenceClassification
from tabulate import tabulate

# Download necessary NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Check GPU availability
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


Num GPUs Available:  0


# Part 2: Load and Preprocess Data

In [2]:
# Step 1: Load the dataset
df = pd.read_csv('/content/drive/MyDrive/train-balanced-sarcasm.csv')
relevant_columns = ['label', 'comment', 'subreddit', 'parent_comment']
df = df[relevant_columns]

# Step 3: Data ablation settings with stratified sampling
df_sample, _ = train_test_split(df, train_size=len(df) // 10000, stratify=df['label'], random_state=42)
df = df_sample.reset_index(drop=True)  # Reset index to avoid issues

# Count and print the number of sarcastic (label=1) and non-sarcastic (label=0) entries after sampling
sarcastic_count = df['label'].sum()
non_sarcastic_count = len(df) - sarcastic_count
print(f"\nNumber of sarcastic entries: {sarcastic_count}")
print(f"Number of non-sarcastic entries: {non_sarcastic_count}\n")

# Step 4: Preprocessing

# def preprocess_text(text, lowercase=True, remove_stopwords=False, stemming=False, lemmatization=False, remove_punctuation=False):
#     if lowercase:
#         text = text.lower()

#     if remove_punctuation:
#         text = text.translate(str.maketrans('', '', string.punctuation))

#     if remove_stopwords:
#         stop_words = set(stopwords.words('english'))
#         text = ' '.join([word for word in text.split() if word not in stop_words])

#     if stemming:
#         stemmer = PorterStemmer()
#         text = ' '.join([stemmer.stem(word) for word in text.split()])

#     if lemmatization:
#         lemmatizer = WordNetLemmatizer()
#         text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

#     return text

# # Apply preprocessing to the 'comment' and 'parent_comment' columns
# df['processed_comment'] = df['comment'].apply(lambda x: preprocess_text(x,
#                                                               lowercase=True,
#                                                               remove_stopwords=True,
#                                                               stemming=False,
#                                                               lemmatization=True,
#                                                               remove_punctuation=True))

# df['processed_parent_comment'] = df['parent_comment'].apply(lambda x: preprocess_text(x,
#                                                                             lowercase=True,
#                                                                             remove_stopwords=True,
#                                                                             stemming=False,
#                                                                             lemmatization=True,
#                                                                             remove_punctuation=True))


stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text, remove_stopwords=True, lemmatization=True):
    if pd.isna(text):
        return ""
    tokens = nltk.word_tokenize(str(text).lower())
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words]
    if lemmatization:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

df['processed_comment'] = df['comment'].apply(preprocess_text)
df['processed_parent_comment'] = df['parent_comment'].apply(preprocess_text)


Number of sarcastic entries: 51
Number of non-sarcastic entries: 50



# Part 3: Feature Extraction and Train-Test Split

In [3]:
# Step 5: Feature extraction
tfidf = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf.fit_transform(df['processed_comment'])
tfidf_parent = TfidfVectorizer(max_features=5000)
tfidf_parent_features = tfidf_parent.fit_transform(df['processed_parent_comment'])

df['comment_length'] = df['comment'].fillna('').apply(len)
df['parent_comment_length'] = df['parent_comment'].fillna('').apply(len)

subreddit_dummies = pd.get_dummies(df['subreddit'], prefix='subreddit')

numerical_features = df[['comment_length', 'parent_comment_length']].values
features = sparse.hstack([tfidf_features, tfidf_parent_features, numerical_features, subreddit_dummies])

# Step 6: Train-test split (no stratify needed as it's already applied)
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(
    features, df['label'], df.index, test_size=0.2, random_state=42
)

# For BERT, extract the text using the original train indices
X_train_text = df['processed_comment'].iloc[train_indices]
X_test_text = df['processed_comment'].iloc[test_indices]


# Part 4: Model Training Functions

Random Forest Model

In [4]:
# Random Forest
def train_random_forest(X_train, X_test, y_train, y_test):
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    return rf_model

SVM Model

In [5]:
# SVM
def train_svm(X_train, X_test, y_train, y_test):
    svm_model = SVC(kernel='rbf', random_state=42, probability=True)
    svm_model.fit(X_train, y_train)
    return svm_model

BERT Model

In [6]:
# DistilBERT
def create_bert_model():
    bert_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, epsilon=1e-8)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    bert_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    return bert_model

def prepare_bert_data(texts, tokenizer, max_length=128):
    encoded = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf'
    )
    return encoded['input_ids'], encoded['attention_mask']

def train_bert(X_train_text, X_test_text, y_train, y_test):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    X_train_ids, X_train_mask = prepare_bert_data(X_train_text, tokenizer)

    # Create the BERT model
    bert_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

    # Compile the model without explicitly specifying the optimizer
    bert_model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

    # Train the BERT model
    bert_model.fit(
        [X_train_ids, X_train_mask], y_train,
        epochs=3, batch_size=32, validation_split=0.1
    )

    return bert_model, tokenizer

LSTM Model

In [7]:
# LSTM
def create_lstm_model(vocab_size, embedding_dim=100, max_length=100):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.LSTM(64),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def prepare_lstm_data(texts, tokenizer, max_length=100):
    sequences = tokenizer.texts_to_sequences(texts)
    padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length, padding='post')
    return padded

def train_lstm(X_train_text, X_test_text, y_train, y_test):
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(X_train_text)

    X_train_seq = prepare_lstm_data(X_train_text, tokenizer)

    vocab_size = len(tokenizer.word_index) + 1
    lstm_model = create_lstm_model(vocab_size)
    lstm_model.fit(X_train_seq, y_train, epochs=5, batch_size=32, validation_split=0.1)

    return lstm_model, tokenizer

Hybrid (BERT+LSTM) Model

In [8]:
# Hybrid model (DistilBERT + LSTM)
def create_hybrid_model(bert_model, lstm_model):
    bert_input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='bert_input_ids')
    bert_attention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='bert_attention_mask')
    lstm_input = tf.keras.layers.Input(shape=(100,), dtype=tf.int32, name='lstm_input')

    bert_output = bert_model([bert_input_ids, bert_attention_mask]).logits
    lstm_output = lstm_model(lstm_input)

    combined = tf.keras.layers.concatenate([bert_output, lstm_output])

    x = tf.keras.layers.Dense(64, activation='relu')(combined)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    hybrid_model = tf.keras.Model(inputs=[bert_input_ids, bert_attention_mask, lstm_input], outputs=output)

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, epsilon=1e-8)
    hybrid_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return hybrid_model

def train_hybrid(bert_model, lstm_model, X_train_bert, X_train_lstm, y_train):
    hybrid_model = create_hybrid_model(bert_model, lstm_model)
    hybrid_model.fit(
        [X_train_bert[0], X_train_bert[1], X_train_lstm],
        y_train,
        epochs=3,
        batch_size=32,
        validation_split=0.1
    )

    return hybrid_model

# Part 5: Model Evaluation

In [9]:
# Step 7: Model evaluation
def evaluate_model(y_true, y_pred, model_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    cm = confusion_matrix(y_true, y_pred)

    print(f"{model_name} Results:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")
    print("Confusion Matrix:")
    print(cm)
    print("-" * 40)  # Separator for better output readability

# Part 6: Cross-Validation

In [10]:
# Step 8: Cross-validation for all models
def perform_cross_validation(model_name, model, X, y, tokenizer=None, cv=3):
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(X)), y)):
        print(f"Training fold {fold + 1}/{cv} for {model_name}...")

        if model_name in ['BERT', 'Hybrid', 'LSTM']:
            # Convert indices to lists to ensure compatibility with iloc
            X_train_text = X.iloc[train_idx].tolist()
            X_val_text = X.iloc[val_idx].tolist()

            if model_name == 'BERT':
                X_train_ids, X_train_mask = prepare_bert_data(X_train_text, tokenizer)
                X_val_ids, X_val_mask = prepare_bert_data(X_val_text, tokenizer)

                model.fit(
                    [X_train_ids, X_train_mask], y.iloc[train_idx],
                    validation_data=([X_val_ids, X_val_mask], y.iloc[val_idx]),
                    epochs=3,
                    batch_size=32
                )
                val_pred_logits = model.predict([X_val_ids, X_val_mask]).logits
                val_preds = (tf.nn.sigmoid(val_pred_logits[:, 0]).numpy().flatten() > 0.5).astype(int)

            elif model_name == 'LSTM':
                X_train_lstm = prepare_lstm_data(X_train_text, tokenizer)
                X_val_lstm = prepare_lstm_data(X_val_text, tokenizer)

                model.fit(
                    X_train_lstm, y.iloc[train_idx],
                    validation_data=(X_val_lstm, y.iloc[val_idx]),
                    epochs=3,
                    batch_size=32
                )
                val_preds = model.predict(X_val_lstm)
                val_preds = (val_preds > 0.5).astype(int).flatten()

            elif model_name == 'Hybrid':
                X_train_ids, X_train_mask = prepare_bert_data(X_train_text, tokenizer)
                X_val_ids, X_val_mask = prepare_bert_data(X_val_text, tokenizer)

                X_train_lstm = prepare_lstm_data(X_train_text, tokenizer)
                X_val_lstm = prepare_lstm_data(X_val_text, tokenizer)

                model.fit(
                    [X_train_ids, X_train_mask, X_train_lstm], y.iloc[train_idx],
                    validation_data=([X_val_ids, X_val_mask, X_val_lstm], y.iloc[val_idx]),
                    epochs=3,
                    batch_size=32
                )
                val_pred_logits = model.predict([X_val_ids, X_val_mask, X_val_lstm])
                val_preds = (tf.nn.sigmoid(val_pred_logits[:, 0]).numpy().flatten() > 0.5).astype(int)

        else:
            # For non-BERT and non-LSTM models
            model.fit(X[train_idx], y.iloc[train_idx])
            val_preds = model.predict(X[val_idx])

        accuracy = accuracy_score(y.iloc[val_idx], val_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(y.iloc[val_idx], val_preds, average='binary')
        fold_scores.append((accuracy, precision, recall, f1))
        print(f"Fold {fold + 1} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")
        print("-" * 40)

    fold_scores = np.array(fold_scores)
    print(f"\n{model_name} Cross-Validation Results:")
    print(f"Mean Accuracy: {fold_scores[:, 0].mean()}")
    print(f"Mean Precision: {fold_scores[:, 1].mean()}")
    print(f"Mean Recall: {fold_scores[:, 2].mean()}")
    print(f"Mean F1-Score: {fold_scores[:, 3].mean()}")
    print("-" * 40)

# Part 7: Ablation Studies

In [11]:
# Step 9: Ablation studies for all models
def perform_ablation_studies(model_name, model, X, y, tokenizer=None):
    if model_name in ['BERT', 'Hybrid', 'LSTM']:
        feature_combinations = [
            ('Full Text', X)
        ]
    else:
        feature_combinations = [
            ('TF-IDF only', X[:, :5000]),
            ('TF-IDF + Numerical', X[:, :5002]),
            ('TF-IDF + Numerical + Subreddit', X)
        ]

    for name, features in feature_combinations:
        print(f"\nAblation study: {name}")

        if model_name in ['BERT', 'Hybrid', 'LSTM']:
            X_train_text, X_test_text, y_train_fold, y_test_fold = train_test_split(features.tolist(), y, test_size=0.2, random_state=42)

            if model_name == 'BERT':
                X_train_ids, X_train_mask = prepare_bert_data(X_train_text, tokenizer)
                X_test_ids, X_test_mask = prepare_bert_data(X_test_text, tokenizer)

                model.fit(
                    [X_train_ids, X_train_mask], y_train_fold,
                    validation_data=([X_test_ids, X_test_mask], y_test_fold),
                    epochs=3,
                    batch_size=32
                )
                val_pred_logits = model.predict([X_test_ids, X_test_mask]).logits
                y_pred = (tf.nn.sigmoid(val_pred_logits[:, 0]).numpy().flatten() > 0.5).astype(int)
            elif model_name == 'Hybrid':
                X_train_lstm = prepare_lstm_data(X_train_text, tokenizer)
                X_test_lstm = prepare_lstm_data(X_test_text, tokenizer)

                X_train_ids, X_train_mask = prepare_bert_data(X_train_text, tokenizer)
                X_test_ids, X_test_mask = prepare_bert_data(X_test_text, tokenizer)

                model.fit(
                    [X_train_ids, X_train_mask, X_train_lstm], y_train_fold,
                    validation_data=([X_test_ids, X_test_mask, X_test_lstm], y_test_fold),
                    epochs=3,
                    batch_size=32
                )
                val_pred_logits = model.predict([X_test_ids, X_test_mask, X_test_lstm])
                y_pred = (tf.nn.sigmoid(val_pred_logits[:, 0]).numpy().flatten() > 0.5).astype(int)
            elif model_name == 'LSTM':
                X_train_lstm = prepare_lstm_data(X_train_text, tokenizer)
                X_test_lstm = prepare_lstm_data(X_test_text, tokenizer)

                model.fit(
                    X_train_lstm, y_train_fold,
                    validation_data=(X_test_lstm, y_test_fold),
                    epochs=3,
                    batch_size=32
                )
                y_pred = model.predict(X_test_lstm)
                y_pred = (y_pred > 0.5).astype(int).flatten()

        else:
            X_train, X_test, y_train_fold, y_test_fold = train_test_split(features, y, test_size=0.2, random_state=42)
            model.fit(X_train, y_train_fold)
            y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test_fold, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_test_fold, y_pred, average='binary')
        print(f"{model_name} ({name}) - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")
        print("-" * 40)

# Part 8: Model Comparison and Prediction

In [12]:
# Step 10: Model comparison
def compare_models(models, X_test, y_test, test_indices):
    results = []
    for name, model in models.items():
        if name in ['BERT', 'Hybrid']:
            if name == 'BERT':
                X_test_processed = prepare_bert_data(df.loc[test_indices, 'processed_comment'], bert_tokenizer)
                predictions = model.predict(X_test_processed, batch_size=32)
                logits = predictions.logits
            elif name == 'Hybrid':
                X_test_bert = prepare_bert_data(df.loc[test_indices, 'processed_comment'], bert_tokenizer)
                X_test_lstm = prepare_lstm_data(df.loc[test_indices, 'processed_comment'], lstm_tokenizer)
                predictions = model.predict([X_test_bert[0], X_test_bert[1], X_test_lstm], batch_size=32)
                logits = predictions

            probabilities = tf.nn.sigmoid(logits[:, 0]).numpy().flatten()
            y_pred = (probabilities > 0.5).astype(int)
        elif name == 'LSTM':
            X_test_processed = prepare_lstm_data(df.loc[test_indices, 'processed_comment'], lstm_tokenizer)
            y_pred = model.predict(X_test_processed)
            y_pred = (y_pred > 0.5).astype(int).flatten()
        else:
            # Convert to dense if the model is SVM and the input is sparse
            if name == 'SVM' and sparse.issparse(X_test):
                X_test_dense = X_test.toarray()
                y_pred = model.predict(X_test_dense)
            else:
                y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
        results.append([name, accuracy, precision, recall, f1])

    headers = ["Model", "Accuracy", "Precision", "Recall", "F1-Score"]
    print(tabulate(results, headers=headers, floatfmt=".4f"))
    print("-" * 40)

# Step 11: Prediction for new input
def predict_sarcasm(models, text):
    processed_text = preprocess_text(text)

    results = []
    for name, model in models.items():
        if name == 'Random Forest' or name == 'SVM':
            tfidf_features = tfidf.transform([processed_text])
            tfidf_parent_features = tfidf_parent.transform([''])
            comment_length = len(text)
            parent_comment_length = 0

            subreddit_dummies_empty = pd.DataFrame([np.zeros(subreddit_dummies.shape[1])], columns=subreddit_dummies.columns)
            numerical_features = np.array([[comment_length, parent_comment_length]])

            features = sparse.hstack([tfidf_features, tfidf_parent_features, numerical_features, subreddit_dummies_empty])

            # Convert to dense if the model is SVM and the input is sparse
            if name == 'SVM' and sparse.issparse(features):
                features = features.toarray()

            prediction = model.predict(features)[0]

        elif name == 'BERT':
            input_ids, attention_mask = prepare_bert_data([processed_text], bert_tokenizer)
            prediction_logits = model.predict([input_ids, attention_mask]).logits
            prediction_prob = tf.nn.sigmoid(prediction_logits[:, 0]).numpy().flatten()
            prediction = 1 if prediction_prob > 0.5 else 0

        elif name == 'LSTM':
            sequence = prepare_lstm_data([processed_text], lstm_tokenizer)
            prediction_prob = model.predict(sequence)[0][0]
            prediction = 1 if prediction_prob > 0.5 else 0

        elif name == 'Hybrid':
            input_ids, attention_mask = prepare_bert_data([processed_text], bert_tokenizer)
            lstm_input = prepare_lstm_data([processed_text], lstm_tokenizer)
            prediction_logits = model.predict([input_ids, attention_mask, lstm_input])
            prediction_prob = tf.nn.sigmoid(prediction_logits[:, 0]).numpy().flatten()
            prediction = 1 if prediction_prob > 0.5 else 0

        results.append([name, "Sarcastic" if prediction == 1 else "Not sarcastic"])

    headers = ["Model", "Prediction"]
    print(tabulate(results, headers=headers))
    print("-" * 40)


# Part 9: Main Execution and Interactive Prediction

In [13]:
# Main execution
if __name__ == "__main__":
    # Train Random Forest
    print("Training Random Forest...")
    rf_model = train_random_forest(X_train, X_test, y_train, y_test)
    rf_pred = rf_model.predict(X_test)
    evaluate_model(y_test, rf_pred, "Random Forest")
    perform_cross_validation("Random Forest", rf_model, features.toarray(), df['label'])
    perform_ablation_studies("Random Forest", rf_model, features.toarray(), df['label'])

    # Train SVM
    print("\nTraining SVM...")
    svm_model = train_svm(X_train, X_test, y_train, y_test)
    svm_pred = svm_model.predict(X_test)
    evaluate_model(y_test, svm_pred, "SVM")

    # Train BERT
    print("\nTraining BERT...")
    bert_model, bert_tokenizer = train_bert(X_train_text, X_test_text, y_train, y_test)
    X_test_bert = prepare_bert_data(X_test_text, bert_tokenizer)
    bert_pred = bert_model.predict([X_test_bert[0], X_test_bert[1]])
    bert_pred = np.argmax(bert_pred.logits, axis=1)
    evaluate_model(y_test, bert_pred, "BERT")

    # Initialize LSTM tokenizer before using it
    lstm_tokenizer = tf.keras.preprocessing.text.Tokenizer()
    lstm_tokenizer.fit_on_texts(X_train_text)  # Assuming X_train_text is the training text data

    # Train LSTM
    print("\nTraining LSTM...")
    lstm_model, lstm_tokenizer = train_lstm(X_train_text, X_test_text, y_train, y_test)
    X_test_lstm = prepare_lstm_data(X_test_text, lstm_tokenizer)
    lstm_pred = lstm_model.predict(X_test_lstm)
    lstm_pred = (lstm_pred > 0.5).astype(int).flatten()
    evaluate_model(y_test, lstm_pred, "LSTM")
    perform_cross_validation("LSTM", lstm_model, X_train_text, y_train, tokenizer=lstm_tokenizer)
    perform_ablation_studies("LSTM", lstm_model, X_train_text, y_train, tokenizer=lstm_tokenizer)

    # Train Hybrid (BERT + LSTM)
    print("\nTraining Hybrid model...")
    X_train_bert = prepare_bert_data(X_train_text, bert_tokenizer)
    X_train_lstm = prepare_lstm_data(X_train_text, lstm_tokenizer)
    hybrid_model = train_hybrid(bert_model, lstm_model, X_train_bert, X_train_lstm, y_train)
    X_test_bert = prepare_bert_data(X_test_text, bert_tokenizer)
    X_test_lstm = prepare_lstm_data(X_test_text, lstm_tokenizer)
    hybrid_pred = hybrid_model.predict([X_test_bert[0], X_test_bert[1], X_test_lstm])
    hybrid_pred = (hybrid_pred > 0.5).astype(int).flatten()
    evaluate_model(y_test, hybrid_pred, "Hybrid (BERT + LSTM)")

    # Store all trained models in a dictionary
    models = {
        "Random Forest": rf_model,
        "SVM": svm_model,
        "BERT": bert_model,
        "LSTM": lstm_model,
        "Hybrid": hybrid_model
    }

    # Compare all models
    print("\nModel Comparison:")
    compare_models(models, X_test, y_test, test_indices)

Training Random Forest...
Random Forest Results:
Accuracy: 0.47619047619047616
Precision: 0.45
Recall: 1.0
F1-score: 0.6206896551724138
Confusion Matrix:
[[ 1 11]
 [ 0  9]]
----------------------------------------
Training fold 1/2 for Random Forest...
Fold 1 - Accuracy: 0.43137254901960786, Precision: 0.46153846153846156, Recall: 0.6923076923076923, F1: 0.553846153846154
----------------------------------------
Training fold 2/2 for Random Forest...
Fold 2 - Accuracy: 0.44, Precision: 0.46511627906976744, Recall: 0.8, F1: 0.5882352941176471
----------------------------------------

Random Forest Cross-Validation Results:
Mean Accuracy: 0.4356862745098039
Mean Precision: 0.4633273703041145
Mean Recall: 0.7461538461538462
Mean F1-Score: 0.5710407239819005
----------------------------------------

Ablation study: TF-IDF only
Random Forest (TF-IDF only) - Accuracy: 0.47619047619047616, Precision: 0.45, Recall: 1.0, F1: 0.6206896551724138
----------------------------------------

Ablation 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

BERT Results:
Accuracy: 0.42857142857142855
Precision: 0.42857142857142855
Recall: 1.0
F1-score: 0.6
Confusion Matrix:
[[ 0 12]
 [ 0  9]]
----------------------------------------

Training LSTM...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM Results:
Accuracy: 0.42857142857142855
Precision: 0.42857142857142855
Recall: 1.0
F1-score: 0.6
Confusion Matrix:
[[ 0 12]
 [ 0  9]]
----------------------------------------
Training fold 1/2 for LSTM...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 1 - Accuracy: 0.525, Precision: 0.525, Recall: 1.0, F1: 0.6885245901639345
----------------------------------------
Training fold 2/2 for LSTM...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Fold 2 - Accuracy: 0.525, Precision: 0.525, Recall: 1.0, F1: 0.6885245901639345
----------------------------------------

LSTM Cross-Validation Results:
Mean Accuracy: 0.525
Mean Precision: 0.525
Mean Recall: 1.0
Mean F1-Score: 0.6885245901639345
----------------------------------------

Ablation study: Full Text
Epoch 1/3
Epoch 2/

  _warn_prf(average, modifier, msg_start, len(result))


Hybrid (BERT + LSTM) Results:
Accuracy: 0.5714285714285714
Precision: 0.0
Recall: 0.0
F1-score: 0.0
Confusion Matrix:
[[12  0]
 [ 9  0]]
----------------------------------------

Model Comparison:


  _warn_prf(average, modifier, msg_start, len(result))


Model            Accuracy    Precision    Recall    F1-Score
-------------  ----------  -----------  --------  ----------
Random Forest      0.4762       0.4500    1.0000      0.6207
SVM                0.6190       0.5294    1.0000      0.6923
BERT               0.5714       0.0000    0.0000      0.0000
LSTM               0.4286       0.4286    1.0000      0.6000
Hybrid             0.4286       0.4286    1.0000      0.6000
----------------------------------------


User Interactive Prediction

In [14]:
    # Example of predicting sarcasm for a new input
    print("\nSarcasm Prediction for New Input:")
    new_text = "I absolutely love waiting in long lines at the DMV. It's the highlight of my day!"
    predict_sarcasm(models, new_text)

    # Interactive prediction
    while True:
        user_input = input("\nEnter a comment to check for sarcasm (or 'quit' to exit): ")
        if user_input.lower() == 'quit':
            break
        predict_sarcasm(models, user_input)

print("Program completed.")


Sarcasm Prediction for New Input:
Model          Prediction
-------------  -------------
Random Forest  Sarcastic
SVM            Sarcastic
BERT           Not sarcastic
LSTM           Sarcastic
Hybrid         Sarcastic
----------------------------------------

Enter a comment to check for sarcasm (or 'quit' to exit): We are working on this feature, it can not predict accurately. Thanks.
Model          Prediction
-------------  -------------
Random Forest  Sarcastic
SVM            Sarcastic
BERT           Not sarcastic
LSTM           Sarcastic
Hybrid         Sarcastic
----------------------------------------

Enter a comment to check for sarcasm (or 'quit' to exit): quit
Program completed.
