<a href="https://colab.research.google.com/github/jonsol/small_projects/blob/master/fcc_sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries
import tensorflow as tf
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

# Print the version string of the imported TensorFlow package.
print(tf.__version__)

In [None]:
# get data files
# Fetch the training and validation TSV files from the freeCodeCamp CDN.
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# File names of the two downloads. The "valid" file is the split that the rest
# of this notebook calls the test set.
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# Read each TSV split into a DataFrame (header=None: the first line is data,
# so column names are supplied here), then encode the text labels as
# integers: ham -> 0, spam -> 1.
train_dataset = pd.read_csv(
    train_file_path,
    sep='\t',
    header=None,
    names=['label', 'message'],
)
train_dataset['label'] = train_dataset['label'].map({'ham': 0, 'spam': 1})

test_dataset = pd.read_csv(
    test_file_path,
    sep='\t',
    header=None,
    names=['label', 'message'],
)
test_dataset['label'] = test_dataset['label'].map({'ham': 0, 'spam': 1})

In [None]:
# Preview the first rows of the training frame after label encoding.
train_dataset.head()

In [None]:
# Pull the integer labels out of each frame as arrays.
train_labels = train_dataset["label"].values
test_labels = test_dataset["label"].values

# Wrap each split as a tf.data.Dataset yielding one (message, label) pair per row.
train_ds = tf.data.Dataset.from_tensor_slices(
    (train_dataset["message"].values, train_labels),
)
test_ds = tf.data.Dataset.from_tensor_slices(
    (test_dataset["message"].values, test_labels),
)

In [None]:
# Show the first rows of the training frame again (unchanged by the dataset
# construction in the previous cell).
train_dataset.head()

In [None]:
# Pipeline settings: examples per batch and size of the shuffle buffer.
BATCH_SIZE = 128
BUFFER_SIZE = 1000

# Training split: shuffle individual examples, then batch and prefetch.
train_ds = train_ds.shuffle(BUFFER_SIZE)
train_ds = train_ds.batch(BATCH_SIZE)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

# Evaluation split: same batching, but no shuffling.
test_ds = test_ds.batch(BATCH_SIZE)
test_ds = test_ds.prefetch(tf.data.AUTOTUNE)

print("Batch size:", BATCH_SIZE)
print("Buffer size:", BUFFER_SIZE)

In [None]:
# Display the batched training dataset object (its repr is the cell output).
train_ds

In [None]:
# Text vectorization: turn each message into a sequence of 500 integer token
# ids, with the vocabulary capped at 2000 entries.
vec = tf.keras.layers.TextVectorization(
    max_tokens=2000,
    output_mode='int',
    output_sequence_length=500,
)

# Build the vocabulary from the training messages only; labels are dropped.
vec.adapt(train_ds.map(lambda message, _label: message))

In [None]:
# Inspect the first 32 entries of the vocabulary learned by the vectorizer.
vocab = np.array(vec.get_vocabulary())
vocab[:32]

In [None]:
# Model: raw message strings in, one spam logit per message out.
model = tf.keras.Sequential()
model.add(vec)  # raw text -> integer token ids
model.add(
    tf.keras.layers.Embedding(
        input_dim=len(vec.get_vocabulary()),  # size of the vocabulary
        output_dim=64,  # dimension of each dense embedding vector
        mask_zero=True,
    )
)
# Two stacked bidirectional LSTMs; the first returns the full sequence so the
# second has a sequence to consume.
model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )
)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(1))  # no activation: the output is a logit

# The loss is told to expect logits, matching the activation-free output layer.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Number of validation batches to run per epoch. Ceiling division is used so
# the final, partially filled batch is counted; flooring would leave the
# messages in that batch out of every validation pass.
val_steps = (len(test_dataset) + BATCH_SIZE - 1) // BATCH_SIZE
val_steps

In [None]:
# Train for 10 epochs, validating on the held-out split after each one.
history = model.fit(
    train_ds,
    epochs=10,
    validation_data=test_ds,
    validation_steps=val_steps,
)

In [None]:
# Score the trained model on the held-out split and report loss and accuracy.
test_loss, test_acc = model.evaluate(test_ds)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')

In [None]:
# Prediction helper that always returns a list of (probability, label) tuples.
# NOTE: a later cell defines `predict_message` again; when the notebook is run
# top to bottom, that later definition replaces this one. With this version a
# single-string call returns a one-element list, so indexing the result with
# [1] (as the challenge test cell does) would be out of range.
def predict_message(model, texts, vectorizer=None, threshold=0.5):
    """
    Predict spam probabilities and labels for one or more messages.

    Args:
        model: Trained Keras model that outputs one logit per message.
        texts: A single str, a list of str, or a list whose items are lists of
            tokens (each token list is joined with spaces into one message).
        vectorizer: Optional TextVectorization layer. It is applied to the
            input only when `model` has no TextVectorization layer of its own.
        threshold: Probability at or above which a message is labelled "spam";
            anything below is labelled "ham". Defaults to 0.5.

    Returns:
        List of tuples (probability, label), one per message in input order.
        An empty list is returned when `texts` contains no messages.
    """
    # Normalise every accepted input shape to a flat list of strings.
    if isinstance(texts, str):
        messages = [texts]
    else:
        messages = [" ".join(t) if isinstance(t, list) else t for t in texts]

    # tf.constant([]) would produce a float tensor, which a text model cannot
    # consume, so answer empty input directly.
    if not messages:
        return []

    text_tensor = tf.constant(messages)

    # Vectorize here only if the model does not already do so internally.
    if vectorizer is not None and not any(
        isinstance(layer, tf.keras.layers.TextVectorization)
        for layer in model.layers
    ):
        text_tensor = vectorizer(text_tensor)

    # The model emits logits; squash them into probabilities.
    logits = model.predict(text_tensor)
    probs = tf.sigmoid(logits).numpy().flatten()

    # Cast to built-in float so callers get plain Python values.
    return [
        (float(prob), "spam" if prob >= threshold else "ham")
        for prob in probs
    ]

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(model, texts, vectorizer=None, threshold=0.5):
    """
    Predict spam probabilities and labels for one or more messages.

    Args:
        model: Trained Keras model that outputs one logit per message.
        texts: A single str, a list of str, or a list whose items are lists of
            tokens (each token list is joined with spaces into one message).
        vectorizer: Optional TextVectorization layer. It is applied to the
            input only when `model` has no TextVectorization layer of its own.
        threshold: Probability at or above which a message is labelled "spam";
            anything below is labelled "ham". Defaults to 0.5.

    Returns:
        - Single tuple (probability, label) if input is a single string
        - List of tuples [(probability, label), ...] otherwise, in input
          order (an empty list when `texts` contains no messages)
    """
    # Remember whether the caller passed one bare string, so the result can be
    # unwrapped again at the end.
    single_input = isinstance(texts, str)
    if single_input:
        messages = [texts]
    else:
        # Plain strings pass through; token lists are joined into one string.
        messages = [" ".join(t) if isinstance(t, list) else t for t in texts]

    # tf.constant([]) would produce a float tensor, which a text model cannot
    # consume, so answer empty input directly.
    if not messages:
        return []

    text_tensor = tf.constant(messages)

    # Vectorize here only if the model does not already do so internally.
    if vectorizer is not None and not any(
        isinstance(layer, tf.keras.layers.TextVectorization)
        for layer in model.layers
    ):
        text_tensor = vectorizer(text_tensor)

    # The model emits logits; squash them into probabilities.
    logits = model.predict(text_tensor)
    probs = tf.sigmoid(logits).numpy().flatten()

    # Cast to built-in float so callers get plain Python values.
    prediction = [
        (float(prob), "spam" if prob >= threshold else "ham")
        for prob in probs
    ]

    return prediction[0] if single_input else prediction

In [None]:
# Quick manual check: classify one message and print the result.
# NOTE: despite its name, `prob` holds whatever predict_message returns for a
# single string, not just the probability.
text = 'How are you doing today?'
prob = predict_message(model,text)
print(prob)

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check `predict_message` against seven fixed messages with known labels.

  Prints the success message only if, for every message, element [1] of the
  value returned by `predict_message(model, msg)` equals the expected
  "ham"/"spam" answer; otherwise prints the retry message.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each message above, in the same order.
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # Messages are classified one at a time; any mismatch fails the whole run.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(model, msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
