# Fine tuning for spam classification

This is a bit of a digression from the book, based on the [Vizuara course](https://www.youtube.com/watch?v=yZpy_hsC1bE).
I think the idea is to give a more gradual introduction to fine-tuning by starting with a simpler task.

In [19]:
import import_ipynb
import openai # type:ignore
import gpt # type:ignore
import pandas as pd
import urllib.request
import ssl
import zipfile
import os
from pathlib import Path


## Download and preprocess the UCI spam data

The fine folks at the University of California at Irvine have provided a nice little data set for SMS spam.
Let's download that and save it in a convenient CSV format.

In [None]:
# Remote location of the UCI SMS spam archive.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
# Local file name for the downloaded zip archive.
zip_path = "sms_spam_collection.zip"
# Directory the archive is unpacked into.
extracted_path = "sms_spam_collection"
# Final location of the tab-separated data file inside that directory.
data_file_path = Path(extracted_path, "SMSSpamCollection.tsv")

# NOTE: on 6/21/25 the UCI archive server was unreachable, so I downloaded this
# file manually from a mirror.
def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download the spam archive, unpack it, and rename the data file.

    Does nothing (beyond printing a message) when ``data_file_path`` already
    exists, so the function is safe to call repeatedly.

    Args:
        url: Location of the zip archive to download.
        zip_path: Local path the downloaded archive is written to.
        extracted_path: Directory the archive is extracted into. The archive
            is expected to contain a member named ``SMSSpamCollection``.
        data_file_path: Final path for the extracted data file. May be a
            ``str`` or a ``pathlib.Path``.

    Returns:
        None.
    """
    # Accept plain strings as well as Path objects.
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # NOTE(review): this context disables TLS certificate verification and
    # relies on a private `ssl` helper, so the download is not authenticated.
    # Kept as-is so existing setups keep working; prefer
    # `ssl.create_default_context()` wherever certificates validate correctly.
    ssl_context = ssl._create_unverified_context()

    with urllib.request.urlopen(url, context=ssl_context) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    # The target may live outside `extracted_path`; make sure its directory
    # exists before moving the file there.
    data_file_path.parent.mkdir(parents=True, exist_ok=True)
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")


sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.


The data set contains 4825 ham (i.e., legitimate) messages and only 747 spam messages. Since we want an equal number of both, we'll randomly sample 747 ham messages and discard the rest.

In [21]:
def create_balanced_dataset(df):
    """Return a copy of ``df`` with as many ham rows as spam rows.

    All rows labelled ``"spam"`` are kept; the same number of ``"ham"`` rows
    is drawn at random with a fixed seed so the result is reproducible.

    Args:
        df: DataFrame with a ``"Label"`` column holding ``"ham"`` / ``"spam"``.

    Returns:
        A new DataFrame with the sampled ham rows followed by all spam rows.
        Original index values are preserved.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` if there are fewer ham
            rows than spam rows.
    """
    # Work on the frame that was passed in rather than re-reading the file
    # from a module-level path.
    spam_rows = df[df["Label"] == "spam"]
    num_spam = spam_rows.shape[0]
    ham_subset = df[df["Label"] == "ham"].sample(num_spam, random_state=123)
    balanced_df = pd.concat([ham_subset, spam_rows])
    return balanced_df

Now we want to create the following splits:
- 70% for training
- 10% for validation
- 20% for testing

In [22]:
def random_split(df, train_frac, validation_frac):
    """Shuffle ``df`` and cut it into train / validation / test partitions.

    The shuffle uses a fixed seed, so repeated calls on the same input give
    the same partitions. Whatever remains after the train and validation
    portions becomes the test set.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows assigned to the training set.
        validation_frac: Fraction of rows assigned to the validation set.

    Returns:
        A ``(train_df, validation_df, test_df)`` tuple of DataFrames.
    """
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    total_rows = len(shuffled)

    # Row positions at which the training and validation partitions stop.
    train_stop = int(total_rows * train_frac)
    validation_stop = train_stop + int(total_rows * validation_frac)

    return (
        shuffled.iloc[:train_stop],
        shuffled.iloc[train_stop:validation_stop],
        shuffled.iloc[validation_stop:],
    )

In [None]:
def save_csv():
    """Build the balanced spam data set and write the three splits to disk.

    Steps:
      1. Make sure the raw data file is present (downloading it if needed).
      2. Load it and balance the ham / spam classes.
      3. Encode the labels as integers (ham -> 0, spam -> 1).
      4. Split 70% / 10% / 20% into train / validation / test.
      5. Write ``train.csv``, ``validation.csv`` and ``test.csv`` (without the
         index column) to the current working directory.
    """
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
    df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
    balanced_df = create_balanced_dataset(df)
    balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})
    train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

    # `index=False` is the documented way to omit the index column.
    for split_df, csv_name in (
        (train_df, "train.csv"),
        (validation_df, "validation.csv"),
        (test_df, "test.csv"),
    ):
        split_df.to_csv(csv_name, index=False)

# Uncomment to generate train.csv, validation.csv, and test.csv if you haven't saved them yet
# save_csv()