In [15]:
import pandas as pd
import re
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path

In [24]:
# Input/output folders, expressed relative to the current working directory.
data_dir, model_dir = Path("../data"), Path("../models")

# Create both folders (and any missing parents) up front; exist_ok makes
# re-running this cell harmless when they are already there.
data_dir.mkdir(exist_ok=True, parents=True)
model_dir.mkdir(exist_ok=True, parents=True)

In [25]:
# Shared NLP helpers used by clean_text below.
# The lemmatizer is created once and reused for every token.
lemmatizer = WordNetLemmatizer()
# English stop-word list held as a set so membership checks are hash lookups.
stop_words = {*stopwords.words("english")}

In [26]:
def clean_text(text):
    """Normalise one raw message into a space-separated string of lemmatised tokens.

    Steps, in order:
      1. lower-case the text;
      2. delete every "subject:" marker (optional whitespace before the colon);
      3. delete substrings that start with "http" or "www" up to the next whitespace;
      4. delete any run of non-space characters containing "@" (plus one trailing
         whitespace character, if present);
      5. delete every character that is neither a word character nor whitespace;
      6. collapse whitespace runs to a single space and trim the ends;
      7. drop tokens found in ``stop_words`` and lemmatise the rest with ``lemmatizer``.

    Parameters
    ----------
    text : Any
        The raw message. Anything that is not a ``str`` (e.g. NaN from pandas)
        is treated as missing.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when the input is not
        a string or nothing survives the cleaning.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs, applied sequentially to the lower-cased text.
    substitutions = (
        (r"subject\s*:", ''),                # "subject:" marker
        (r"http\S+|www\S+|https\S+", ''),    # links
        (r"\S*@\S*\s?", ''),                 # e-mail addresses
        (r"[^\w\s]", ''),                    # punctuation (letters, digits, "_" are kept)
        (r'\s+', ' '),                       # whitespace runs -> single space
    )
    normalised = text.lower()
    for pattern, replacement in substitutions:
        normalised = re.sub(pattern, replacement, normalised)

    kept_tokens = []
    for token in normalised.strip().split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return " ".join(kept_tokens)


In [32]:
# Load the raw spam/ham dataset, decoded as latin-1.
# The path is derived from data_dir so this cell follows the folder configured
# near the top of the notebook instead of repeating the "../data" literal.
df = pd.read_csv(data_dir / "spam.csv", encoding="latin-1")

In [29]:
# Preview the first rows of the loaded frame to check which columns it contains.
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,,spam,"subject: Bulk supply\r\nATTN. Director, We ...",1
1,,spam,subject: Ugkehkddwkwhfwfw fiehfawsfjefiewhufge...,1
2,,spam,subject: ghf095\r\n5eqtjf,1
3,,ham,subject: Want a birthday shoot for my little b...,0
4,,ham,subject: Hi\r\nHello!,0


In [33]:
# Add a "clean_text" column holding the normalised form of every message;
# clean_text turns non-string cells into "".
df["clean_text"] = df["text"].map(clean_text)

In [34]:
# Preview the first rows again to compare "text" with the new "clean_text" column.
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num,clean_text
0,,spam,"subject: Bulk supply\r\nATTN. Director, We ...",1,bulk supply attn director interested product p...
1,,spam,subject: Ugkehkddwkwhfwfw fiehfawsfjefiewhufge...,1,ugkehkddwkwhfwfw fiehfawsfjefiewhufgewhguwjgw ...
2,,spam,subject: ghf095\r\n5eqtjf,1,ghf095 5eqtjf
3,,ham,subject: Want a birthday shoot for my little b...,0,want birthday shoot little brother one rose da...
4,,ham,subject: Hi\r\nHello!,0,hi hello


In [35]:
# TF-IDF features over the cleaned messages; max_features caps the vocabulary size at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)

# Model inputs (cleaned text) and targets (the "label" column).
X_raw, y = df["clean_text"], df["label"]

# Learn the vocabulary / IDF weights and transform the corpus in one pass.
X = vectorizer.fit_transform(X_raw)

In [36]:
# Persist the artefacts produced above. Paths are built from data_dir / model_dir
# (the folders created earlier in the notebook) rather than repeating the
# "../data" and "../models" literals, so changing the config cell moves every output.

# Feature matrix: one column per vocabulary term.
# NOTE: X.toarray() materialises the whole matrix densely before writing it as CSV;
# this can get large for big corpora.
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out()).to_csv(
    data_dir / "vectorized.csv", index=False
)

# Labels, in the same row order as the feature matrix.
df[["label"]].to_csv(data_dir / "labels.csv", index=False)

# Fitted vectorizer, so the same vocabulary and weights can be reused later.
# Only unpickle this file from a trusted location: pickle.load can execute arbitrary code.
with open(model_dir / "tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)