In [6]:
!pip install transformers datasets scikit-learn pandas matplotlib seaborn




[notice] A new release of pip is available: 24.0 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import pandas as pd
import numpy as np
import re
import ast
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from transformers import AutoTokenizer, AutoModel
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Path is resolved relative to the notebook's working directory.
DATA_PATH = 'SA_SubTxt_fn.csv'
df = pd.read_csv(DATA_PATH)

# Fail fast with a clear message if the columns read by later cells are absent.
missing_columns = {'data', 'label'} - set(df.columns)
assert not missing_columns, f"{DATA_PATH} is missing required columns: {sorted(missing_columns)}"

# Clean the data column (it contains strings of lists)
def extract_text(text):
    """Return the first element of a stringified list, or the input unchanged.

    Parameters
    ----------
    text : object
        A cell value. Strings that start with '[' are parsed as a Python
        literal; anything else (other strings, None, NaN, numbers) is passed
        through untouched.

    Returns
    -------
    object
        The first element of the parsed list when parsing succeeds and the
        list is non-empty; otherwise the original ``text`` value.
    """
    # Non-string values (e.g. NaN for missing cells) have nothing to unwrap.
    if not isinstance(text, str) or not text.startswith('['):
        return text

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a valid Python literal: keep the raw string rather than failing.
        return text

    # An empty list has no first element; fall back to the raw string.
    if isinstance(parsed, (list, tuple)) and parsed:
        return parsed[0]
    return text

# Unwrap every raw 'data' entry into its message text.
raw_entries = df['data']
df['clean_text'] = raw_entries.apply(extract_text)

# Multi-class labelling: label 1 stays 'Spam'; every other row is bucketed by the
# first keyword group that matches its text, falling back to 'Personal'.
def map_multi_class(row):
    """Assign one of 'Spam', 'Support', 'Promotions' or 'Personal' to a row.

    Parameters
    ----------
    row : mapping
        Must provide ``row['clean_text']`` and ``row['label']``.

    Returns
    -------
    str
        'Spam' when the label equals 1. Otherwise the name of the first
        keyword group (checked in the order Support, Promotions) that has a
        keyword occurring as a substring of the lower-cased text, or
        'Personal' when no keyword matches.
    """
    message = str(row['clean_text']).lower()

    if row['label'] == 1:
        return 'Spam'

    # Order matters: 'Support' keywords take precedence over 'Promotions'.
    keyword_groups = (
        ('Support', ('support', 'help', 'issue', 'ticket', 'bug')),
        ('Promotions', ('sale', 'offer', 'discount', 'price', 'promo')),
    )
    for category, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in message:
                return category

    return 'Personal'

# Label every row, then report how many rows landed in each category.
df['category'] = df.apply(map_multi_class, axis=1)

category_counts = df['category'].value_counts()
print("Label Distribution:", category_counts, sep="\n")

Label Distribution:
category
Personal      2635
Spam          1896
Support       1349
Promotions     166
Name: count, dtype: int64


In [9]:
def preprocess_text(text):
    """Normalise a message to lower-case letters and whitespace only.

    Parameters
    ----------
    text : object
        The message. ``None`` and NaN are treated as an empty message; any
        other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The lower-cased text with URLs, @mentions, '#' symbols and every
        character other than a-z or whitespace removed. Whitespace is not
        collapsed, so removed tokens can leave consecutive spaces behind.
    """
    # Missing values (None / float NaN) carry no text; NaN is the only float != itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove @mentions and the '#' symbol (the tag word is kept)
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special chars and numbers
    return text

df['processed_text'] = df['clean_text'].apply(preprocess_text)

# Stratify on the target so each class keeps the same proportion in the train and
# test splits; without it the small 'Promotions' class can be unevenly represented.
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'],
    df['category'],
    test_size=0.2,
    random_state=42,
    stratify=df['category'],
)

In [10]:
# Learn the TF-IDF vocabulary (capped at 5000 terms) from the training split only,
# then project the test split onto that same vocabulary.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)

In [11]:
# Train a 100-tree random forest on the TF-IDF features (fit() returns the estimator),
# then predict labels for the held-out split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)

In [12]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

def get_embeddings(text_list, batch_size=32):
    """Embed texts with the module-level ``tokenizer`` / ``model`` via masked mean pooling.

    Parameters
    ----------
    text_list : pandas.Series, numpy.ndarray or iterable of str
        The texts to embed. Objects exposing ``.tolist()`` are converted with
        it; any other iterable is materialised with ``list()``.
    batch_size : int, default 32
        Number of texts sent through the model per forward pass, which bounds
        peak memory for large inputs.

    Returns
    -------
    numpy.ndarray
        One row per input text; each row is the mean of the last hidden states
        over that text's real (non-padding) tokens.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    texts = text_list.tolist() if hasattr(text_list, "tolist") else list(text_list)
    if not texts:
        # Nothing to embed: return an empty matrix with the encoder's width.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        inputs = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt",
        ).to(model.device)  # keep inputs on the same device as the model

        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state

        # Padding positions have attention_mask == 0; exclude them from the mean so
        # an embedding does not depend on how long the other texts in the batch are.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = (hidden * mask).sum(dim=1) / token_counts
        pooled_batches.append(pooled.cpu().numpy())

    return np.concatenate(pooled_batches, axis=0)

# Embed only the first `sample_size` test texts to keep this demonstration fast;
# larger inputs should be processed in batches.
sample_size = 500
X_test_embeddings = get_embeddings(X_test.iloc[:sample_size])
print("GenAI Embeddings Shape:", X_test_embeddings.shape)

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

GenAI Embeddings Shape: (500, 768)
