# Stack Overflow Topic Classification using RNN

## Imports

In [5]:
import torch
import pandas as pd
from torchtext import data
import warnings as wrn
wrn.filterwarnings('ignore')
print(torch.__version__)

2.0.0


## Load the data

In [18]:
#Load csv files
spark, ml, security = pd.read_csv('dataset/SO-Spark.csv'), pd.read_csv('dataset/SO-ML.csv'), pd.read_csv('dataset/SO-Security.csv')
#Add label columns
spark['Label'] = 'spark'
ml['Label'] = 'ml'
security['Label'] = 'security'

# Keep only 'title' and 'label' columns
spark_filtered = spark[['Title', 'Label']]
ml_filtered = ml[['Title', 'Label']]
security_filtered = security[['Title', 'Label']]

# Combine dataframes
df = pd.concat([spark_filtered, ml_filtered, security_filtered], ignore_index=True)

In [21]:
# Dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Title   150000 non-null  object
 1   Label   150000 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [25]:
# Dataframe target distribution
df['Label'].value_counts()

spark       50000
ml          50000
security    50000
Name: Label, dtype: int64

## Preprocess the data
Steps:
- Remove non-alphabetical characters
- Convert to lowercase
- Remove stopwords
- Lemmatize

In [52]:
import re
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin

# Create a custom transformer to preprocess text
class Preprecessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        
    def get_wordnet_pos(self, word):
        """Map POS tag to first character lemmatize() accepts."""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        return tag_dict.get(tag, wordnet.NOUN)
    
    def _preprocess(self, text):
        # Remove non-alphabetic characters
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Convert to lowercase
        text = text.lower()
        # Tokenize text
        tokens = word_tokenize(text)
        # Remove stopwords
        tokens = [token for token in tokens if token not in self.stop_words]
        # Lemmatize text with POS tagging
        tokens = [self.lemmatizer.lemmatize(token, self.get_wordnet_pos(token)) for token in tokens]
        # Join if the word isn't a blank space
        return ' '.join(tokens)

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X.apply(self._preprocess)
    
preprocessor = Preprecessor()

In [53]:
df['Clean_Title'] = preprocessor.transform(df['Title'])

In [54]:
# Show untruncated 10 rows
df

Unnamed: 0,Title,Label,Clean_Title
0,Different rlike behavior in Spark 1.6 and Spar...,spark,different rlike behavior spark spark
1,Getting a column as concatenated column from a...,spark,get column concatenate column reference table ...
2,Write data using JDBC connection to Azure SQL ...,spark,write data use jdbc connection azure sql db sc...
3,Get value from external client database for a ...,spark,get value external client database column valu...
4,How to setup Apache Spark to use local hard di...,spark,setup apache spark use local hard disk data fi...
...,...,...,...
149995,Getting the CurrentUserID from Websecurity dir...,security,get currentuserid websecurity directly login c...
149996,How to name images so that other image names c...,security,name image image name cant guess easily
149997,Only the owner can delete his/her books?,security,owner delete hisher book
149998,Samsung Adb after a few days becomes un-author...,security,samsung adb day becomes unauthorized killserve...


In [55]:
# export this dataframe to csv as SO-preprocessed.csv under datasets folder
import os
df.to_csv(os.path.join('dataset', 'SO-preprocessed.csv'), index=False)

In [63]:
# Build vocab
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')

def tokenize_iterator(data):
    for title in data:
        yield tokenizer(title)

vocab = build_vocab_from_iterator(tokenize_iterator(df['Clean_Title']))
# Show vocab
vocab

150000lines [00:01, 142526.50lines/s]


<torchtext.vocab.Vocab at 0x7fa896e1dca0>

In [61]:
# Numericalize the text data to feed into the model
def numericalize(text, tokenizer, vocab):
    tokens = tokenizer(text)
    return [vocab[token] for token in tokens]

df['Numericalized_Title'] = df['Clean_Title'].apply(lambda x: numericalize(x, tokenizer, vocab))

In [62]:
df

Unnamed: 0,Title,Label,Clean_Title,Numericalized_Title
0,Different rlike behavior in Spark 1.6 and Spar...,spark,different rlike behavior spark spark,"[52, 3458, 821, 2, 2]"
1,Getting a column as concatenated column from a...,spark,get column concatenate column reference table ...,"[13, 9, 822, 9, 684, 65, 1701, 205, 45]"
2,Write data using JDBC connection to Azure SQL ...,spark,write data use jdbc connection azure sql db sc...,"[67, 4, 3, 467, 170, 176, 29, 444, 17, 39, 275..."
3,Get value from external client database for a ...,spark,get value external client database column valu...,"[13, 12, 320, 116, 99, 9, 12, 48, 2, 213]"
4,How to setup Apache Spark to use local hard di...,spark,setup apache spark use local hard disk data fi...,"[644, 28, 2, 3, 180, 1322, 840, 4, 299, 1439, ..."
...,...,...,...,...
149995,Getting the CurrentUserID from Websecurity dir...,security,get currentuserid websecurity directly login c...,"[13, 19161, 2448, 761, 100, 7303]"
149996,How to name images so that other image names c...,security,name image image name cant guess easily,"[98, 44, 44, 98, 189, 2238, 2460]"
149997,Only the owner can delete his/her books?,security,owner delete hisher book,"[1639, 478, 11792, 2132]"
149998,Samsung Adb after a few days becomes un-author...,security,samsung adb day becomes unauthorized killserve...,"[9716, 5360, 828, 2073, 1189, 26007, 1074, 38]"


In [65]:
# Split data into train, validation and test sets
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Label'])
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42, stratify=train_df['Label'])

In [66]:
import torch
from torch.utils.data import Dataset, DataLoader

class StackOverflowDataset(Dataset):
    def __init__(self, df, label_to_idx):
        self.df = df
        self.label_to_idx = label_to_idx

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = torch.tensor(self.df.iloc[idx]['Numericalized_Title'], dtype=torch.long)
        label = torch.tensor(self.label_to_idx[self.df.iloc[idx]['Label']], dtype=torch.long)
        return text, label

def collate_batch(batch):
    texts, labels = zip(*batch)
    text_lens = [len(text) for text in texts]
    max_len = max(text_lens)
    padded_texts = [torch.cat([text, torch.tensor([vocab['<pad>']] * (max_len - len(text)), dtype=torch.long)]) for text in texts]
    return torch.stack(padded_texts), torch.tensor(labels, dtype=torch.long)

label_to_idx = {'spark': 0, 'ml': 1, 'security': 2}

train_dataset = StackOverflowDataset(train_df, label_to_idx)
val_dataset = StackOverflowDataset(val_df, label_to_idx)
test_dataset = StackOverflowDataset(test_df, label_to_idx)

batch_size = 64

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [67]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, _ = self.lstm(embedded)
        pooled = self.dropout(torch.mean(lstm_out, dim=1))
        logits = self.fc(pooled)
        return logits

In [68]:
vocab_size = len(vocab)
embed_dim = 128
hidden_dim = 128
num_classes = 3

model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes)

In [69]:
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for texts, labels in train_dataloader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(texts)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    # Validate the model
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for texts, labels in val_dataloader:
            texts, labels = texts.to(device), labels.to(device)
            logits = model(texts)
            loss = criterion(logits, labels)
            val_loss += loss.item()
    
    train_loss /= len(train_dataloader)
    val_loss /= len(val_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


Epoch 1/10 | Train Loss: 0.2869 | Val Loss: 0.1803
Epoch 2/10 | Train Loss: 0.1498 | Val Loss: 0.1540
Epoch 3/10 | Train Loss: 0.1075 | Val Loss: 0.1562
Epoch 4/10 | Train Loss: 0.0789 | Val Loss: 0.1716
Epoch 5/10 | Train Loss: 0.0584 | Val Loss: 0.1873
Epoch 6/10 | Train Loss: 0.0445 | Val Loss: 0.1965
Epoch 7/10 | Train Loss: 0.0353 | Val Loss: 0.2249
Epoch 8/10 | Train Loss: 0.0279 | Val Loss: 0.2235
Epoch 9/10 | Train Loss: 0.0238 | Val Loss: 0.2468
Epoch 10/10 | Train Loss: 0.0207 | Val Loss: 0.2798


In [70]:
from sklearn.metrics import classification_report

model.eval()
true_labels = []
pred_labels = []

with torch.no_grad():
    for texts, labels in test_dataloader:
        texts, labels = texts.to(device), labels.to(device)
        logits = model(texts)
        predictions = torch.argmax(logits, dim=-1)
        true_labels.extend(labels.tolist())
        pred_labels.extend(predictions.tolist())

print(classification_report(true_labels, pred_labels, target_names=label_to_idx.keys()))

              precision    recall  f1-score   support

       spark       0.94      0.94      0.94     10000
          ml       0.94      0.94      0.94     10000
    security       0.96      0.96      0.96     10000

    accuracy                           0.94     30000
   macro avg       0.94      0.94      0.94     30000
weighted avg       0.94      0.94      0.94     30000

