# Stack Overflow LSTM Multi-label Tag Prediction

This notebook demonstrates an end-to-end pipeline for multi-label prediction of the 10 most frequent Stack Overflow tags from a question's title and body, using an LSTM model. It is intended for large datasets and is ready for GitHub submission.

In [1]:
# 1. Imports and Setup
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, hamming_loss, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
import pickle
import tensorflow as tf
import os
print('TensorFlow version:', tf.__version__)

# Seed the NumPy and TensorFlow global generators so that weight
# initialisation and dropout start from the same state on every
# Restart & Run All. (Some GPU kernels may still add small run-to-run noise.)
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)


## 2. Data Loading and Filtering
- Load questions and tags
- Find the 10 most frequent tags
- Keep only the questions that carry at least one of those tags

In [2]:
# Read the raw CSVs (adjust the paths to your local copies).
# Only the question columns used downstream are loaded.
questions = pd.read_csv('Questions.csv', usecols=['Id', 'Title', 'Body'])
tags = pd.read_csv('Tags.csv')

# Rank tags by frequency and keep the names of the 10 most common ones.
tag_counts = tags['Tag'].value_counts()
top_tags = tag_counts.index[:10].tolist()

# Tag rows that belong to one of the top tags ...
is_top_tag = tags['Tag'].isin(top_tags)
filtered_tags = tags.loc[is_top_tag]

# ... and the questions those rows point at (copied so that later column
# assignments do not write into a view of `questions`).
question_ids = filtered_tags['Id'].unique()
has_top_tag = questions['Id'].isin(question_ids)
filtered_questions = questions.loc[has_top_tag].copy()


## 3. Text Preprocessing
- Strip HTML from the body, lowercase, and replace every non-alphabetic character with a space
- Combine title and body

In [3]:
def strip_html(text):
    """Return the visible text of an HTML fragment.

    Text nodes are joined with a single space. A null value (as judged by
    ``pd.notnull``) yields an empty string instead of being parsed.
    """
    if pd.notnull(text):
        soup = BeautifulSoup(text, "lxml")
        return soup.get_text(separator=" ")
    return ""

def _clean_text(text_series):
    """Normalise a Series of strings for tokenization.

    Lowercases, replaces every character that is not a-z or whitespace with a
    space, collapses whitespace runs to one space and trims both ends.
    """
    return (
        text_series
        .str.lower()
        .str.replace(r'[^a-z\s]', ' ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )


# Fill missing values BEFORE casting: astype(str) on its own turns a missing
# Title/Body into the literal text "nan", which would then be tokenized as a
# real word.
filtered_questions['Title'] = filtered_questions['Title'].fillna('').astype(str)
filtered_questions['Body'] = filtered_questions['Body'].fillna('').astype(str)

# Only the body is run through the HTML stripper; titles are used as-is.
filtered_questions['Body_no_html'] = filtered_questions['Body'].map(strip_html)

filtered_questions['Cleaned_Title'] = _clean_text(filtered_questions['Title'])
filtered_questions['Cleaned_Body'] = _clean_text(filtered_questions['Body_no_html'])

# Model input: cleaned title followed by cleaned body, separated by one space.
filtered_questions['Combined_Text'] = (
    filtered_questions['Cleaned_Title'] + " " + filtered_questions['Cleaned_Body']
)


## 4. Multi-label Target Preparation
- Group tags per question
- Binarize the tag lists into a multi-hot target matrix (one 0/1 column per top tag)

In [4]:
# Collect every top tag of a question into one list per question Id.
grouped_tags = filtered_tags.groupby('Id')['Tag'].apply(list)
tags_per_question = grouped_tags.reset_index()


def _keep_top_tags(tag_list):
    """Return the entries of ``tag_list`` that are in ``top_tags``, in order."""
    return [tag for tag in tag_list if tag in top_tags]


tags_per_question['Tag'] = tags_per_question['Tag'].apply(_keep_top_tags)

# Drop questions whose tag list ended up empty.
has_any_tag = tags_per_question['Tag'].map(len) > 0
tags_per_question = tags_per_question[has_any_tag]

# Attach the tag lists to the cleaned questions; questions without a row in
# tags_per_question are dropped by the inner join.
questions_targets = filtered_questions.merge(
    tags_per_question, on='Id', how='inner'
)

# Multi-hot encode the tag lists; passing `classes` fixes the column order to
# that of top_tags.
mlb = MultiLabelBinarizer(classes=top_tags)
tag_matrix = mlb.fit_transform(questions_targets['Tag'])
tag_df = pd.DataFrame(
    tag_matrix,
    index=questions_targets.index,
    columns=mlb.classes_,
)

# Question columns and tag indicator columns side by side, without the raw
# list column.
final_df = pd.concat(
    [questions_targets.reset_index(drop=True), tag_df],
    axis=1,
).drop(columns=['Tag'])

## 5. Tokenization and Padding for LSTM

In [5]:
# Vocabulary size kept by the tokenizer and fixed sequence length fed to the model.
max_words = 10000
max_len = 200

texts = final_df['Combined_Text'].astype(str).tolist()

# Words outside the kept vocabulary are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

# Integer-encode every text, then pad / cut at the END so each row has
# exactly max_len entries.
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='post',
    truncating='post',
)

# Targets: the tag indicator columns of final_df, as float32.
tag_columns = top_tags
y = final_df[tag_columns].to_numpy().astype(np.float32)

## 6. Train/Validation Split

In [6]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
split_arrays = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_val, y_train, y_val = split_arrays

## 7. LSTM Model Building and Training

In [7]:
# Embedding -> single LSTM layer -> dropout -> dense head.
model = Sequential([
    Embedding(max_words, 64, input_length=max_len),
    LSTM(64),
    Dropout(0.5),
    Dense(64, activation='relu'),
    # One sigmoid unit per tag: each tag is an independent yes/no decision.
    Dense(len(tag_columns), activation='sigmoid')
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    # Name the per-label metric explicitly. The generic 'accuracy' string is
    # resolved by Keras at compile time and, depending on the Keras version,
    # can become categorical (argmax) accuracy for a multi-unit output, which
    # is meaningless for multi-label targets.
    metrics=['binary_accuracy']
)
model.summary()

In [8]:
# Train for 5 epochs in mini-batches of 64, scoring the held-out split after
# every epoch; the returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=64,
    epochs=5,
)

## 8. Evaluation

In [9]:
# Per-tag probabilities for the validation rows, thresholded at 0.5 into
# hard 0/1 decisions.
y_pred_probs = model.predict(X_val)
y_pred = np.greater(y_pred_probs, 0.5).astype(int)

# Per-tag precision / recall / F1.
report = classification_report(y_val, y_pred, target_names=tag_columns)
print(report)

# Fraction of individual tag decisions that are wrong.
hamming = hamming_loss(y_val, y_pred)
print(f"Hamming Loss: {hamming:.4f}")

# Fraction of questions whose full tag set is predicted exactly.
subset_acc = accuracy_score(y_val, y_pred)
print(f"Subset Accuracy: {subset_acc:.4f}")

## 9. Save Model and Preprocessing Objects

In [10]:
# Persist the trained network.
model.save('stack_overflow_lstm_model.h5')

# Persist the fitted preprocessing objects next to it so inference can reuse
# the same vocabulary and tag order.
for pkl_path, fitted_obj in (('tokenizer.pkl', tokenizer), ('mlb.pkl', mlb)):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_obj, f)

## 10. Inference Example

In [11]:
# Example: Predict tags for a new question
def predict_tags(text, max_len=200):
    """Predict tags for a single question text.

    The text is normalised the same way as the training corpus (lowercased,
    every character that is not a-z or whitespace replaced by a space,
    whitespace collapsed) and is padded / truncated at the end, matching how
    the training sequences were built. Using a different padding side or
    unnormalised text would feed the model inputs laid out differently from
    anything it was trained on.

    Args:
        text: Question text (title and/or body). HTML is not stripped here;
            pass plain text.
        max_len: Sequence length to pad / truncate to. Should equal the length
            used when building the training sequences.

    Returns:
        The output of ``mlb.inverse_transform`` for the thresholded
        prediction: a list holding one tuple of tag names for the input text.
    """
    # Same normalisation as the Cleaned_Title / Cleaned_Body columns.
    cleaned = re.sub(r'[^a-z\s]', ' ', str(text).lower())
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    seq = tokenizer.texts_to_sequences([cleaned])
    # pad_sequences defaults to 'pre' for both options; training used 'post'.
    padded = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    pred = model.predict(padded)
    predicted = mlb.inverse_transform((pred > 0.5).astype(int))
    return predicted

example = "How do I connect to SQL Server using C#?"
print("Predicted tags:", predict_tags(example))

## 11. Conclusion
- This notebook provides a full pipeline for multi-label tag prediction using LSTM.
- Artifacts are saved for reproducibility and deployment.
- Ready for GitHub submission.