In [62]:

# Imports

import html
import os
import re
import sys

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

nltk.download("stopwords")  # run only once
stop_words = set(stopwords.words("english"))

sys.path.append("../src")

from text_cleaner import clean_dataframe

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vamshipendyala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:

# Folder locations; the file paths below are built from these.
RAW_DATA_DIR = "../data/raw"
PROCESSED_DATA_DIR = "../data/processed"

# Raw input CSVs.
QUESTIONS_PATH = RAW_DATA_DIR + "/Questions.csv"
ANSWERS_PATH = RAW_DATA_DIR + "/Answers.csv"
TAGS_PATH = RAW_DATA_DIR + "/Tags.csv"

# Cleaned output CSVs.
QUESTIONS_OUTPUT_PATH = PROCESSED_DATA_DIR + "/questions_clean.csv"
ANSWERS_OUTPUT_PATH = PROCESSED_DATA_DIR + "/answers_clean.csv"

# Rows read per chunk by the chunk-wise cleaning loops (the files are large).
CHUNK_SIZE = 50_000


Counting the rows in the questions, answers, and tags files

In [None]:

# Count question rows by streaming the CSV, so the whole file is never held in memory.
question_count = 0
for question_chunk in pd.read_csv(QUESTIONS_PATH, encoding="latin1", chunksize=100000):
    question_count += len(question_chunk)

question_count


In [None]:
# Count answer rows by streaming the CSV, so the whole file is never held in memory.
answer_count = 0
for answer_chunk in pd.read_csv(ANSWERS_PATH, encoding="latin1", chunksize=100000):
    answer_count += len(answer_chunk)

answer_count


In [None]:
# Count tag rows by streaming the CSV, so the whole file is never held in memory.
tag_count = 0
for tag_chunk in pd.read_csv(TAGS_PATH, encoding="latin1", chunksize=100000):
    tag_count += len(tag_chunk)

tag_count


In [None]:
# Summarise the row counts of the three raw files in one small table.
dataset_summary = pd.DataFrame({
    "Type": ["Questions", "Answers", "Tags"],  # was misspelled "TYpe"
    "Count": [question_count, answer_count, tag_count]
})

dataset_summary


In [None]:
# Peek at the answers file. The frame is named for what it holds (answers);
# only a handful of rows are read, so this stays cheap.
sample_df_ans = pd.read_csv(
    ANSWERS_PATH,
    encoding="latin1",
    nrows=5
)

sample_df_ans.head(1)


In [None]:
# Load a five-row preview of the questions file and report its dimensions.
sample_df_ques = pd.read_csv(QUESTIONS_PATH, nrows=5, encoding="latin1")

sample_df_ques.shape


In [None]:
import warnings
from bs4 import XMLParsedAsHTMLWarning

# Silence bs4's XMLParsedAsHTMLWarning for the rest of the session.
warnings.simplefilter("ignore", XMLParsedAsHTMLWarning)

In [None]:
# Read the raw tags through the shared path constant and with the same encoding
# as the other raw files. Only empty fields are treated as missing: pandas'
# default NA markers would otherwise turn literal tags such as "null" or "nan"
# into NaN.
tags_df = pd.read_csv(
    TAGS_PATH,
    encoding="latin1",
    keep_default_na=False,
    na_values=[""],
)
print(tags_df.head())

In [None]:
# Collect the distinct values of the Tag column.
unique_tags = pd.unique(tags_df["Tag"])
print("Total Unique Tags:", len(unique_tags))

In [None]:
# Show a short preview instead of printing every tag, which would flood the
# notebook output and bloat the saved file.
TAG_PREVIEW_LIMIT = 50
for tag in unique_tags[:TAG_PREVIEW_LIMIT]:
    print(tag)
if len(unique_tags) > TAG_PREVIEW_LIMIT:
    print(f"... and {len(unique_tags) - TAG_PREVIEW_LIMIT} more tags")

In [None]:
def clean_text_pipeline(text):
    """Turn raw post HTML into a lowercase, stopword-free string of words.

    Steps: decode HTML entities, strip markup with BeautifulSoup, lowercase,
    replace every character outside a-z/whitespace with a space, split on
    whitespace, drop tokens found in the module-level ``stop_words`` set and
    join the remaining tokens with single spaces.

    Args:
        text: Raw text or HTML. Missing values (``None``/``NaN``) are accepted;
            non-string values are converted with ``str`` first.

    Returns:
        The cleaned text, or ``""`` when the input is missing or the markup
        cannot be parsed.
    """
    if pd.isna(text):
        return ""

    # Decode HTML entities (e.g., &lt; → <)
    text = html.unescape(str(text))
    try:
        # Separate text nodes with a space so words on either side of a tag
        # (e.g. "</p><p>" or "<br>") are not glued into a single token. This
        # matches what remove_html does.
        text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    except Exception:
        # Best effort: a post that cannot be parsed contributes no text.
        return ""

    # Lowercase, then blank out everything that is not a letter or whitespace.
    text = re.sub(r"[^a-z\s]", " ", text.lower())

    # split() already collapses whitespace runs, and joining with single
    # spaces leaves nothing to strip.
    return " ".join(word for word in text.split() if word not in stop_words)


In [None]:
# One-column frame of the distinct tags, ready to be written to disk.
unique_tags_df = pd.DataFrame({"Tag": unique_tags})

In [None]:
# Create the output folder first so the write also works when the processed
# data folder does not exist yet.
unique_tags_path = "../data/processed/unique_tags.csv"
os.makedirs(os.path.dirname(unique_tags_path), exist_ok=True)
unique_tags_df.to_csv(unique_tags_path, index=False)

print("Unique tags file saved successfully.")

In [None]:
# How often each tag value occurs in the Tag column.
tag_counts = tags_df["Tag"].value_counts()

# Show the 20 leading entries.
print(tag_counts.iloc[:20])


In [None]:
# Test Cleaning on Small Subset
test_df = pd.read_csv(QUESTIONS_PATH, encoding="latin1", nrows=10000)

# Title and body are cleaned together as one text field.
title_part = test_df["Title"].fillna("")
body_part = test_df["Body"].fillna("")
test_df["raw_text"] = title_part + " " + body_part
test_df["clean_text"] = test_df["raw_text"].apply(clean_text_pipeline)

# Compare the first row before and after cleaning (first 500 characters only).
sample = test_df[["raw_text", "clean_text"]].iloc[0]
raw_preview = sample["raw_text"][:500]
clean_preview = sample["clean_text"][:500]

print("RAW TEXT:\n", raw_preview)
print("\nCLEAN TEXT:\n", clean_preview)


In [None]:
# Load the full questions file into memory via the shared path constant, so a
# change to QUESTIONS_PATH applies to this cell as well.
questions_df = pd.read_csv(QUESTIONS_PATH, encoding="latin1")

questions_df.head()

In [None]:
# Quick look at the saved tag list. A dedicated name is used because `tags`
# is bound to a plain Python list further down; reusing the name for a
# DataFrame here makes the notebook state depend on which cell ran last.
unique_tags_preview = pd.read_csv(
    "../data/processed/unique_tags.csv",
    encoding="latin1"
)
unique_tags_preview.head()

In [None]:
# Load tags
# Kept in its own variable so the raw `tags_df` (the full Tags.csv) is not
# overwritten. Only empty fields count as missing, so literal tags such as
# "null" or "nan" are not silently dropped by pandas' default NA parsing.
unique_tags_loaded = pd.read_csv(
    "../data/processed/unique_tags.csv",
    encoding="latin1",
    keep_default_na=False,
    na_values=[""],
)

# Convert to clean lowercase list
tags = (
    unique_tags_loaded.iloc[:, 0]
    .dropna()
    .astype(str)
    .str.lower()
    .str.strip()
    .tolist()
)

# Drop blanks and duplicates (first occurrence wins, order preserved): a blank
# tag would turn into an empty regex when tags are matched against text.
tags = list(dict.fromkeys(tag for tag in tags if tag))

print("Total tags loaded:", len(tags))

In [None]:
# Full Dataset Cleaning (Chunk Processing)
# Make sure the output folder exists before the first chunk is written.
os.makedirs(os.path.dirname(QUESTIONS_OUTPUT_PATH), exist_ok=True)

first_chunk = True

for chunk in pd.read_csv(
    QUESTIONS_PATH,
    encoding="latin1",
    chunksize=CHUNK_SIZE
):

    # Combine title and body
    chunk["raw_text"] = (
        chunk["Title"].fillna("") + " " +
        chunk["Body"].fillna("")
    )

    # Apply cleaning pipeline
    chunk["clean_text"] = chunk["raw_text"].apply(clean_text_pipeline)

    # Keep only relevant columns
    processed_chunk = chunk[["Id", "Score", "clean_text"]]

    # Write incrementally: the first chunk creates the file with a header,
    # later chunks are appended without one.
    processed_chunk.to_csv(
        QUESTIONS_OUTPUT_PATH,
        mode="w" if first_chunk else "a",
        header=first_chunk,
        index=False
    )

    first_chunk = False

print(" Full preprocessing complete.")


In [None]:
# Create placeholder mapping: each tag gets a token of the form TAGTOKEN<position>.
tag_placeholder_map = {
    tag_name: f"TAGTOKEN{position}" for position, tag_name in enumerate(tags)
}

# Reverse map (to restore later): placeholder -> original tag.
reverse_map = {
    token: tag_name for tag_name, token in tag_placeholder_map.items()
}

In [None]:
# Cache for the combined tag regex. "source" keeps a reference to the mapping
# the regex was built from so a rebuilt mapping is detected by identity.
_TAG_REGEX_CACHE = {"source": None, "size": -1, "regex": None}


def _compiled_tag_regex():
    """Return one compiled regex matching any known tag, or None if there are no tags."""
    stale = (
        _TAG_REGEX_CACHE["source"] is not tag_placeholder_map
        or _TAG_REGEX_CACHE["size"] != len(tag_placeholder_map)
    )
    if stale:
        # Longest tags first, so "java-ee" is tried before "java" at the same position.
        ordered = sorted(
            (tag for tag in tag_placeholder_map if tag), key=len, reverse=True
        )
        if ordered:
            alternatives = "|".join(re.escape(tag) for tag in ordered)
            # Lookarounds instead of \b: \b needs a word character next to it,
            # so it never matches "c++" or "c#" before a space and cannot
            # anchor a tag that starts with "." such as ".net".
            regex = re.compile(r"(?<!\w)(?:" + alternatives + r")(?!\w)")
        else:
            regex = None
        _TAG_REGEX_CACHE["source"] = tag_placeholder_map
        _TAG_REGEX_CACHE["size"] = len(tag_placeholder_map)
        _TAG_REGEX_CACHE["regex"] = regex
    return _TAG_REGEX_CACHE["regex"]


def protect_tags(text):
    """Lowercase ``text`` and swap every known tag for its placeholder token.

    Tags come from the module-level ``tag_placeholder_map``. A tag is replaced
    only where it is not directly preceded or followed by a word character.
    All tags are handled in a single regex pass, so a replacement made for one
    tag can never be altered by another tag, and the longest matching tag at a
    position wins.

    Args:
        text: Raw text; missing values (``None``/``NaN``) are accepted and
            non-string values are converted with ``str``.

    Returns:
        The lowercased text with tags replaced by placeholders, or ``""`` for
        missing input.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()

    tag_regex = _compiled_tag_regex()
    if tag_regex is None:
        return text

    # Every alternative in the regex is a key of the mapping, so the lookup
    # cannot miss.
    return tag_regex.sub(lambda match: tag_placeholder_map[match.group(0)], text)
    

In [None]:
# from bs4 import BeautifulSoup

def remove_html(text):
    """Strip markup from ``text`` and return its text content, with a space between text nodes."""
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

In [None]:
import os

# Make sure the folder that holds the cleaned questions CSV exists.
processed_dir = os.path.dirname(QUESTIONS_OUTPUT_PATH)
os.makedirs(processed_dir, exist_ok=True)


In [None]:
#Normal Cleaning (Without Damaging Tags)
def clean_text(text):
    """Strip HTML, replace non-alphanumeric characters with spaces, and collapse whitespace.

    Letters of either case and digits are kept, so alphanumeric placeholder
    tokens pass through unchanged.
    """
    without_markup = remove_html(text)
    # Anything that is not a letter, digit or whitespace becomes a space.
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", " ", without_markup)
    # Collapse whitespace runs and trim both ends.
    return re.sub(r"\s+", " ", alnum_only).strip()

In [None]:
#Restore Tags Back
def restore_tags(text):
    """Replace every ``TAGTOKEN<n>`` placeholder in ``text`` with its original tag.

    Each placeholder is matched as a whole (all of its digits) and looked up
    in the module-level ``reverse_map``. Replacing placeholders one by one
    with ``str.replace`` is wrong here because ``TAGTOKEN1`` is a prefix of
    ``TAGTOKEN10``, ``TAGTOKEN11``, ...: the shorter token would be
    substituted inside the longer ones and leave stray digits behind.

    The pattern assumes placeholders have the form ``TAGTOKEN`` followed by
    digits, which is how ``tag_placeholder_map`` builds them.

    Args:
        text: Text that may contain placeholder tokens.

    Returns:
        The text with known placeholders replaced by their tags; tokens that
        are not in ``reverse_map`` are left untouched.
    """
    return re.sub(
        r"TAGTOKEN\d+",
        lambda match: reverse_map.get(match.group(0), match.group(0)),
        text,
    )

In [None]:
# verify: read back the first rows of the cleaned questions file
preview_rows = 5
clean_df_sample = pd.read_csv(QUESTIONS_OUTPUT_PATH, nrows=preview_rows)

clean_df_sample.head()


In [None]:
def full_clean_pipeline(text):
    """Final pipeline: protect tags with placeholders, clean the text, then restore the tags."""
    return restore_tags(clean_text(protect_tags(text)))

In [None]:
# Row count of the raw questions file, read in chunks.
original_count = 0
for raw_chunk in pd.read_csv(QUESTIONS_PATH, encoding="latin1", chunksize=100000):
    original_count += len(raw_chunk)

# Row count of the cleaned questions file, read in chunks.
clean_count = 0
for cleaned_chunk in pd.read_csv(QUESTIONS_OUTPUT_PATH, chunksize=100000):
    clean_count += len(cleaned_chunk)

print("Original rows:", original_count)
print("Cleaned rows:", clean_count)


In [None]:
# Apply the tag-preserving pipeline to every question body.
question_bodies = questions_df["Body"]
questions_df["cleaned_body"] = question_bodies.map(full_clean_pipeline)

Cleaning the answers

In [None]:
# Preview the first rows of the answers file.
sample_df_ans = pd.read_csv(
    ANSWERS_PATH,
    encoding="latin1",
    nrows=5
)

# Display the frame that was just loaded (the questions sample was shown here by mistake).
sample_df_ans.head()


In [None]:
# # Test Cleaning on Small Subset of ans
# test_df_ans = pd.read_csv(
#     ANSWERS_PATH,
#     encoding="latin1",
#     nrows=10000
# )

# test_df_ans["raw_text"] = (
#     test_df_ans["Body"].fillna("")
# )

# test_df_ans["clean_text"] = test_df_ans["raw_text"].apply(clean_text_pipeline)

# sample = test_df_ans[["raw_text", "clean_text"]].iloc[0]

# print("RAW TEXT:\n", sample["raw_text"][:500])
# print("\nCLEAN TEXT:\n", sample["clean_text"][:500])


In [None]:
# Full Dataset Cleaning (Chunk Processing)
answer_chunks = pd.read_csv(ANSWERS_PATH, encoding="latin1", chunksize=CHUNK_SIZE)
output_columns = ["Id", "ParentId", "Score", "clean_text"]

first_chunk = True
for chunk in answer_chunks:
    # Only the Body column feeds the cleaner here.
    chunk["raw_text"] = chunk["Body"].fillna("")
    chunk["clean_text"] = chunk["raw_text"].apply(clean_text_pipeline)

    # Keep only the columns written to the cleaned file.
    processed_chunk = chunk[output_columns]

    # The first chunk creates the file with a header row; later chunks append.
    if first_chunk:
        processed_chunk.to_csv(ANSWERS_OUTPUT_PATH, mode="w", header=True, index=False)
        first_chunk = False
    else:
        processed_chunk.to_csv(ANSWERS_OUTPUT_PATH, mode="a", header=False, index=False)

print(" Full preprocessing complete.")


In [None]:
import pandas as pd

# Read back the first few rows of the cleaned answers file as a sanity check.
answers_sample = pd.read_csv(
    "../data/processed/answers_clean.csv",
    nrows=5,
)
answers_sample.head()

In [None]:
# TF-IDF VECTORIZATION

import joblib
from scipy.sparse import save_npz
from sklearn.feature_extraction.text import TfidfVectorizer

# Load cleaned questions
clean_questions = pd.read_csv("../data/processed/questions_clean.csv")
question_corpus = clean_questions["clean_text"].fillna("")

# Vectorizer settings kept together so they are easy to review and tune.
tfidf_settings = {
    "max_features": 10000,
    "min_df": 5,
    "max_df": 0.8,
    "ngram_range": (1, 2),
    "sublinear_tf": True,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit on the questions and vectorize them in one step.
tfidf_questions = vectorizer.fit_transform(question_corpus)

# Persist the fitted vectorizer so it can be loaded again later.
joblib.dump(vectorizer, "../data/processed/tfidf_vectorizer.pkl")

print("Questions TF-IDF Shape:", tfidf_questions.shape)

# Save sparse matrix
save_npz("../data/processed/questions_tfidf.npz", tfidf_questions)

print("Questions TF-IDF saved successfully.")

In [None]:
import pandas as pd

# Load the newly cleaned answers (now has ParentId)
answers_clean = pd.read_csv(
    "../data/processed/answers_clean.csv",
)

print("Answers shape:", answers_clean.shape)
print(answers_clean.head(3))

In [None]:
# Per-question answer statistics: answers are grouped by ParentId
# (ParentId = Question Id), then ParentId is renamed to question_id for clarity.
answers_by_question = answers_clean.groupby("ParentId")
answer_stats = (
    answers_by_question
    .agg(
        avg_answer_score=("Score", "mean"),
        answer_count=("Score", "count"),
        max_answer_score=("Score", "max"),
    )
    .reset_index()
    .rename(columns={"ParentId": "question_id"})
)
print("Answer stats shape:", answer_stats.shape)
answer_stats.head(5)

In [None]:
# STEP 3: Bayesian Smoothing + Normalize to 0-1
# (Replaces the simple min-max normalization)

# Prior: the mean of the per-question average scores, weighted by C
# pseudo-answers, where C is the mean number of answers per question.
global_mean = answer_stats["avg_answer_score"].mean()
C = answer_stats["answer_count"].mean()  # confidence factor

print(f"Global mean score: {global_mean:.2f}")
print(f"Confidence factor C: {C:.2f}")

# Apply Bayesian smoothing: avg * count is the question's total answer score,
# so this is (C * prior + total) / (C + count).
answer_stats["bayesian_avg_score"] = (
    (C * global_mean + answer_stats["avg_answer_score"] * answer_stats["answer_count"])
    / (C + answer_stats["answer_count"])
)

# Normalize to 0-1
min_b = answer_stats["bayesian_avg_score"].min()
max_b = answer_stats["bayesian_avg_score"].max()
score_range = max_b - min_b

if score_range == 0:
    # All smoothed scores are identical: min-max scaling would divide by zero
    # and fill the column with NaN, so use a constant instead.
    answer_stats["avg_score_normalized"] = 0.0
else:
    answer_stats["avg_score_normalized"] = (
        (answer_stats["bayesian_avg_score"] - min_b) / score_range
    )

print("\nNormalized score range:")
print("Min:", answer_stats["avg_score_normalized"].min())
print("Max:", answer_stats["avg_score_normalized"].max())
print("Mean:", answer_stats["avg_score_normalized"].mean().round(4))

answer_stats.head(5)


In [None]:
from sklearn.cluster import KMeans

# n_init is pinned explicitly because its default differs between
# scikit-learn releases; together with random_state this keeps the clustering
# reproducible across environments.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Cluster questions on the single normalized-score feature.
answer_stats["cluster"] = kmeans.fit_predict(
    answer_stats[['avg_score_normalized']]
)

# Cluster ids are arbitrary, so order them by their mean normalized score
# (lowest first) before attaching labels.
cluster_order = (
    answer_stats.groupby('cluster')['avg_score_normalized']
    .mean()
    .sort_values()
    .index
)

# Lowest-scoring cluster -> "Hard", highest-scoring cluster -> "Easy".
mapping = {
    cluster_order[0]: "Hard",
    cluster_order[1]: "Medium",
    cluster_order[2]: "Easy"
}

answer_stats["difficulty"] = (
    answer_stats["cluster"].map(mapping)
)

answer_stats.head()


In [None]:
import matplotlib.pyplot as plt

# One point per row of answer_stats, coloured by its K-Means cluster id.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(
    answer_stats.index,
    answer_stats['avg_score_normalized'],
    c=answer_stats['cluster'],
    cmap='viridis',
    s=60,
)
ax.set_xlabel("Question Index")
ax.set_ylabel("Normalized Score (0–1)")
ax.set_title("K-Means Clustering of Question Difficulty")

plt.show()

In [None]:
from scipy.sparse import load_npz

# Load saved vectorizer
vectorizer = joblib.load("../data/processed/tfidf_vectorizer.pkl")

# Load cleaned answers
clean_answers = pd.read_csv("../data/processed/answers_clean.csv")
answer_corpus = clean_answers["clean_text"].fillna("")

# Transform answers (DO NOT FIT AGAIN): the loaded vectorizer is applied as is.
tfidf_answers = vectorizer.transform(answer_corpus)

print("Answers TF-IDF Shape:", tfidf_answers.shape)

# Save sparse matrix
save_npz("../data/processed/answers_tfidf.npz", tfidf_answers)

print("Answers TF-IDF saved successfully.")


In [None]:
# Scaling the score so that it is out of 100.
# NOTE: everything below except the final line is commented out and does not run.


# Calculated the average per group for that particular question id.
# clean_answers['average_score'] = (clean_answers.groupby('Id')['score_out_of_100'].transform('mean'))

# # Added a new column with the difficulty label; the bins are hard-coded so that the average score defines the difficulty.
# def assign_difficulty(score):
#     if score >= 70:
#         return "Easy"
#     elif score >= 40:
#         return "Medium" 
#     else:
#         return "Hard"

# clean_answers['difficulty'] = (
#     clean_answers['average_score']
#     .apply(assign_difficulty)
# )

# Display the first rows of the cleaned answers frame.
clean_answers.head()