# Text Mining

We perform a basic exploratory data analysis (EDA) of the corpus to decide how to reduce its vocabulary size.

In [None]:
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
file_path = "data/wiki2.train.txt"

In [None]:
# Read the entire training corpus into memory as a single string.
with open(file_path, encoding="utf-8") as file:
    text = file.read()

# Whitespace tokenisation: split() with no arguments breaks on any run of
# spaces, tabs or newlines and never yields empty strings.
tokens = text.split()

In [None]:
# Count how often each whitespace token occurs, then lay the counts out as a
# two-column table (one row per distinct token) for plotting.
token_counts = Counter(tokens)
freq_dist = pd.DataFrame(
    {
        "Token": list(token_counts.keys()),
        "Frequency": list(token_counts.values()),
    }
)

In [None]:
freq_dist["Token"].nunique()

In [None]:
# Linear-scale histogram of per-token frequencies: each observation is one
# distinct token, the x-axis is how often it occurs, the y-axis is how many
# tokens fall in that frequency bin.
plt.figure(figsize=(10, 6))
plt.hist(freq_dist["Frequency"], bins=50, edgecolor="black")
plt.title("Histogram of Word Frequencies")
plt.xlabel("Frequency")
plt.ylabel("Number of Words")
plt.show()

In [None]:
# Same per-token frequency distribution as a seaborn histogram with a KDE
# overlay, drawn on logarithmic axes.
plt.figure(figsize=(12, 6))
sns.histplot(freq_dist["Frequency"], log_scale=True, bins=50, kde=True, color="skyblue")
plt.title("Log-Scaled Histogram of Word Frequencies")
plt.xlabel("Frequency (log scale)")
plt.ylabel("Number of Words")
# NOTE(review): log_scale=True above presumably already switches the x-axis to
# a log scale, which would make this xscale call redundant — confirm.
plt.xscale("log")
# The count axis is put on a log scale as well.
plt.yscale("log")
plt.grid(True, ls="--")
plt.show()

In [None]:
# Horizontal bar chart of the 20 rows with the smallest frequency.
# NOTE(review): if many tokens share the minimum frequency, which 20 of them
# end up here presumably depends on the sort's tie order — confirm before
# reading anything into the specific tokens shown.
plt.figure(figsize=(10, 6))
sns.barplot(
    x="Frequency",
    y="Token",
    data=freq_dist.sort_values(by="Frequency", ascending=True).head(20),
)
plt.title("Frequency Distribution of Bottom 20 Tokens")
plt.xlabel("Frequency")
plt.ylabel("Token")
plt.show()

In [None]:
# Horizontal bar chart of the 20 most frequent tokens (rows sorted by
# descending frequency, first 20 kept).
plt.figure(figsize=(10, 6))
sns.barplot(
    x="Frequency",
    y="Token",
    data=freq_dist.sort_values(by="Frequency", ascending=False).head(20),
)
plt.title("Frequency Distribution of Top 20 Tokens")
plt.xlabel("Frequency")
plt.ylabel("Token")
plt.show()

## Pre-processing Documents from corpus

In [None]:
# A document boundary is a whitespace-only line followed by a heading line of
# the form " = Title = ". The "[^=]+" part cannot contain "=", so headings
# written with doubled signs ("= = Section = =") do not match.
# re.split() discards the matched heading text itself.
document_delimiter_pattern = r"\n\s\n\s=\s[^=]+\s=\s\n"
documents = re.split(document_delimiter_pattern, text)

# If the text began with a delimiter, the first piece is blank: drop it.
if documents and documents[0].strip() == "":
    del documents[0]

In [None]:
# Size of every document, measured in whitespace-separated tokens, tabulated
# next to the document's position in `documents`.
doc_lengths = [len(words) for words in map(str.split, documents)]
doc_length_df = pd.DataFrame(
    {
        "index": range(len(doc_lengths)),
        "length": doc_lengths,
    }
)

In [None]:
# Histogram of document sizes. plt.hist() bins the values it is given along
# the x-axis and draws the per-bin count on the y-axis, so x is the document
# length and y is how many documents have that length.
plt.figure(figsize=(10, 6))
plt.hist(doc_length_df["length"], bins=50, edgecolor="black")
plt.title("Histogram of Document Lengths")
plt.xlabel("Doc Length")
plt.ylabel("Frequency")
plt.show()

## Replacing stop words

In [None]:
def replace_stopwords_with_unk(text):
    """Return *text* with every stop word replaced by ``"<unk>"``.

    The text is run through the module-level ``nlp`` pipeline; each token
    flagged ``is_stop`` becomes ``"<unk>"`` and every other token keeps its
    text. The pieces are re-joined with single spaces, so the original
    spacing of *text* is not preserved.
    """
    parsed = nlp(text)
    return " ".join("<unk>" if tok.is_stop else tok.text for tok in parsed)

In [None]:
processed_documents = [replace_stopwords_with_unk(doc) for doc in documents]

In [None]:
processed_documents[10]

## Keeping only alphabetic and numeric tokens

In [None]:
def replace_non_alpha_numeric_with_unk(text):
    """Return *text* keeping only all-letter or all-digit tokens.

    The text is tokenised with the module-level ``nlp`` pipeline. A token is
    kept when its ``is_alpha`` or ``is_digit`` flag is set; every other token
    (punctuation, mixed strings, ...) is replaced. The pieces are re-joined
    with single spaces.

    NOTE(review): the placeholder written here is the bare string ``"unk"``,
    not ``"<unk>"``. Code elsewhere in this notebook relies on that spelling,
    so it is preserved as-is.
    """
    kept = []
    for tok in nlp(text):
        if not (tok.is_alpha or tok.is_digit):
            kept.append("unk")
        else:
            kept.append(tok.text)
    return " ".join(kept)

In [None]:
eng_proc_docs = [replace_non_alpha_numeric_with_unk(doc) for doc in processed_documents]

In [None]:
eng_proc_docs[10].replace("unk ", "")

In [None]:
# Size of every cleaned document, measured in whitespace-separated tokens
# (placeholders included), tabulated next to the document's position.
proc_doc_lengths = [len(words) for words in map(str.split, eng_proc_docs)]
proc_doc_length_df = pd.DataFrame(
    {
        "index": range(len(proc_doc_lengths)),
        "length": proc_doc_lengths,
    }
)

In [None]:
# Histogram of cleaned-document sizes. plt.hist() bins the values it is given
# along the x-axis and draws the per-bin count on the y-axis, so x is the
# document length and y is how many documents have that length.
plt.figure(figsize=(10, 6))
plt.hist(proc_doc_length_df["length"], bins=50, edgecolor="black")
plt.title("Histogram of Processed Document Lengths")
plt.xlabel("Doc Length")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Rebuild the token frequency table from the cleaned documents. This
# overwrites the `token_counts` / `freq_dist` computed on the raw corpus.
all_docs_combined = " ".join(eng_proc_docs)
all_tokens = all_docs_combined.split()
token_counts = Counter(all_tokens)
freq_dist = pd.DataFrame(
    {
        "Token": list(token_counts.keys()),
        "Frequency": list(token_counts.values()),
    }
)

In [None]:
freq_dist["Token"].nunique()

In [None]:
# Relative drop between two vocabulary sizes.
# NOTE(review): 33277 and 32224 look like hand-copied outputs of the two
# `freq_dist["Token"].nunique()` cells (before / after cleaning) — confirm,
# and recompute if the corpus or the pipeline changes.
(33277 - 32224) / 33277

In [None]:
# Log-log histogram of token frequencies after cleaning, with the placeholder
# token left out so its very large count does not dominate the plot.
plt.figure(figsize=(12, 6))
sns.histplot(
    # replace_non_alpha_numeric_with_unk() writes the placeholder as bare
    # "unk", so comparing against "<unk>" alone would filter nothing here.
    # Both spellings are excluded so the cell is correct whichever form the
    # documents currently carry.
    freq_dist.loc[~freq_dist["Token"].isin(["unk", "<unk>"]), ["Frequency"]],
    log_scale=True,
    bins=50,
    kde=True,
    color="skyblue",
)
plt.title("Log-Scaled Histogram of Word Frequencies")
plt.xlabel("Frequency (log scale)")
plt.ylabel("Number of Words")
plt.xscale("log")
plt.yscale("log")
plt.grid(True, ls="--")
plt.show()

## Replacing based on threshold

In [None]:
# Rewrite the bare "unk" placeholder as "<unk>".
# This is done per token rather than with str.replace(): a substring replace
# would also rewrite the "unk" inside ordinary words ("drunk" -> "dr<unk>",
# "bunker" -> "b<unk>er") and would turn "<unk>" into "<<unk>>" if the cell
# were run twice. Comparing whole tokens avoids both problems.
eng_proc_docs = [
    " ".join("<unk>" if token == "unk" else token for token in doc.split())
    for doc in eng_proc_docs
]

In [None]:
def reduce_vocabulary(docs, vocab_size=10000):
    """Cap the vocabulary of *docs* at the ``vocab_size`` most frequent tokens.

    Tokens are whitespace-separated. Frequencies are counted over all
    documents together; every token that is not among the ``vocab_size`` most
    common ones is rewritten to ``"<unk>"``.

    Args:
        docs: Documents as strings.
        vocab_size: How many of the most frequent tokens to keep.

    Returns:
        A ``(reduced_docs, replaced_tokens)`` tuple. ``reduced_docs`` holds
        one string per input document with its tokens re-joined by single
        spaces; ``replaced_tokens`` is the set of distinct tokens that were
        rewritten to ``"<unk>"``.
    """
    # Corpus-wide token frequencies.
    frequencies = Counter(" ".join(docs).split())

    # Everything outside the top `vocab_size` entries is considered rare.
    kept = {tok for tok, _ in frequencies.most_common(vocab_size)}
    rare = frequencies.keys() - kept

    replaced_tokens = set()
    reduced_docs = []
    for doc in docs:
        words = doc.split()
        # Remember which rare tokens were actually encountered.
        replaced_tokens.update(word for word in words if word in rare)
        reduced_docs.append(
            " ".join("<unk>" if word in rare else word for word in words)
        )

    return reduced_docs, replaced_tokens

In [None]:
reduced_docs, replaced_tokens = reduce_vocabulary(eng_proc_docs, vocab_size=12500)

In [None]:
def write_tokens_to_file(tokens, file_path):
    """Write *tokens* to *file_path* as UTF-8 text, one token per line.

    The file is created or truncated. Tokens are written in the iteration
    order of *tokens*, each followed by a newline.

    Args:
        tokens: Iterable of strings.
        file_path: Destination path.
    """
    with open(file_path, "w", encoding="utf-8") as out:
        out.writelines(token + "\n" for token in tokens)

In [None]:
write_tokens_to_file(replaced_tokens, "data/replaced_tokens.txt")

In [None]:
reduced_docs[10].replace("<unk> ", "")

In [None]:
# Rebuild the token frequency table from the vocabulary-reduced documents
# returned by reduce_vocabulary(). Using `eng_proc_docs` here would just
# reproduce the pre-reduction distribution and hide the effect of the
# threshold.
all_docs_combined = " ".join(reduced_docs)
all_tokens = all_docs_combined.split()
token_counts = Counter(all_tokens)
freq_dist = pd.DataFrame(token_counts.items(), columns=["Token", "Frequency"])

In [None]:
# Log-log histogram (with KDE overlay) of the token frequencies currently
# held in `freq_dist`, leaving out the "<unk>" placeholder row.
plt.figure(figsize=(12, 6))
sns.histplot(
    freq_dist.loc[freq_dist["Token"] != "<unk>", ["Frequency"]],
    log_scale=True,
    bins=50,
    kde=True,
    color="skyblue",
)
plt.title("Log-Scaled Histogram of Word Frequencies")
plt.xlabel("Frequency (log scale)")
plt.ylabel("Number of Words")
plt.xscale("log")
plt.yscale("log")
plt.grid(True, ls="--")
plt.show()

## Saving the Data

In [None]:
from utils.data_saver import DataSaver

In [None]:
# train_data_saver = DataSaver(
#     input_file="data/wiki2.train.txt",
#     tokens_to_replace_file="data/replaced_tokens.txt",
#     output_file="data/wiki2.train_processed.txt",
# )
# train_data_saver.save_processed_data()

# test_data_saver = DataSaver(
#     input_file="data/wiki2.test.txt",
#     tokens_to_replace_file="data/replaced_tokens.txt",
#     output_file="data/wiki2.test_processed.txt",
# )
# test_data_saver.save_processed_data()

# valid_data_saver = DataSaver(
#     input_file="data/wiki2.valid.txt",
#     tokens_to_replace_file="data/replaced_tokens.txt",
#     output_file="data/wiki2.valid_processed.txt",
# )
# valid_data_saver.save_processed_data()