In [1]:
import pandas as pd
import re
import string
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.utils import simple_preprocess
import collections
import pickle

In [2]:
"""
This script:
- imports the train.parquet form data/
- preprocesses the queries and docs
- creates lookup tables for the vocab
- tokenizes the queries and docs
- saves the processed queries and docs as pickle files in data/processed/
"""

'\nThis script:\n- imports the train.parquet form data/\n- preprocesses the queries and docs\n- creates lookup tables for the vocab\n- tokenizes the queries and docs\n- saves the processed queries and docs as pickle files in data/processed/\n'

In [3]:
file_path = os.path.join("data", "train.parquet")
df = pd.read_parquet(file_path)

In [4]:
nltk.download("stopwords")
nltk.download("punkt")

# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kaleb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kaleb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def preprocess_query(query: str) -> list[str]:
    if query is None or pd.isna(query):
        return []

    query = query.lower()

    query = re.sub(f"[{string.punctuation}]", "", query)

    tokens = simple_preprocess(
        query, deacc=True
    )  # deacc=True removes accents and punctuations

    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]

    tokens = tokens + ["[<EOS>]"]

    return tokens

In [6]:
def create_lookup_tables(words: list[str]) -> tuple[dict[str, int], dict[int, str]]:
    word_counts = collections.Counter(words)
    vocab = sorted(word_counts, key=lambda k: word_counts.get(k), reverse=True)
    int_to_vocab = {ii + 1: word for ii, word in enumerate(vocab)}
    int_to_vocab[0] = "<PAD>"
    vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}
    return vocab_to_int, int_to_vocab

#### queries


In [7]:
processed_queries = df["query"].apply(
    lambda x: preprocess_query(x) if pd.notna(x) else []
)

print(df["query"][82321])
processed_queries[:5]

meaning of propagation


0                                   [[S], rba, [E]]
1              [[S], ronald, reagan, democrat, [E]]
2    [[S], long, need, sydney, surround, area, [E]]
3           [[S], price, instal, tile, shower, [E]]
4                 [[S], convers, observ, bodi, [E]]
Name: query, dtype: object

In [8]:
query_corpus = [word for query in processed_queries for word in query]

In [9]:
vocab_to_int, int_to_vocab = create_lookup_tables(query_corpus)
tokenized_queries = [
    [vocab_to_int[word] for word in query] for query in processed_queries
]
query_dataset = (processed_queries, tokenized_queries, vocab_to_int, int_to_vocab)

In [None]:
with open("data/processed/queries_dataset.pkl", "wb") as f:
    pickle.dump(query_dataset, f)

#### docs


In [30]:
# df["passages"][0]
# len(df)

82326

In [10]:
dfd = df.head(10)

In [11]:
# Get max length of 'passage_text' (array)
max_length = max(len(passage["passage_text"]) for passage in df["passages"])

# Extract 'passage_text' list items into separate columns
for i in range(max_length):
    df[f"passage_text_{i}"] = dfd["passages"].apply(
        lambda x: x["passage_text"][i] if i < len(x["passage_text"]) else None
    )

# Apply the preprocessing function to each column of interest
for i in range(max_length):
    column_name = f"passage_text_{i}"
    df[column_name] = df[column_name].apply(
        lambda x: str(preprocess_query(x)) if pd.notna(x) else "[]"
    )

In [12]:
# df.to_parquet("data/processed/docs.parquet")
processed_docs = []

# Consolidate all passages into processed_docs
for i in range(max_length):
    column_name = f"passage_text_{i}"
    processed_docs += (
        df[column_name]
        .apply(lambda x: preprocess_query(x) if pd.notna(x) else [])  # change
        .tolist()
    )

In [13]:
# flatten processed_docs
processed_docs = df[[f"passage_text_{i}" for i in range(max_length)]].values.flatten()
processed_docs[:2]

array(["['[S]', 'sinc', 'rba', 'outstand', 'reput', 'affect', 'secur', 'npa', 'scandal', 'rba', 'subsidiari', 'involv', 'bribe', 'oversea', 'offici', 'australia', 'might', 'win', 'lucr', 'noteprint', 'contract', 'asset', 'bank', 'includ', 'gold', 'foreign', 'exchang', 'reserv', 'australia', 'estim', 'net', 'worth', 'billion', 'nearli', 'rba', 'employe', 'work', 'headquart', 'sydney', 'new', 'south', 'wale', 'busi', 'resumpt', 'site', '[E]']",
       "['[S]', 'reserv', 'bank', 'australia', 'rba', 'came', 'januari', 'australia', 'central', 'bank', 'banknot', 'issu', 'author', 'reserv', 'bank', 'act', 'remov', 'central', 'bank', 'function', 'commonwealth', 'bank', 'asset', 'bank', 'includ', 'gold', 'foreign', 'exchang', 'reserv', 'australia', 'estim', 'net', 'worth', 'billion', 'nearli', 'rba', 'employe', 'work', 'headquart', 'sydney', 'new', 'south', 'wale', 'busi', 'resumpt', 'site', '[E]']"],
      dtype=object)

In [14]:
import ast

processed_docs = [
    ast.literal_eval(doc) if isinstance(doc, str) else doc for doc in processed_docs
]
print("Processed Docs After Conversion:")
for i in range(5):
    print(processed_docs[i])

Processed Docs After Conversion:
['[S]', 'sinc', 'rba', 'outstand', 'reput', 'affect', 'secur', 'npa', 'scandal', 'rba', 'subsidiari', 'involv', 'bribe', 'oversea', 'offici', 'australia', 'might', 'win', 'lucr', 'noteprint', 'contract', 'asset', 'bank', 'includ', 'gold', 'foreign', 'exchang', 'reserv', 'australia', 'estim', 'net', 'worth', 'billion', 'nearli', 'rba', 'employe', 'work', 'headquart', 'sydney', 'new', 'south', 'wale', 'busi', 'resumpt', 'site', '[E]']
['[S]', 'reserv', 'bank', 'australia', 'rba', 'came', 'januari', 'australia', 'central', 'bank', 'banknot', 'issu', 'author', 'reserv', 'bank', 'act', 'remov', 'central', 'bank', 'function', 'commonwealth', 'bank', 'asset', 'bank', 'includ', 'gold', 'foreign', 'exchang', 'reserv', 'australia', 'estim', 'net', 'worth', 'billion', 'nearli', 'rba', 'employe', 'work', 'headquart', 'sydney', 'new', 'south', 'wale', 'busi', 'resumpt', 'site', '[E]']
['[S]', 'rba', 'recogn', 'microsoft', 'us', 'region', 'partner', 'pr', 'newswir', 

In [15]:
docs_corpus = [word for doc in processed_docs for word in doc]
print("\nDocs Corpus Sample:")
print(docs_corpus[:5])
print(len(docs_corpus))


Docs Corpus Sample:
['[S]', 'sinc', 'rba', 'outstand', 'reput']
3837


In [16]:
docs_vocab_to_int, docs_int_to_vocab = create_lookup_tables(docs_corpus)
tokenized_docs = [
    [docs_vocab_to_int[word] for word in doc] for doc in processed_docs  # change
]
docs_dataset = (processed_docs, tokenized_docs, docs_vocab_to_int, docs_int_to_vocab)

In [19]:
# tokenized_docs

In [28]:
with open("data/processed/docs_dataset.pkl", "wb") as f:
    pickle.dump(docs_dataset, f)

MemoryError: 