Code to preprocess the text field of each entry in data.json with spaCy and save the resulting lemmatized tokens.

In [None]:
# Install the third-party packages for this notebook into the environment of the running kernel.
%pip install spacy numpy scikit-learn ipykernel

In [None]:
# load the spacy models (small, medium, large)
import spacy
import spacy.cli
import spacy.cli.download

# Download a model only if it is not already installed, then load it.
def _load_model(name):
    """Return the spaCy pipeline ``name``, downloading it first if missing.

    ``spacy.load`` raises ``OSError`` when the model package cannot be found;
    only in that case is the (slow, network-bound) download attempted.
    """
    try:
        return spacy.load(name)
    except OSError:
        spacy.cli.download(name)
        return spacy.load(name)


# Load the models
nlp_small = _load_model("en_core_web_sm")
nlp_medium = _load_model("en_core_web_md")
nlp_large = _load_model("en_core_web_lg")

In [None]:
# Report the shape of each model's word-vector table.
# NOTE: this is the shape of `vocab.vectors`, not a size in bytes.
vector_shape = nlp_small.vocab.vectors.shape
print(f"Small model size: {vector_shape}")

vector_shape = nlp_medium.vocab.vectors.shape
print(f"Medium model size: {vector_shape}")

vector_shape = nlp_large.vocab.vectors.shape
print(f"Large model size: {vector_shape}")

In [None]:
# load the json file and store data in a variable
import json

# Read the input explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding (the output files below are written as UTF-8).
with open("data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Print the data
print(data)

In [None]:
# Registry mapping a short size label to its loaded spaCy pipeline,
# so callers can pick a model by name.
models = dict(
    small=nlp_small,
    medium=nlp_medium,
    large=nlp_large,
)

In [None]:
# tokenize the text in data.json
def tokenize_text(text, model):
    """
    Lowercase, lemmatize, and filter ``text`` with a spaCy pipeline.

    Args:
        text: Raw string to process; it is lowercased before being parsed.
        model: Either a key of the module-level ``models`` dict
            (e.g. "small", "medium", "large") or an already-loaded pipeline,
            i.e. a callable that maps a string to an iterable of tokens.

    Returns:
        A list of lemma strings, in document order, for every token that is
        not a stop word, punctuation, or whitespace, and whose lemma is
        neither blank nor the "-PRON-" placeholder.

    Raises:
        KeyError: If ``model`` is a string that is not a key of ``models``.
    """
    # Strings are looked up in the registry; anything else is used directly
    # as the pipeline.
    nlp = models[model] if isinstance(model, str) else model
    doc = nlp(text.lower())

    lemmas = []
    for token in doc:
        # Whitespace tokens (newlines, runs of spaces) are neither stop words
        # nor punctuation and carry a non-empty lemma, so they must be
        # excluded explicitly.
        if token.is_stop or token.is_punct or token.is_space:
            continue
        lemma = token.lemma_
        if lemma.strip() and lemma != "-PRON-":
            lemmas.append(lemma)
    return lemmas

In [None]:
# Lemmatize every record with the small pipeline and attach the tokens to it
for record in data:
    lemmas = tokenize_text(record["text"], "small")
    record["tokenized_text"] = lemmas

# Write the augmented records to a separate JSON file for this model
with open("updated_data_small.json", "w", encoding="utf-8") as sink:
    json.dump(data, sink, ensure_ascii=False, indent=4)

print("Tokenization complete. Updated data saved to updated_data_small.json.")

In [None]:
# Lemmatize every record with the medium pipeline and attach the tokens to it
for record in data:
    lemmas = tokenize_text(record["text"], "medium")
    record["tokenized_text"] = lemmas

# Write the augmented records to a separate JSON file for this model
with open("updated_data_medium.json", "w", encoding="utf-8") as sink:
    json.dump(data, sink, ensure_ascii=False, indent=4)

print("Tokenization complete. Updated data saved to updated_data_medium.json.")

In [None]:
# Lemmatize every record with the large pipeline and attach the tokens to it
for record in data:
    lemmas = tokenize_text(record["text"], "large")
    record["tokenized_text"] = lemmas

# Write the augmented records to a separate JSON file for this model
with open("updated_data_large.json", "w", encoding="utf-8") as sink:
    json.dump(data, sink, ensure_ascii=False, indent=4)

print("Tokenization complete. Updated data saved to updated_data_large.json.")