In [31]:
import sys
import os
import string
import spacy
import pandas as pd
import xml.etree.ElementTree as ET


In [5]:
# Make the parent of the current working directory importable.
# os.path.abspath resolves the relative ".." against os.getcwd() for us.
previous_directory = os.path.abspath(os.pardir)
if sys.path.count(previous_directory) == 0:
    # Append only once so re-running the cell does not grow sys.path
    sys.path.append(previous_directory)

In [7]:
# Load the spaCy pipeline named "pt_core_news_lg" and keep it in `nlp`.
# NOTE(review): `nlp` is not referenced by any of the cells visible here —
# confirm it is used later in the notebook.
nlp = spacy.load("pt_core_news_lg")

In [13]:
# Base directory containing the extracted dump files
base_dir = '../data/raw/ptwiki-latest-pages-articles'

# Column order of the resulting DataFrame (one column per field kept from a <doc> element)
DOC_COLUMNS = ["id", "url", "title", "content"]


def load_extracted_docs(base_dir):
    """Collect every ``<doc>`` element found in the files under ``base_dir``.

    The directory tree is walked recursively and *every* file is attempted
    (there is no filter on the file extension). Each file is expected to hold
    a sequence of sibling ``<doc id=... url=... title=...>text</doc>``
    elements; because such a file has no single root element, its content is
    wrapped in a synthetic ``<root>`` tag before parsing.

    Directories and files are visited in sorted order so the returned list
    has the same order on every machine; a seeded ``DataFrame.sample`` taken
    from it later is therefore reproducible.

    Parameters
    ----------
    base_dir : str
        Directory to walk. A missing directory simply yields no records.

    Returns
    -------
    list[dict]
        One dict per ``<doc>`` with keys ``id``, ``url``, ``title`` and
        ``content``. ``content`` is the stripped text of the element (only
        the text before a first child element, if any) or ``""`` when empty.

    Notes
    -----
    A file that cannot be read or parsed is reported with ``print`` and
    skipped as a whole; no exception is propagated.
    """
    records = []
    for root_dir, sub_dirs, files in os.walk(base_dir):
        # Sorting in place steers os.walk's top-down traversal order
        sub_dirs.sort()
        for file_name in sorted(files):
            file_path = os.path.join(root_dir, file_name)
            try:
                with open(file_path, "r", encoding="utf-8") as file:
                    content = file.read()

                # Wrap the sibling <doc> elements in one root so the parser accepts them
                root = ET.fromstring(f"<root>{content}</root>")

                for doc in root.findall("doc"):
                    records.append({
                        "id": doc.get("id"),
                        "url": doc.get("url"),
                        "title": doc.get("title"),
                        "content": doc.text.strip() if doc.text else "",
                    })

            except Exception as e:
                # Best effort: report the file and keep going with the others
                print(f"Error processing file {file_path}: {e}")
    return records


# List of extracted documents
docs = load_extracted_docs(base_dir)

# Load data into a DataFrame; explicit columns keep the schema even when no
# document was found, so later column access does not fail with a KeyError
df = pd.DataFrame(docs, columns=DOC_COLUMNS)


In [23]:
# Build (context, next-token) training pairs from a text
def generate_ngrams_with_masks(content, n=3):
    """Return ``(ngram, target)`` pairs for every window of ``n`` tokens.

    ``content`` is split on whitespace, prefixed with ``n - 1`` ``"[BEG]"``
    markers and terminated by one ``"[END]"`` marker. Each pair holds the
    ``n`` consecutive tokens joined by single spaces and the token that
    immediately follows them.

    Note that the first context already contains the first real token, so
    the first real token never appears as a target.
    """
    tokens = ["[BEG]"] * (n - 1)
    tokens.extend(content.split())
    tokens.append("[END]")

    # One pair per start position that still has a following token
    return [
        (" ".join(tokens[start:start + n]), tokens[start + n])
        for start in range(len(tokens) - n)
    ]

In [35]:
# Replace punctuation with blanks so neighbouring words stay separated
def remove_punctuation(text):
    """Return ``text`` with each character of ``string.punctuation`` turned into a space.

    The replacement is one-for-one, so the length of the text is unchanged.
    Characters outside ``string.punctuation`` are left untouched.
    """
    blank_for_each_mark = dict.fromkeys(string.punctuation, " ")
    table = str.maketrans(blank_for_each_mark)
    return text.translate(table)

# Sample part of the corpus and strip punctuation from the sampled texts
def process_sample_df(df, frac=0.01, random_state=42):
    """Return a random sample of ``df`` whose ``content`` has no punctuation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``content`` column; it is not modified.
    frac : float, default 0.01
        Fraction of rows to draw (the default keeps 1% of the rows).
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample``.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, with ``content`` passed through
        ``remove_punctuation``.
    """
    subset = df.sample(frac=frac, random_state=random_state)
    cleaned_content = subset["content"].apply(remove_punctuation)
    return subset.assign(content=cleaned_content)

In [36]:
sampled_df = process_sample_df(df)

# Generate n-grams and targets for the sampled DataFrame
n = 3  # Number of tokens per context window; change n for different n-grams

# Only the "id" and "content" columns are needed, so iterate over those two
# columns directly instead of materialising a Series per row with iterrows()
all_ngrams = [
    {"id": doc_id, "ngram": ngram, "target": target}
    for doc_id, text in zip(sampled_df["id"], sampled_df["content"])
    for ngram, target in generate_ngrams_with_masks(text, n)
]

# Create a new DataFrame with n-grams and targets; explicit columns keep the
# schema stable even when the sample produced no n-grams at all
ngrams_df = pd.DataFrame(all_ngrams, columns=["id", "ngram", "target"])

In [37]:
# Preview the first 20 rows: source document id, n-gram context and target token
ngrams_df.head(20)

Unnamed: 0,id,ngram,target
0,4972233,[BEG] [BEG] Lewis,Montagna
1,4972233,[BEG] Lewis Montagna,[END]
2,1389145,[BEG] [BEG] Márcio,Lomiranda
3,1389145,[BEG] Márcio Lomiranda,Márcio
4,1389145,Márcio Lomiranda Márcio,Lomiranda
5,1389145,Lomiranda Márcio Lomiranda,Márcio
6,1389145,Márcio Lomiranda Márcio,Silvio
7,1389145,Lomiranda Márcio Silvio,Cotti
8,1389145,Márcio Silvio Cotti,de
9,1389145,Silvio Cotti de,Miranda
