In [1]:
# ✅ Install dependencies if not already installed
# !pip install trafilatura

import trafilatura
import requests

# Example: An arXiv paper abstract page
url = "https://arxiv.org/abs/2404.00001"

# Step 1: Fetch raw HTML
# A timeout keeps the cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() stops an HTTP error page (4xx/5xx) from
# being passed to the extractor as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text

# Step 2: Use Trafilatura to extract clean text
# (extract() returns None when it cannot find any main content)
downloaded_text = trafilatura.extract(html, include_comments=False, include_tables=False)

# Step 3: Display the result
print("📄 Extracted Text Preview:\n")
if downloaded_text is None:
    print("(no extractable text found)")
else:
    print(downloaded_text[:1000])  # Show first 1000 characters


📄 Extracted Text Preview:

Physics > Physics Education
[Submitted on 3 Feb 2024]
Title:Uso de herramientas digitales matemáticas en la Educación Secundaria
View PDF HTML (experimental)Abstract:Information and Community Technologies (ICT) are very present in our society nowadays and particularly in the educative field. In just two decades, we have passed from a learning based, in many cases, on the master lessons to one such that methodologies like the flipped classroom or the gamification are stronger than ever. Along this work, we have done a study to teachers and students with the main objective to compare the knowledge on digital tools, their use and their acceptation. We use WxMaxima and Geogebra in order to solve an exercise of \textit{Evaluación de Bachillerato para el Acceso a la Universidad} (EBAU) related with Geometry, comparing their ins and outs with the manual solution. Finally, we expose some conclusions and some possible research lines about digital tools, as well as a p

In [2]:
# Install (both are required): sudo apt install tesseract-ocr  AND  pip install pytesseract Pillow
import pytesseract
from PIL import Image

# Open the sample image in "L" (grayscale) mode, then run Tesseract on it
image = Image.open("./test_data/image/image.png").convert(mode="L")
text = pytesseract.image_to_string(image)

# Header line followed by a 500-character preview of the recognised text
print("📄 Tesseract OCR Output (first 500 chars):", text[:500], sep="\n")


📄 Tesseract OCR Output (first 500 chars):
a 4: Leading intelligence.

Unrivaled speed and efficiency.

The most accessible and scalable generation of Llama is here.
Native multimodality, mixture-of-experts models, super long
context windows, step changes in performance, and
unparalleled efficiency. All in easy-to-deploy sizes custom fit for
how you want to use it.



In [3]:
# Install: pip install openai-whisper
import whisper

# Pick the checkpoint to load: "base" here; "small", "medium" or "large" also work
model = whisper.load_model("base")

# Run speech-to-text on the sample clip and show the transcript under a header
result = model.transcribe("./test_data/audio/sample-1.mp3")
print("📄 Whisper Transcription:", result["text"], sep="\n")




📄 Whisper Transcription:
 At the wui pace or breathing, people practice on the elements, instead of 일반 music, and live there,


In [2]:
from datasketch import MinHash, MinHashLSH

def minhash_deduplication(texts, threshold=0.7, num_perm=128):
    """Drop near-duplicate documents using MinHash signatures and an LSH index.

    Documents are processed in order. Each one is reduced to the set of its
    whitespace-separated tokens and hashed into a MinHash signature. It is
    kept only if the LSH index returns no previously kept document as a
    candidate match; otherwise it is treated as a duplicate and dropped.
    LSH is approximate, so borderline pairs may be classified either way.

    Args:
        texts: Iterable of document strings.
        threshold: Jaccard similarity threshold passed to ``MinHashLSH``.
        num_perm: Number of permutations used for the signatures. The same
            value is used for the index and for every ``MinHash`` so the
            two always agree.

    Returns:
        A list with the kept documents, in their original order.
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    unique_texts = []
    for i, doc in enumerate(texts):
        m = MinHash(num_perm=num_perm)
        # set(): each distinct token contributes once; order is irrelevant
        for word in set(doc.split()):
            m.update(word.encode('utf8'))
        # An empty query result means no kept document is a candidate match.
        if not lsh.query(m):
            lsh.insert(f"doc{i}", m)
            unique_texts.append(doc)
    return unique_texts
    

In [3]:
from langdetect import detect
from bs4 import BeautifulSoup

def clean_html_and_filter_lang(texts, lang='en'):
    """Strip HTML markup from each text and keep those detected as ``lang``.

    Args:
        texts: Iterable of raw strings, possibly containing HTML.
        lang: Language code to keep, compared against the value returned by
            ``langdetect.detect``.

    Returns:
        A list of the tag-free, whitespace-stripped texts whose detected
        language equals ``lang``. Texts for which detection fails are
        skipped.
    """
    filtered = []
    for txt in texts:
        # NOTE: get_text() joins text nodes without a separator, so the
        # contents of adjacent elements run together ("<h1>A</h1><p>B</p>"
        # becomes "AB").
        plain = BeautifulSoup(txt, 'html.parser').get_text().strip()
        try:
            detected = detect(plain)
        except Exception:
            # Best-effort filter: skip texts whose language cannot be
            # detected. Catching Exception rather than using a bare
            # `except:` lets KeyboardInterrupt and SystemExit propagate,
            # so a long run can still be interrupted.
            continue
        if detected == lang:
            filtered.append(plain)
    return filtered
    

In [4]:
import re

def strip_pii(text):
    """Replace simple PII patterns in ``text`` with placeholder tags.

    Three substitutions are applied in order:

    * e-mail addresses            -> ``[EMAIL]``
    * runs of 12 to 19 digits     -> ``[CREDIT_CARD]``
    * ``ddd-ddd-dddd`` numbers    -> ``[PHONE]``

    These are lightweight heuristics: card numbers written with spaces or
    dashes, and phone numbers in other formats, are not matched.

    Args:
        text: Input string.

    Returns:
        The string with the matched spans replaced.
    """
    # The domain is a sequence of dot-separated labels, so a sentence-ending
    # period after the address ("... a@b.org.") is not swallowed. The local
    # part allows "+" so plus-addressed mail is masked in full.
    text = re.sub(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)*', '[EMAIL]', text)
    text = re.sub(r'\b\d{12,19}\b', '[CREDIT_CARD]', text)
    text = re.sub(r'\b(?:\d{3}-){2}\d{4}\b', '[PHONE]', text)
    return text
    

In [5]:
import re
from collections import Counter

def remove_repetitive_ngrams(text, n=3, threshold=3):
    """Collapse back-to-back repetitions of an n-gram into one occurrence.

    The text is split on whitespace into tokens. Every n-token phrase that
    occurs at least ``threshold`` times is a candidate. Wherever a candidate
    appears ``threshold`` or more times consecutively, the whole run is
    replaced by a single copy of the phrase. Finally, runs of two or more
    whitespace characters are collapsed to one space and the result is
    stripped.

    Args:
        text: Input string.
        n: Number of whitespace-separated tokens per phrase (>= 1).
        threshold: Minimum number of occurrences, and of consecutive
            repetitions, required for a run to be collapsed (>= 1).

    Returns:
        The cleaned string.

    Raises:
        ValueError: If ``n`` or ``threshold`` is smaller than 1. Such values
            would yield empty phrases or empty regex matches.
    """
    if n < 1 or threshold < 1:
        raise ValueError("n and threshold must be positive integers")

    words = text.split()
    # Tuples keep the individual tokens available for building the pattern.
    ngrams = [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

    counts = Counter(ngrams)
    repetitive = [ngram for ngram, count in counts.items() if count >= threshold]

    for ngram in repetitive:
        phrase = ' '.join(ngram)
        # Tokens may be separated by any whitespace in the source text
        # (double spaces, newlines), not only by a single space.
        body = r'\s+'.join(re.escape(word) for word in ngram)
        # (?<!\S) and (?!\S) pin every repetition to token boundaries, so the
        # phrase is never matched inside a longer word ("at" within "cat").
        pattern = rf'(?<!\S)(?:{body}(?!\S)\s*){{{threshold},}}'
        # A callable replacement inserts the phrase literally; a plain string
        # would have its backslashes interpreted as escape sequences.
        text = re.sub(pattern, lambda _match, phrase=phrase: phrase + ' ', text)

    # Remove extra spaces
    text = re.sub(r'\s{2,}', ' ', text).strip()
    return text


In [9]:
import pandas as pd
# Read the toy corpus and select its "Raw Text" column as the pipeline input
fake_texts = pd.read_csv("test_data/data/Fake_Pretraining_Texts.csv")
raw_dataset = fake_texts.loc[:, "Raw Text"]
print(raw_dataset)


0    Hello! Contact us at support@data.org or call ...
1    Hola! Este artículo está completamente en espa...
2    <html><body><div><h1>Breaking News</h1><p>This...
3    Buy now! Best product ever. Best product ever....
4    Python 3.14 introduces several improvements in...
5    Python 3.14 introduces several improvements in...
6    <div>For inquiries, email jane_doe@example.com...
7    Large Language Models are transforming the AI ...
8                  这是一个包含有用技术信息的中文段落。电话号码：010-12345678
9    Buy now! Best product ever. Best product ever....
Name: Raw Text, dtype: object


In [10]:
# Step 1: Strip HTML markup, then keep only the texts detected as English
step1 = clean_html_and_filter_lang(raw_dataset, lang='en')
display(step1)


['Hello! Contact us at support@data.org or call 123-456-7890. Your credit card 4111111111111111 was declined. This message is intended only for the recipient. Visit our site for more.',
 'Breaking NewsThis is a major event!Contact us',
 'Buy now! Best product ever. Best product ever. Best product ever.',
 'Python 3.14 introduces several improvements including better error messages. Learn more on the official site.',
 'Python 3.14 introduces several improvements including better error messages. Learn more on the official docs.',
 'Large Language Models are transforming the AI landscape with few-shot capabilities.',
 'Buy now! Best product ever. Best product ever. Best product ever.']

In [11]:
# Step 2: Drop near-duplicate paragraphs (MinHash + LSH, similarity threshold 0.7)
step2 = minhash_deduplication(step1, threshold=0.7)
display(step2)


['Hello! Contact us at support@data.org or call 123-456-7890. Your credit card 4111111111111111 was declined. This message is intended only for the recipient. Visit our site for more.',
 'Breaking NewsThis is a major event!Contact us',
 'Buy now! Best product ever. Best product ever. Best product ever.',
 'Python 3.14 introduces several improvements including better error messages. Learn more on the official site.',
 'Large Language Models are transforming the AI landscape with few-shot capabilities.']

In [12]:
# Step 3: Mask e-mail addresses, card numbers and phone numbers in every text
step3 = list(map(strip_pii, step2))
display(step3)


['Hello! Contact us at [EMAIL] or call [PHONE]. Your credit card [CREDIT_CARD] was declined. This message is intended only for the recipient. Visit our site for more.',
 'Breaking NewsThis is a major event!Contact us',
 'Buy now! Best product ever. Best product ever. Best product ever.',
 'Python 3.14 introduces several improvements including better error messages. Learn more on the official site.',
 'Large Language Models are transforming the AI landscape with few-shot capabilities.']

In [13]:
# Step 4: Collapse repeated n-grams in every text (default n and threshold)
cleaned_data = list(map(remove_repetitive_ngrams, step3))
display(cleaned_data)


['Hello! Contact us at [EMAIL] or call [PHONE]. Your credit card [CREDIT_CARD] was declined. This message is intended only for the recipient. Visit our site for more.',
 'Breaking NewsThis is a major event!Contact us',
 'Buy now! Best product ever.',
 'Python 3.14 introduces several improvements including better error messages. Learn more on the official site.',
 'Large Language Models are transforming the AI landscape with few-shot capabilities.']

In [14]:

# Pipeline finished: list every cleaned text under a numbered header
print("✅ Cleaned dataset sample:")
for idx, text in enumerate(cleaned_data):
    print(f"--- Article {idx + 1} ---", text, sep="\n")


✅ Cleaned dataset sample:
--- Article 1 ---
Hello! Contact us at [EMAIL] or call [PHONE]. Your credit card [CREDIT_CARD] was declined. This message is intended only for the recipient. Visit our site for more.
--- Article 2 ---
Breaking NewsThis is a major event!Contact us
--- Article 3 ---
Buy now! Best product ever.
--- Article 4 ---
Python 3.14 introduces several improvements including better error messages. Learn more on the official site.
--- Article 5 ---
Large Language Models are transforming the AI landscape with few-shot capabilities.
