In [1]:
! pip install langdetect

Collecting langdetect
  Using cached langdetect-1.0.9.tar.gz (981 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993225 sha256=1b012c2a501873fae6d9c4941f7afbe732e1ec18e5fa7644b2135a1c8b1d3d92
  Stored in directory: c:\users\idali\appdata\local\pip\cache\wheels\d1\c1\d9\7e068de779d863bc8f8fc9467d85e25cfe47fa5051fff1a1bb
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


  DEPRECATION: Building 'langdetect' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'langdetect'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [6]:
from langdetect import detect
from bs4 import BeautifulSoup

texts = [
    "<p>Hello world!</p>",
    "<p>This is Ida!</p>",
    "<div>Bonjour le monde!</div>",
    "Hola mundo",
    "<span> 12345 </span>"
]

def clean_html_and_filter_lang(texts, lang='en'):
    filtered = []
    for txt in texts:
        txt = BeautifulSoup(txt, 'html.parser').get_text()
        try:
            if detect(txt.strip()) == lang:
                filtered.append(txt.strip())
        except:
            continue
    return filtered
result = clean_html_and_filter_lang(texts, lang='en')
print(result)
for line in result:
    print(line)

['Hello world!', 'This is Ida!']
Hello world!
This is Ida!


In [7]:
# Step 3: Strip PII using regex
import re

def strip_pii(text):
    text = re.sub(r'[\w\.-]+@[\w\.-]+', '[EMAIL]', text)
    text = re.sub(r'\b\d{12,19}\b', '[CREDIT_CARD]', text)
    text = re.sub(r'\b(?:\d{3}-){2}\d{4}\b', '[PHONE]', text)
    return text

sample = "Contact me at john.doe@example.com or 123-456-7890. My card is 4111111111111111."
print(strip_pii(sample))

Contact me at [EMAIL] or [PHONE]. My card is [CREDIT_CARD].


In [10]:
# Step 4: Remove repetitive n-grams

import re
from collections import Counter

text = "I love you I love you I love you very much."

def remove_repetitive_ngrams(text, n=3, threshold=3):
    words = text.split()
    ngrams = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]

    counts = Counter(ngrams)
    repetitive = [ngram for ngram, count in counts.items() if count >= threshold]

    for phrase in repetitive:
        # regex-safe version of the phrase
        escaped_phrase = re.escape(phrase)
        # match the phrase repeated 2+ times with optional whitespace
        text = re.sub(rf'(?:{escaped_phrase}\s*){{{threshold},}}', phrase + ' ', text)

    # Remove extra spaces
    text = re.sub(r'\s{2,}', ' ', text).strip()
    return text

print(remove_repetitive_ngrams(text, n=3, threshold=3))

I love you very much.


In [14]:
# Step 7: prepare for the text data
import pandas as pd
fake_texts = pd.read_csv("C:/AI_Coop/Slides/Module 3/Week3/test_data/data/Fake_Pretraining_Texts.csv")
raw_dataset = fake_texts["Raw Text"]
print(raw_dataset)

0    Hello! Contact us at support@data.org or call ...
1    Hola! Este artículo está completamente en espa...
2    <html><body><div><h1>Breaking News</h1><p>This...
3    Buy now! Best product ever. Best product ever....
4    Python 3.14 introduces several improvements in...
5    Python 3.14 introduces several improvements in...
6    <div>For inquiries, email jane_doe@example.com...
7    Large Language Models are transforming the AI ...
8                  这是一个包含有用技术信息的中文段落。电话号码：010-12345678
9    Buy now! Best product ever. Best product ever....
Name: Raw Text, dtype: object


In [17]:
import string
from datasketch import MinHash, MinHashLSH

def normalize(text):
    return text.lower().translate(str.maketrans('', '', string.punctuation))

texts = [
    "The cat sat on the mat.",
    "On the mat sat the cat.",
    "The cat is sitting on the mat.",
    "A dog barked loudly at the cat.",
]

def minhash_deduplication(texts, threshold=0.98):
    lsh = MinHashLSH(threshold=threshold, num_perm=128)
    unique_texts = []
    for i, doc in enumerate(texts):
        m = MinHash(num_perm=128)
        
        for word in set(normalize(doc).split()):
            m.update(word.encode('utf8'))
         
        query_result = lsh.query(m)
        print(f"Query for doc{i}: {query_result}")
            
        if not query_result:
            lsh.insert(f"doc{i}", m)
            unique_texts.append(doc)

    return unique_texts


result = minhash_deduplication(texts, threshold=0.98)
print(result)

Query for doc0: []
Query for doc1: ['doc0']
Query for doc2: []
Query for doc3: []
['The cat sat on the mat.', 'The cat is sitting on the mat.', 'A dog barked loudly at the cat.']


In [18]:
# Step 1: Remove HTML + Language Filter
step1 = clean_html_and_filter_lang(raw_dataset)
display(step1)

['Hello! Contact us at support@data.org or call 123-456-7890. Your credit card 4111111111111111 was declined. This message is intended only for the recipient. Visit our site for more.',
 'Breaking NewsThis is a major event!Contact us',
 'Buy now! Best product ever. Best product ever. Best product ever.',
 'Python 3.14 introduces several improvements including better error messages. Learn more on the official site.',
 'Python 3.14 introduces several improvements including better error messages. Learn more on the official docs.',
 'Large Language Models are transforming the AI landscape with few-shot capabilities.',
 'Buy now! Best product ever. Best product ever. Best product ever.']

In [19]:
# Step 2: Deduplicate Paragraphs
step2 = minhash_deduplication(step1)
display(step2)


Query for doc0: []
Query for doc1: []
Query for doc2: []
Query for doc3: []
Query for doc4: []
Query for doc5: []
Query for doc6: ['doc2']


['Hello! Contact us at support@data.org or call 123-456-7890. Your credit card 4111111111111111 was declined. This message is intended only for the recipient. Visit our site for more.',
 'Breaking NewsThis is a major event!Contact us',
 'Buy now! Best product ever. Best product ever. Best product ever.',
 'Python 3.14 introduces several improvements including better error messages. Learn more on the official site.',
 'Python 3.14 introduces several improvements including better error messages. Learn more on the official docs.',
 'Large Language Models are transforming the AI landscape with few-shot capabilities.']

In [20]:
# Step 3: Strip PII
step3 = [strip_pii(t) for t in step2]
display(step3)

['Hello! Contact us at [EMAIL] or call [PHONE]. Your credit card [CREDIT_CARD] was declined. This message is intended only for the recipient. Visit our site for more.',
 'Breaking NewsThis is a major event!Contact us',
 'Buy now! Best product ever. Best product ever. Best product ever.',
 'Python 3.14 introduces several improvements including better error messages. Learn more on the official site.',
 'Python 3.14 introduces several improvements including better error messages. Learn more on the official docs.',
 'Large Language Models are transforming the AI landscape with few-shot capabilities.']

In [21]:
# Step 4: Remove Repetitive N-grams
cleaned_data = [remove_repetitive_ngrams(t) for t in step3]
display(cleaned_data)

['Hello! Contact us at [EMAIL] or call [PHONE]. Your credit card [CREDIT_CARD] was declined. This message is intended only for the recipient. Visit our site for more.',
 'Breaking NewsThis is a major event!Contact us',
 'Buy now! Best product ever.',
 'Python 3.14 introduces several improvements including better error messages. Learn more on the official site.',
 'Python 3.14 introduces several improvements including better error messages. Learn more on the official docs.',
 'Large Language Models are transforming the AI landscape with few-shot capabilities.']

In [22]:
# Done!
print("✅ Cleaned dataset sample:")
for idx, text in enumerate(cleaned_data):
    print(f"--- Article {idx + 1} ---")
    print(text)

✅ Cleaned dataset sample:
--- Article 1 ---
Hello! Contact us at [EMAIL] or call [PHONE]. Your credit card [CREDIT_CARD] was declined. This message is intended only for the recipient. Visit our site for more.
--- Article 2 ---
Breaking NewsThis is a major event!Contact us
--- Article 3 ---
Buy now! Best product ever.
--- Article 4 ---
Python 3.14 introduces several improvements including better error messages. Learn more on the official site.
--- Article 5 ---
Python 3.14 introduces several improvements including better error messages. Learn more on the official docs.
--- Article 6 ---
Large Language Models are transforming the AI landscape with few-shot capabilities.
