In [None]:
!pip install datasketch datasets



## Load Dataset

In [1]:
from datasets import load_dataset, Dataset

In [2]:

ds = load_dataset("habanoz/WildChat-turkce")['train']

## Normalize Chat Data

In [3]:
ds

Dataset({
    features: ['conversation_hash', 'model', 'timestamp', 'conversation', 'turn', 'language', 'openai_moderation', 'detoxify_moderation', 'toxic', 'redacted', 'state', 'country', 'hashed_ip', 'header'],
    num_rows: 6104
})

In [4]:
def flatten_chat(chat):
    return "\n".join(f"{t['role']}: {t['content']}" for t in chat)

ds = ds.map(lambda x: {'chat': flatten_chat(x['conversation'])})

In [5]:
ds

Dataset({
    features: ['conversation_hash', 'model', 'timestamp', 'conversation', 'turn', 'language', 'openai_moderation', 'detoxify_moderation', 'toxic', 'redacted', 'state', 'country', 'hashed_ip', 'header', 'chat'],
    num_rows: 6104
})

## Remove Exact Deduplicates

In [6]:
import pandas as pd

df = ds.to_pandas()
print("Original:", len(df))
deduped = df.drop_duplicates("chat")
print("Unique:", len(deduped))
print("Exact duplicates:", len(df) - len(deduped))

ds_dedup = Dataset.from_pandas(deduped)

Original: 6104
Unique: 6092
Exact duplicates: 12


## Near Deduplicates

In [7]:
from datasketch import MinHash, MinHashLSH

def get_minhash(text, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for word in text.split():
        m.update(word.encode('utf8'))
    return m

lsh = MinHashLSH(threshold=0.9, num_perm=128)
minhashes = {}

for i, t in enumerate(ds_dedup["chat"]):
    m = get_minhash(t)
    lsh.insert(i, m)
    minhashes[i] = m

dup_groups = []
seen = set()
for i in minhashes:
    if i not in seen:
        near_dups = lsh.query(minhashes[i])
        if len(near_dups) > 1:
            dup_groups.append(near_dups)
        seen.update(near_dups)

print("Near-duplicate groups (whole chats):", len(dup_groups))

Near-duplicate groups (whole chats): 110


In [8]:
to_remove = set()
for group in dup_groups:
    # keep the first one, remove the others
    for idx in group[1:]:
        to_remove.add(idx)

print("Chats to remove:", len(to_remove))

Chats to remove: 244


In [9]:
keep_idx = [i for i in range(len(ds_dedup)) if i not in to_remove]
ds_near_dedup = ds_dedup.select(keep_idx)

print("Original:", len(ds_dedup))
print("After deduplication:", len(ds_near_dedup))


Original: 6092
After deduplication: 5848


## Duplicate turns (inside conversations)

In [10]:
from collections import Counter

all_turns = []
for conv in ds_near_dedup["conversation"]:
    for t in conv:
        if t["role"] == "assistant":  # or include user too
            all_turns.append(t["content"].strip())

turn_counts = Counter(all_turns)
repeated_turns = [(t, c) for t, c in turn_counts.items() if c > 1]

print("Unique turns:", len(turn_counts))
print("Repeated turns:", len(repeated_turns))
print("Most common turns:", turn_counts.most_common(10))

Unique turns: 14997
Repeated turns: 104
Most common turns: [('Merhaba! Size nasıl yardımcı olabilirim?', 233), ('Hello! How can I assist you today?', 39), ('Merhaba, nasıl yardımcı olabilirim?', 8), ('Merhaba, size nasıl yardımcı olabilirim?', 8), ('Merhaba! Nasıl yardımcı olabilirim?', 7), ('Evet, Türkçe biliyorum. Size nasıl yardımcı olabilirim?', 7), ('Evet, Türkçe konuşabilirim. Size nasıl yardımcı olabilirim?', 6), ('Doğru.', 6), ('Doğru', 5), ('Hello! How can I help you today?', 4)]


In [11]:
greeting_turns = set(turn  for turn, count in turn_counts.most_common(7))

In [12]:
greeting_turns

{'Evet, Türkçe biliyorum. Size nasıl yardımcı olabilirim?',
 'Evet, Türkçe konuşabilirim. Size nasıl yardımcı olabilirim?',
 'Hello! How can I assist you today?',
 'Merhaba! Nasıl yardımcı olabilirim?',
 'Merhaba! Size nasıl yardımcı olabilirim?',
 'Merhaba, nasıl yardımcı olabilirim?',
 'Merhaba, size nasıl yardımcı olabilirim?'}

In [13]:
turns_removed = 0
def remove_greeting_turns(example):
    global turns_removed
    if example['conversation'][1]['content'] in greeting_turns:
        example['conversation'] = example['conversation'][2:]  # remove first and second turn that involves greeting
        turns_removed+=2
    return example

ds_near_dedup = ds_near_dedup.map(remove_greeting_turns)
ds_near_dedup = ds_near_dedup.filter(lambda x: len(x['conversation']) > 1)  # keep only if more than 1 turn

Map:   0%|          | 0/5848 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5848 [00:00<?, ? examples/s]

In [14]:
turns_removed = 0
def remove_greeting_turns(example):
    global turns_removed
    if example['conversation'][1]['content'] in greeting_turns:
        example['conversation'] = example['conversation'][2:]  # remove first and second turn that involves greeting
        turns_removed+=2
    return example

ds_near_dedup = ds_near_dedup.map(remove_greeting_turns)

Map:   0%|          | 0/5844 [00:00<?, ? examples/s]

## Remove Political Bias

In [15]:
indices_to_remove = list()
for i, row in enumerate(ds_near_dedup):
    if set(['tayyip','erdoğan']) & set(row['chat'].lower().split()):
        indices_to_remove.append(i)

In [16]:
len(indices_to_remove)

6

In [17]:
keep_idx = [i for i in range(len(ds_near_dedup)) if i not in indices_to_remove]
ds_near_dedup = ds_near_dedup.select(keep_idx)

## Final Dataset

In [21]:
print("Orignal dataset size:", len(ds))
print("After cleanup:", len(ds_near_dedup))
print("Percentage removed:", (len(ds) - len(ds_near_dedup)) / len(ds) * 100)

Orignal dataset size: 6104
After cleanup: 5838
Percentage removed: 4.3577981651376145


## Anonymize PII Data

In [39]:
import re

patterns = {
    "EMAIL": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
    "PHONE": r"(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{3,4}[-.\s]?\d{4}",
    "CREDIT_CARD": r"\b(?:\d[ -]*?){13,19}\b",
    "TCK": r"\b\d{11}\b",
    "IP": r"\b(?:\d{1,3}\.){3}\d{1,3}\b",
}

anonymized = {
    "EMAIL" : "someone@gmail.com",
    "PHONE": "+90 555 555 5555",
    "CREDIT_CARD": "4111 1111 1111 1111",
    "TCK": "12345678901",
    "IP": "192.168.1.0"
}

n_cleaned = 0

def redact_pii(text):
    global n_cleaned
    for label, pattern in patterns.items():
        clean_text = re.sub(pattern, f"<{anonymized[label]}>", text, flags=re.IGNORECASE)
    n_cleaned += (text != clean_text)
    return clean_text

def redact_chat(conversation):
    for con in conversation:
        con['content'] = redact_pii(con['content'])
    return conversation

In [40]:
ds_no_pii = ds_near_dedup.map(lambda x: {'conversation': redact_chat(x['conversation'])})

Map:   0%|          | 0/5838 [00:00<?, ? examples/s]

In [27]:
n_cleaned

39

## Save Clean Data

In [42]:
final_ds= ds_no_pii

In [51]:
final_ds = final_ds.remove_columns(set(final_ds.features.keys())-set(["conversation"]))

In [52]:
final_ds

Dataset({
    features: ['conversation'],
    num_rows: 5838
})

In [53]:
final_ds = final_ds.map(lambda x: {'conversation':  [ {'role':msg['role'], 'content': msg['content']} for msg in x['conversation']]})  # remove any turn data except role and content

Map:   0%|          | 0/5838 [00:00<?, ? examples/s]

In [57]:
final_ds.push_to_hub("habanoz/WildChat-turkce-cleaned", private=False, token="xxxx")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


CommitInfo(commit_url='https://huggingface.co/datasets/habanoz/WildChat-turkce-cleaned/commit/d40f5211ec134b08a098af499dfcd09e7725a8d5', commit_message='Upload dataset', commit_description='', oid='d40f5211ec134b08a098af499dfcd09e7725a8d5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/habanoz/WildChat-turkce-cleaned', endpoint='https://huggingface.co', repo_type='dataset', repo_id='habanoz/WildChat-turkce-cleaned'), pr_revision=None, pr_num=None)