In [1]:
import datasets
import numpy as np
#import cleaning
from get_data import fetch_data
import re
import tiktoken
# Worker-process count handed to the `num_proc=` argument of the dataset
# map/filter calls in this notebook.
num_proc = 8

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw training and validation splits through the project-local
# `get_data.fetch_data` helper (presumably Hugging Face datasets, given the
# map/filter calls applied to them later — confirm against get_data).
train_data, val_data = fetch_data()



In [3]:
# Sanity check: display the raw text of the first training document.
train_data[0]['text']

'November 24, 2016 – World News, Breaking News\nWednesday, April 24, 2019\nLatest:\nFitbit introduced “smart” watches, Versa Lite, and a fitness bracelets under $100\nUpgraded Acer laptop Aspire 3 entered the Ukrainian market with a price tag from 8939 UAH\nMicrosoft discontinues support fitness bracelets Band, but promises compensation to users\nSamsung Connect combines Wi-Fi and the Internet of things in a single device\nMWC 2019: Nubia announced a smart watch Alpha camera and flexible OLED display\nWorld News, Breaking News\nRussia, USA, World\nSOCIETY\nUSA NEWS\nPOLITICS\nECONOMICS\nBUSINESS\nFINANCES\nAuto\nTECHNOLOGY AND MEDIA\nVIDEO NEWS\nSPORT\nDay: November 24, 2016\nSPORT\nBest of the best: from Singapore Ukrainian swimmers Romanchuk and Zevin brought home 5 awards of the world Cup!\nNovember 24, 2016 trump\nUkrainian swimmers once again prove that they are the fastest in the world. After the end of the world Cup,\nRead more\nCULTURE\nBlue velvet is a creepy, seductive, and a

In [3]:
# Word-level removal of non-English tokens.
# The pattern below matches any run of characters outside the 7-bit ASCII
# range; it is defined here but not referenced by filter_non_english itself.
non_english_regex = re.compile(r'[^\u0000-\u007F]+')


def filter_non_english(example):
    """Drop every whitespace-separated token flagged by ``cleaning.filter_noneng``.

    The text is split on arbitrary whitespace and the surviving tokens are
    re-joined with single spaces, so original newlines and repeated spaces are
    not preserved.

    NOTE(review): relies on a ``cleaning`` module being present in the notebook
    namespace; the ``import cleaning`` line in the first cell is commented out.

    Args:
        example: mapping with a ``'text'`` string.

    Returns:
        dict with the rebuilt ``'text'`` plus the token counts before
        (``'before_filter'``) and after (``'after_filter'``) filtering.
    """
    tokens = example['text'].split()
    kept = []
    for token in tokens:
        if cleaning.filter_noneng(token):
            continue  # flagged as non-English -> discard
        kept.append(token)
    return {
        'text': " ".join(kept),
        'before_filter': len(tokens),
        'after_filter': len(kept),
    }

In [4]:
def filter_pii(example):
    """Run ``cleaning.clean_pii`` over one example's text and record its size change.

    NOTE(review): relies on a ``cleaning`` module being present in the notebook
    namespace; the ``import cleaning`` line in the first cell is commented out.

    Args:
        example: mapping with a ``'text'`` string.

    Returns:
        dict with the cleaned ``'text'`` plus the character counts before
        (``'before_filter'``) and after (``'after_filter'``) cleaning.
    """
    original = example['text']
    scrubbed = cleaning.clean_pii(original)
    return {
        'text': scrubbed,
        'before_filter': len(original),
        'after_filter': len(scrubbed),
    }

In [5]:
def clean(dataset, bad_words_path='HW3/llms-class-hw-3-main/bad_words/bad_words_list.txt'):
    """Run the three-stage cleaning pipeline over ``dataset`` and print statistics.

    Stages, in order:
      1. ``dataset.map(filter_non_english)`` — word-level non-English removal.
      2. ``dataset.filter(...)`` with ``cleaning.clean_other(text, bad_words=...)``
         as the predicate; rows for which it returns a falsy value are dropped
         (per ``filter`` semantics).
      3. ``dataset.map(filter_pii)`` — PII scrubbing.

    Every stage uses the notebook-level ``num_proc`` worker count.

    NOTE(review): relies on a ``cleaning`` module being present in the notebook
    namespace; the ``import cleaning`` line in the first cell is commented out.

    Args:
        dataset: object exposing ``map``, ``filter``, ``len()`` and column access
            by name (presumably a Hugging Face ``datasets.Dataset``).
        bad_words_path: newline-separated word list used by the offensive-words
            stage. Defaults to the path that was previously hard-coded, which is
            resolved relative to the current working directory.

    Returns:
        The cleaned dataset. It still carries the ``before_filter`` and
        ``after_filter`` columns written by the final map stage.
    """
    print("Filtering non-english words: ")
    dataset = dataset.map(filter_non_english, num_proc=num_proc)
    before_filter = np.sum(dataset['before_filter'])
    after_filter = np.sum(dataset['after_filter'])
    print(f'Words before filtering: {before_filter}')
    print(f'Words after filtering: {after_filter}')
    print(f'Words removed: {before_filter - after_filter}')
    print()

    print("Applying Offensive Words filtering: ")
    before_filter = len(dataset)
    # Context manager so the file handle is closed deterministically instead of
    # being left to the garbage collector.
    with open(bad_words_path) as bad_words_file:
        bad_words_list = bad_words_file.read().split('\n')
    dataset = dataset.filter(
        lambda x: cleaning.clean_other(x['text'], bad_words=bad_words_list),
        num_proc=num_proc,
    )
    after_filter = len(dataset)
    print(f'Length before filtering: {before_filter}')
    print(f'Length after filtering: {after_filter}')
    print(f'Documents removed: {before_filter - after_filter}')
    print()

    print("Applying PII filter: ")
    # Uses the shared worker count like the other stages (was a literal 8).
    dataset = dataset.map(filter_pii, num_proc=num_proc)
    before_filter = np.sum(dataset['before_filter'])
    after_filter = np.sum(dataset['after_filter'])
    print(f"Charaters before filtering: {before_filter}")
    print(f"Charaters after filtering: {after_filter}")
    print(f'Characters removed: {before_filter - after_filter}')

    return dataset

In [6]:
# Run the full cleaning pipeline on the training split; clean() prints the
# per-stage statistics shown below.
train_data_filtered = clean(train_data)

Filtering non-english words: 
Words before filtering: 264841705
Words after filtering: 258956021
Words removed: 5885684

Applying Offensive Words filtering: 
Length before filtering: 281979
Length after filtering: 250435
Documents removed: 31544

Applying PII filter: 
Charaters before filtering: 1285062338
Charaters after filtering: 1278348826
Characters removed: 6713512


In [7]:
# Number of training documents that survived cleaning.
print(len(train_data_filtered))

250435


In [8]:
# Same cleaning pipeline, applied to the validation split.
val_data_filtered = clean(val_data)

Filtering non-english words: 
Words before filtering: 13952369
Words after filtering: 13608836
Words removed: 343533

Applying Offensive Words filtering: 
Length before filtering: 14842
Length after filtering: 13225
Documents removed: 1617

Applying PII filter: 
Charaters before filtering: 67692417
Charaters after filtering: 67353887
Characters removed: 338530


In [9]:
# Bundle the cleaned splits into one DatasetDict, dropping the two bookkeeping
# columns that clean() uses for its printed statistics.
cols_to_remove = ['before_filter', 'after_filter']
filtered_dataset = datasets.DatasetDict({
    split_name: split.remove_columns(cols_to_remove)
    for split_name, split in (
        ('train', train_data_filtered),
        ('val', val_data_filtered),
    )
})
filtered_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'timestamp', 'url'],
        num_rows: 250435
    })
    val: Dataset({
        features: ['text', 'timestamp', 'url'],
        num_rows: 13225
    })
})

In [14]:
# Persist both cleaned splits under the relative path 'cleaned_data.arrow'
# (resolved against the kernel's current working directory).
filtered_dataset.save_to_disk('cleaned_data.arrow')

Saving the dataset (3/3 shards): 100%|██████████| 250435/250435 [00:00<00:00, 282093.42 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 13225/13225 [00:00<00:00, 263694.26 examples/s]


In [3]:
# Reload the cleaned splits from disk so tokenisation can start without
# re-running the cleaning cells.
# NOTE(review): this is an absolute, machine-specific path, whereas the save
# cell wrote to the relative path 'cleaned_data.arrow' — confirm both point at
# the same location before running on another machine.
filtered_dataset = datasets.load_from_disk('/home/ubuntu/LLM/HW3/llms-class-hw-3-main/src/cleaned_data.arrow')
filtered_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'timestamp', 'url'],
        num_rows: 250435
    })
    val: Dataset({
        features: ['text', 'timestamp', 'url'],
        num_rows: 13225
    })
})

In [7]:
# GPT-2 byte-pair encoding, shared by every tokenise() call.
tokenizer = tiktoken.get_encoding('gpt2')


def tokenise(sample):
    """Encode one example's text to GPT-2 token ids and append the end-of-text id.

    Args:
        sample: mapping with a ``'text'`` string.

    Returns:
        dict with a single key ``'tokenised_text'``: the list of token ids for
        the text followed by ``tokenizer.eot_token`` as a document delimiter.
    """
    # encode_ordinary() treats special-token strings that happen to occur in
    # the raw text (e.g. a literal "<|endoftext|>" on a scraped page) as plain
    # text, whereas encode() raises on them by default and would abort the map.
    # For text without such strings the two produce the same ids.
    tokenised_text = tokenizer.encode_ordinary(sample['text'])
    tokenised_text.append(tokenizer.eot_token)
    return {"tokenised_text": tokenised_text}

In [9]:
# Pull the two cleaned splits back out of the DatasetDict.
train_data_filtered = filtered_dataset['train']
val_data_filtered = filtered_dataset['val']

# Tokenise each split in parallel; map() adds a 'tokenised_text' column holding
# the ids returned by tokenise().
train_data_tokenised = train_data_filtered.map(tokenise, num_proc=num_proc)
val_data_tokenised = val_data_filtered.map(tokenise, num_proc=num_proc)

Map (num_proc=8): 100%|██████████| 250435/250435 [01:02<00:00, 3986.77 examples/s]
Map (num_proc=8): 100%|██████████| 13225/13225 [00:03<00:00, 4305.37 examples/s]


In [10]:
# Flatten the per-document token lists into one 1-D token stream; each list
# already ends with the end-of-text id appended by tokenise().
# NOTE(review): this rebinds `train_data`, which earlier held the raw dataset
# split returned by fetch_data(), to a NumPy array.
train_data = np.concatenate(train_data_tokenised['tokenised_text'])
train_data.shape

(314786851,)

In [15]:
# Flatten the per-document token lists of the validation split into one 1-D
# token stream.
# NOTE(review): this rebinds `val_data`, which earlier held the raw dataset
# split returned by fetch_data(), to a NumPy array.
val_data = np.concatenate(val_data_tokenised['tokenised_text'])
val_data.shape

(16483617,)

In [18]:
# Store both token streams in a single .npz archive under the keys 'train' and
# 'val'.
# NOTE(review): assumes the ../dataset directory already exists — confirm.
np.savez('../dataset/tokens.npz', train = train_data, val = val_data)