# Data Filtering

In [56]:
import re

In [57]:
# data quality filtering

# define a function to clean a text
def clean_text(text):
  """Normalize raw text to lowercase ASCII letters separated by single spaces.

  Every character that is not an ASCII letter or whitespace (digits,
  punctuation, symbols, non-ASCII letters) is dropped. Whitespace runs,
  including the gaps left behind by removed characters, are collapsed to a
  single space, and leading/trailing whitespace is stripped.

  Args:
    text: The string to clean.

  Returns:
    The cleaned, lower-cased string; an empty string when no letters remain.
  """
  # keep only ASCII letters and whitespace
  letters_only = re.sub(r'[^A-Za-z\s]', '', text)
  # collapse whitespace runs -- deleting characters can leave doubled spaces
  # behind (e.g. "a 1 b" -> "a  b"), which strip() alone does not fix
  single_spaced = re.sub(r'\s+', ' ', letters_only)
  return single_spaced.lower().strip()

In [58]:
# example data: a symbols-only string, an already clean sentence,
# and a sentence that ends in punctuation
texts = [
    "!!!$$%%&&****",
    "This is a useful article on AI",
    "AI is the future!!"
]

In [59]:
# filter data: run every example through clean_text and show the results
print(list(map(clean_text, texts)))

['', 'this is a useful article on ai', 'ai is the future']


# Tokenization

In [60]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [61]:
# example sentence that the tokenization step splits into tokens
sentence = "This is a useful article on AI"

In [62]:
# tokenization: split the sentence into a list of tokens using
# word_tokenize (imported from nltk.tokenize) and display the list
tokens = word_tokenize(sentence)
print(tokens)

['This', 'is', 'a', 'useful', 'article', 'on', 'AI']


# Random Sampling of Tokens

In [63]:
import random

In [64]:
# example dataset with 1 million tokens ("token_1" ... "token_1000000")
total_tokens = [f"token_{i}" for i in range(1, 1000001)]

# select only 1% of tokens (10,000 tokens)
sample_size = int(0.01 * len(total_tokens))

# draw without replacement from a dedicated, seeded generator so that
# Restart & Run All reproduces the same sample; the module-level `random`
# state is left untouched
SAMPLING_SEED = 42
sampling_rng = random.Random(SAMPLING_SEED)
reduced_tokens = sampling_rng.sample(total_tokens, sample_size)

In [65]:
# report the size of the full dataset and of the sample
print(f"Total tokens: {len(total_tokens)}")
print(f"Reduced tokens: {len(reduced_tokens)}")
# NOTE(review): this reuses the "Reduced tokens" label of the count above,
# but it shows a preview of the first 10 sampled tokens, not a count
print(f"Reduced tokens: {reduced_tokens[:10]}")

Total tokens: 1000000
Reduced tokens: 10000
Reduced tokens: ['token_501001', 'token_248913', 'token_651173', 'token_264517', 'token_823195', 'token_685837', 'token_484482', 'token_648052', 'token_743765', 'token_520131']


# Stratified Sampling of tokens

In [66]:
from collections import Counter
import numpy as np

In [67]:
# example dataset: each token repeated as many times as it should occur
token_data = ["apple"] * 1 + ["banana"] * 2 + ["cherry"] * 3 + ["date"] * 4

# tally the occurrences of every distinct token
token_counts = Counter(token_data)
print(token_counts)

Counter({'date': 4, 'cherry': 3, 'banana': 2, 'apple': 1})


In [68]:
# define the fraction of each token type to keep (0.5 = 50% of each type)
percentage_to_keep = 0.5

In [69]:
# perform stratified sampling: shrink every token type by the same fraction
reduced_data = []

for token in token_counts:
  count = token_counts[token]
  # int() truncates the scaled count; max(1, ...) ensures at least
  # one copy of every token type is kept
  sample_size = max(1, int(percentage_to_keep * count))
  reduced_data += [token] * sample_size

In [70]:
# show the full token list next to the reduced one for comparison
print("Original Tokens: ", token_data)
print("Reduced Tokens: ", reduced_data)

Original Tokens:  ['apple', 'banana', 'banana', 'cherry', 'cherry', 'cherry', 'date', 'date', 'date', 'date']
Reduced Tokens:  ['apple', 'banana', 'cherry', 'date', 'date']


# Reducing Data Using TF-IDF scores

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:
# sample text: three short documents that form the corpus
# passed to the TF-IDF vectorizer
corpus = [
    "Machine learning is amazing",
    "Deep learning is awesome",
    "Natural language processing is interesting"
]

In [73]:
# compute tf-idf scores
# fit the vectorizer on the corpus and transform it in one step; per the
# scikit-learn docs the result is a sparse document-term matrix
# (one row per document, one column per term)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
# term names, indexed by the same position used for the score columns below
feature_names = vectorizer.get_feature_names_out()

In [74]:
# get top words based on tf-idf: keep every term whose score is strictly
# greater than the threshold
# NOTE(review): only row 0 of the dense matrix (the first document,
# "Machine learning is amazing") is inspected, so terms that appear only in
# the other documents are never considered -- confirm whether a corpus-wide
# selection was intended
importance_threshold = 0.2
important_words = [
    feature_names[idx] for idx, score in enumerate(tfidf_matrix.toarray()[0]) if score > importance_threshold
]

In [75]:
# show the terms that passed the importance threshold
print("Important words for training: ", important_words)

Important words for training:  ['amazing', 'is', 'learning', 'machine']


# Word Embedding

In [76]:
import numpy as np

In [77]:
# initialize random embedding: one vector of `embedding_size` uniform
# values in [0, 1) per distinct word
embedding_size = 9
vocab = ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]

# seeded generator so the embeddings are identical on every run
EMBEDDING_SEED = 42
embedding_rng = np.random.default_rng(EMBEDDING_SEED)

# "the" occurs twice in vocab; dict.fromkeys drops the repeat (keeping
# first-seen order) so no vector is drawn only to be overwritten by the
# duplicate key
embedding_dict = {
    word: embedding_rng.random(embedding_size) for word in dict.fromkeys(vocab)
}

In [78]:
# print embeddings for each word, one "word: vector" entry per dictionary key
for word, embedding in embedding_dict.items():
  print(f"{word}: {embedding}")

the: [0.29649993 0.06894139 0.64594602 0.2261799  0.34018184 0.37321896
 0.02797216 0.6193868  0.45396172]
quick: [6.40223200e-01 1.60633294e-01 5.31612529e-01 8.98870236e-01
 9.97338341e-01 5.74264070e-01 2.89786510e-01 2.67043774e-01
 8.84860827e-04]
brown: [0.79381324 0.43095386 0.95665607 0.5780545  0.59604417 0.03567627
 0.02466823 0.93072177 0.04986714]
fox: [0.31388851 0.30469892 0.09565952 0.66766076 0.91579526 0.85856697
 0.32946515 0.59259439 0.82616203]
jumps: [0.85945934 0.03656065 0.12017924 0.44196285 0.50546647 0.27802044
 0.04518222 0.16086026 0.37015752]
over: [0.95955083 0.55091684 0.56349494 0.16655973 0.5753885  0.55467389
 0.15906548 0.53726099 0.55166269]
lazy: [0.57359462 0.54968877 0.63229307 0.90689625 0.20123622 0.40803974
 0.99954071 0.44123054 0.63231006]
dog: [0.13622454 0.85477554 0.02056476 0.56636478 0.13533947 0.03022967
 0.48237713 0.69857204 0.63903754]
