In [1]:
import os
# Make the local repository checkout the working directory.
# NOTE(review): the path is hard-coded relative to the user's home directory;
# presumably this is what lets the `src` package below be imported — confirm.
os.chdir(os.path.expanduser("~/git/gpt_from_scratch/"))
# NOTE(review): wildcard import — whatever names `load_data` exports land in the
# notebook namespace, so names used later without an explicit import may
# originate here. Prefer importing the needed names explicitly.
from src.data.load_data import *
from datasets import load_dataset
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "Bingsu/openwebtext_20p" dataset via `datasets.load_dataset`.
# (Judging by its name this is presumably a 20% subset of OpenWebText — it is
# NOT WikiText-2-raw-v1.)
dataset = load_dataset("Bingsu/openwebtext_20p")
# Only the 'train' split is used in this notebook.
train_data = dataset['train']
# Access to other splits is left disabled.
# NOTE(review): presumably this dataset ships no 'validation'/'test' split — confirm.
# valid_data = dataset['validation']
# test_data = dataset['test']

In [3]:
# Peek at the first training record to see what a single document looks like.
# (Only the training split is loaded, so there is nothing else to sample.)
first_record = train_data[0]
print(first_record)

{'text': 'If you live abroad and are requesting an ITIN for a foreign child who has been adopted or legally placed in your home pending adoption, remember to include a copy of the legal documents evidencing your relationship to the child.'}


In [4]:
# Corpus size (only the training split is loaded in this notebook).
num_train_docs = len(train_data)
print(f"Number of training documents: {num_train_docs}")

# Length of every document measured in whitespace-delimited words.
word_counts = [len(record['text'].split()) for record in train_data]

# Summary statistics of the per-document word counts.
mean_words = np.mean(word_counts)
median_words = np.median(word_counts)
max_words = np.max(word_counts)
min_words = np.min(word_counts)

print(f"Average words per document: {mean_words}")
print(f"Median words per document: {median_words}")
print(f"Max words in a document: {max_words}")
print(f"Min words in a document: {min_words}")


Number of training documents: 33167823
Average words per document: 39.604504281152245
Median words per document: 28.0
Max words in a document: 20975
Min words in a document: 1


In [8]:
# Import the class that is actually instantiated below. Previously only
# GPT2Tokenizer was imported here, so `AutoTokenizer` resolved only if some
# other import happened to put it in the namespace. GPT2Tokenizer stays
# importable for the explicit alternative kept in the comment below.
from transformers import AutoTokenizer, GPT2Tokenizer

# Initialize the pretrained GPT-2 tokenizer.
# Explicit-class alternative: GPT2Tokenizer.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained("gpt2", clean_up_tokenization_spaces=True)

# Number of token ids produced for every training document.
# Documents are encoded in full (no truncation), so the tokenizer may warn
# that some sequences exceed the model's maximum length; that is expected
# here because the ids are only counted, never fed to a model.
token_lengths = [len(tokenizer.encode(doc['text'])) for doc in train_data]

print(f"Average tokens per document: {np.mean(token_lengths)}")
print(f"Median tokens per document: {np.median(token_lengths)}")
print(f"Max tokens in a document: {np.max(token_lengths)}")
print(f"Min tokens in a document: {np.min(token_lengths)}")


Token indices sequence length is longer than the specified maximum sequence length for this model (1190 > 1024). Running this sequence through the model will result in indexing errors


Average tokens per document: 52.61460346673944
Median tokens per document: 36.0
Max tokens in a document: 75147
Min tokens in a document: 1


In [None]:
import matplotlib.pyplot as plt

# One entry per figure: (values, bar colour, title, x-axis label).
histogram_specs = [
    (word_counts, 'blue',
     'Distribution of Document Lengths (in Words)', 'Number of Words'),
    (token_lengths, 'green',
     'Distribution of Document Lengths (in Tokens)', 'Number of Tokens'),
]

# Draw each histogram on its own 10x6 figure, words first, then tokens.
for lengths, bar_color, title, x_label in histogram_specs:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(lengths, bins=50, color=bar_color, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    plt.show()


In [None]:
from collections import Counter
import nltk  # kept: `nltk.sent_tokenize` is called in a later cell
from nltk.util import ngrams

# Single pass over the corpus.
# - `all_words` (flat list of whitespace-delimited words) is still built
#   because later cells read it.
# - Bigrams/trigrams are counted per document and streamed straight into the
#   counters. Building them over the flattened word list would (a) invent
#   n-grams that straddle two unrelated documents and (b) materialise a full
#   list of every n-gram in the corpus just to count them.
all_words = []
bigram_freq = Counter()
trigram_freq = Counter()
for doc in train_data:
    doc_words = doc['text'].split()
    all_words.extend(doc_words)
    # Documents shorter than n simply contribute no n-grams.
    bigram_freq.update(ngrams(doc_words, 2))
    trigram_freq.update(ngrams(doc_words, 3))

word_freq = Counter(all_words)

# Top 20 most common words
print("Top 20 most common words:")
print(word_freq.most_common(20),"\n")

# Top 20 most common bigrams
print("Top 20 most common bigrams:")
print(bigram_freq.most_common(20),"\n")

# Top 20 most common trigrams
print("Top 20 most common trigrams:")
print(trigram_freq.most_common(20))


In [None]:
# Vocabulary size = number of distinct whitespace-delimited words counted
# in `word_freq`.
vocab_size = len(word_freq)
print(f"Vocabulary size: {vocab_size}")

# Words whose total count in the corpus is exactly one.
rare_words = [
    token
    for token, occurrences in word_freq.items()
    if occurrences == 1
]
num_rare_words = len(rare_words)
print(f"Number of rare words (appearing once): {num_rare_words}")


In [None]:
import nltk

# `nltk.sent_tokenize` needs the 'punkt_tab' resource. Download it only when
# it is not already installed, so the cell works on a fresh environment
# without re-downloading on every run.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

# Length, in whitespace-delimited words, of every sentence in every document.
sent_lengths = [
    len(sentence.split())
    for doc in train_data
    for sentence in nltk.sent_tokenize(doc['text'])
]

# Sentence length statistics
print(f"Average sentence length: {np.mean(sent_lengths)} words")
print(f"Median sentence length: {np.median(sent_lengths)} words")
print(f"Max sentence length: {np.max(sent_lengths)} words")
print(f"Min sentence length: {np.min(sent_lengths)} words")

# Histogram of sentence lengths
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(sent_lengths, bins=50, color='purple', edgecolor='black')
ax.set_title('Distribution of Sentence Lengths (in Words)')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
import re

# Count every character that is neither a word character nor whitespace.
# The counts are accumulated one document at a time instead of joining the
# whole corpus into a single string and collecting every match in a list
# first. The result is the same: the matched characters are never whitespace,
# so splitting/joining on whitespace neither adds nor removes any of them.
special_char_pattern = re.compile(r'[^\w\s]')
special_char_freq = Counter()
for doc in train_data:
    special_char_freq.update(special_char_pattern.findall(doc['text']))

print("Most common special characters:")
print(special_char_freq.most_common(20))


In [None]:
from wordcloud import WordCloud

# Build the word cloud from the whole corpus, passed as one space-joined string.
corpus_text = ' '.join(all_words)
wordcloud = WordCloud(width=800, height=400, background_color='black')
wordcloud.generate(corpus_text)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of the Training Data')
plt.show()
