In [2]:
import os
from dotenv import load_dotenv
import comet_ml
from transformers import GPT2Tokenizer


# Load settings from a .env file (python-dotenv), then read the Comet API key
# from the environment; the value is None when the variable is not set.
load_dotenv()
COMET_API_KEY = os.environ.get("COMET_API_KEY")

# Comet experiment tracking is currently disabled. While the lines below stay
# commented out, this cell does not define an `experiment` object.
# experiment = comet_ml.Experiment(
#     api_key=COMET_API_KEY,
#     project_name="clm",
#     log_code=True,
#     auto_metric_logging=True,
#     auto_param_logging=True,
#     auto_histogram_weight_logging=True,
#     auto_histogram_gradient_logging=True,
#     auto_histogram_activation_logging=True,
# )


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the GPT-2 tokenizer.

    The tokenizer named by the notebook-level ``params["model"]`` is loaded on
    first use and then reused, instead of being re-created on every call
    (i.e. for every batch handed over by ``Dataset.map``).

    Args:
        examples: Mapping with a ``"text"`` entry (the notebook passes batches
            from ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for ``examples["text"]``.
    """
    # Kept as a function-local import, as in the original cell -- presumably so
    # the function is self-contained inside `num_proc` workers (TODO confirm).
    from transformers import GPT2Tokenizer

    model_name = params["model"]
    # Cache per model name on the function object, so a later change of
    # params["model"] still yields the matching tokenizer.
    loaded = tokenize_function.__dict__.setdefault("_tokenizers", {})
    if model_name not in loaded:
        loaded[model_name] = GPT2Tokenizer.from_pretrained(model_name)
    return loaded[model_name](examples["text"])


def group_texts(dat, block_size = 64):
    """Concatenate tokenized examples and cut them into fixed-size blocks.

    Adapted from the Hugging Face language-modeling example script.

    Args:
        dat: Mapping from column name to a list of per-example sequences.
            It must contain ``"input_ids"``; every column is concatenated and
            chunked the same way.
        block_size: Length of every output chunk.

    Returns:
        dict with the same keys as ``dat`` whose values are lists of
        ``block_size``-long lists, plus a ``"labels"`` entry that is a
        (shallow) copy of the chunked ``"input_ids"``.

    Raises:
        IndexError: if ``dat`` has no columns.
        KeyError: if ``dat`` has no ``"input_ids"`` column.
    """
    # Local import keeps the function self-contained, mirroring the
    # function-local import style used by tokenize_function in this notebook.
    from itertools import chain

    # Flatten every column in linear time; `sum(lists, [])` re-copies the
    # accumulated list on each addition and is quadratic in the token count.
    concatenated_examples = {k: list(chain.from_iterable(dat[k])) for k in dat.keys()}
    total_length = len(concatenated_examples[list(dat.keys())[0]])
    # Drop the trailing remainder that does not fill a whole block (padding
    # would be the alternative if the model supports it).
    total_length = (total_length // block_size) * block_size
    # Split each flattened column into consecutive block_size-long chunks.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels are a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result


from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoConfig, TFAutoModelForCausalLM
from transformers import AdamWeightDecay
from transformers import DefaultDataCollator
import math
import pandas as pd




# Hyper-parameters and model selection for the run.
params = {
    "model": "gpt2",
    "epochs": 50,
    "batch_size": 32,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
}
model_checkpoint = params["model"]
tokenizer_checkpoint = params["model"]
# experiment.log_parameters(params)

# Load the tokenizer once. The original cell first built one with AutoTokenizer
# and immediately overwrote it with this GPT2Tokenizer, so only this binding
# ever survived the cell.
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_checkpoint)

# Load the podcast transcripts and keep only episodes whose title mentions
# "deepmind" (case-insensitive).
dataset = load_dataset("Whispering-GPT/lex-fridman-podcast")
filtered_dataset = dataset.filter(lambda x: "deepmind" in x["title"].lower())

print(dataset["train"])
print(filtered_dataset["train"])

# Tokenize the transcripts and drop the listed source columns.
tokenized_datasets = filtered_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=['id', 'channel', 'channel_id', 'title', 'categories', 'tags', 'description', 'text', 'segments'],
)

# Concatenate the token streams and chunk them into fixed-size training blocks.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

# Full (unfiltered) training split as a DataFrame for exploratory inspection.
df_test = pd.DataFrame( dataset['train'] )

df_test.head()

Found cached dataset parquet (/home/starscream/.cache/huggingface/datasets/Whispering-GPT___parquet/Whispering-GPT--lex-fridman-podcast-f9b59d9d94797791/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|███████████████████████████████████████████████| 1/1 [00:00<00:00, 399.72it/s]
Loading cached processed dataset at /home/starscream/.cache/huggingface/datasets/Whispering-GPT___parquet/Whispering-GPT--lex-fridman-podcast-f9b59d9d94797791/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-aad5ef54e8d74c4d.arrow
num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.
Loading cached processed dataset at /home/starscream/.cache/huggingface/datasets/Whispering-GPT___parquet/Whispering-GPT--lex-fridman-podcast-f9b59d9d94797791/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-fe132911464cec1a_*_of_00003.arrow
num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.
Loading cached processed

Dataset({
    features: ['id', 'channel', 'channel_id', 'title', 'categories', 'tags', 'description', 'text', 'segments'],
    num_rows: 346
})
Dataset({
    features: ['id', 'channel', 'channel_id', 'title', 'categories', 'tags', 'description', 'text', 'segments'],
    num_rows: 3
})


Unnamed: 0,id,channel,channel_id,title,categories,tags,description,text,segments
0,TRdL6ZzWBS0,Lex Fridman,UCSHZKyawb77ixDdsGog4iWA,Jed Buchwald: Isaac Newton and the Philosophy ...,[Science & Technology],"[agi, ai, ai podcast, artificial intelligence,...",Jed Buchwald is a historian and philosopher of...,The following is a conversation with Jed Buck...,"[{'start': 0.0, 'end': 7.68, 'text': ' The fol..."
1,TPXTmVdlyoc,Lex Fridman,UCSHZKyawb77ixDdsGog4iWA,"Sergey Nazarov: Chainlink, Smart Contracts, an...",[Science & Technology],"[agi, ai, ai podcast, artificial intelligence,...","Sergey Nazarov is the Co-Founder of Chainlink,...",The following is a conversation with Sergey N...,"[{'start': 0.0, 'end': 6.5, 'text': ' The foll..."
2,-t1_ffaFXao,Lex Fridman,UCSHZKyawb77ixDdsGog4iWA,Stephen Wolfram: Fundamental Theory of Physics...,[Science & Technology],"[stephen wolfram, artificial intelligence, agi...","Stephen Wolfram is a computer scientist, mathe...",The following is a conversation with Stephen ...,"[{'start': 0.0, 'end': 4.48, 'text': ' The fol..."
3,BCdV6BMMpOo,Lex Fridman,UCSHZKyawb77ixDdsGog4iWA,"Philip Goff: Consciousness, Panpsychism, and t...",[Science & Technology],"[agi, ai, ai podcast, artificial intelligence,...",Philip Goff is a philosopher of mind and consc...,I believe our official scientific worldview i...,"[{'start': 0.0, 'end': 5.2, 'text': ' I believ..."
4,Kedt2or9xlo,Lex Fridman,UCSHZKyawb77ixDdsGog4iWA,"Oriol Vinyals: DeepMind AlphaStar, StarCraft, ...",[Science & Technology],[],,The following is a conversation with Ariol Vi...,"[{'start': 0.0, 'end': 3.2800000000000002, 'te..."


In [None]:
# Build the model from its configuration only: `from_config` creates the
# architecture without loading pretrained weights (see the Transformers docs).
config = AutoConfig.from_pretrained(model_checkpoint)
gpt2 = TFAutoModelForCausalLM.from_config(config)
learning_rate = params['learning_rate']
weight_decay = params['weight_decay']

# No loss is passed to compile(); per the library's own log message the
# model's internal loss computation is used.
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
gpt2.compile(optimizer=optimizer)

data_collator = DefaultDataCollator(return_tensors="tf")

train_set = lm_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels"],
    shuffle=True,
    batch_size=params['batch_size'],
    collate_fn=data_collator,)

gpt2.fit(train_set, epochs=params['epochs'])
# NOTE(review): this evaluates on the same dataset that was used for training,
# so "eval_loss" is a training-set loss, not a held-out metric.
eval_loss = gpt2.evaluate(train_set)
# `experiment` only exists when Comet tracking has been enabled. Guard the call
# so that a finished training run is not lost to a NameError before the
# perplexity is reported.
comet_experiment = globals().get("experiment")
if comet_experiment is not None:
    comet_experiment.log_metrics({"eval_loss":eval_loss})
print(f"Perplexity: {math.exp(eval_loss):.2f}")

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


Epoch 1/50


2023-04-02 13:14:48.561229: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [1080]
	 [[{{node Placeholder/_0}}]]
2023-04-02 13:14:48.561508: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [1080]
	 [[{{node Placeholder/_0}}]]
2023-04-02 13:14:50.382235: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 154389504 exceeds 10% of free system memory.
2023-04-02 13:14:50.448887: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 154389504 exceeds 10% of free system memory.
2023-04-02 13:14:50.466463: W tens

 4/34 [==>...........................] - ETA: 7:16 - loss: 10.5472

In [14]:
# Tag frequency across episodes: one row per (episode id, tag), then tally the
# 50 most common tags. A descriptive name is used instead of `_`, which IPython
# reserves for the last cell output; selecting the two columns already yields a
# new frame, so copying the whole DataFrame first is unnecessary.
episode_tags = df_test[['id', 'tags']].explode('tags')
episode_tags['tags'].value_counts().head(50)

lex podcast                        276
agi                                275
lex ai                             275
mit ai                             275
lex mit                            275
lex fridman                        275
lex jre                            275
artificial intelligence            275
artificial intelligence podcast    275
ai podcast                         275
ai                                 275
physics                             12
deep learning                       12
elon musk                           11
russia                              10
consciousness                       10
bitcoin                              9
stanford                             9
twitter                              8
aliens                               7
ukraine                              7
machine learning                     7
war                                  7
putin                                6
space                                6
mit                      

In [17]:
# Keywords of interest -- presumably matched against episode tags in a later
# cell (TODO confirm; the use is not visible here).
source_list = [
    "agi",
    "ai",
    "intelligence",
    "learning",
    "consciousness",
    "robotics",
    "psychology",
    "evolution",
]