In [None]:
import os
import random

import datasets
import torch
from transformers import pipeline

In [None]:
# Fetch the dataset from the Hugging Face Hub, keep its 'train' split, and
# convert that split into a pandas DataFrame for the cells below.
train_split = datasets.load_dataset('kreimben/leetcode_with_youtube_captions')['train']
df = train_split.to_pandas()

In [None]:
# Build the Hugging Face text-generation pipeline used by the cells below.
# NOTE(review): this comment previously said "Gemma 2B from Kaggle", but the
# checkpoint configured here is 'bert-large-uncased' — confirm which model is
# actually intended before running the full dataset.
#
# Use the GPU in half precision when one is available; otherwise fall back to
# the CPU in float32 so the notebook still runs on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
TORCH_DTYPE = torch.float16 if DEVICE == 'cuda' else torch.float32

pipe = pipeline(
    "text-generation",
    model='bert-large-uncased',
    model_kwargs={"torch_dtype": TORCH_DTYPE},
    device=DEVICE,
    max_new_tokens=512,
    # Access token is read from the HF_API environment variable (None if unset).
    token=os.getenv('HF_API')
)

In [None]:
# Instruction text that summarize_example places in front of every transcript
# chunk. Spelled out line by line; the leading and trailing newlines are part
# of the value.
prompt = (
    "\n"
    "You are a teacher who teaches coding tests to users.\n"
    "You should explain the algorithm for the coding test problem to the user in detail.\n"
    "separate the steps but keep it detailed.\n"
    "let's think step by step.\n"
)

In [None]:
# Number of times summarize_example has been called in this kernel session.
total_execution_time = 0

def summarize_example(text):
    """Summarize a transcript by prompting the model one chunk at a time.

    The text is cut into consecutive fixed-size character windows. Each window
    is appended to the module-level ``prompt`` and sent through the
    module-level ``pipe``; the per-chunk generations are joined with single
    spaces.

    Args:
        text: Transcript to summarize. Any value that is not a non-empty
            string (e.g. ``None`` or NaN coming from a DataFrame column)
            produces an empty summary instead of raising.

    Returns:
        str: The generated text for every chunk joined by ``" "``, or ``""``
        when there is nothing to summarize.
    """
    global total_execution_time
    # Count this call so the progress line actually advances.
    total_execution_time += 1
    print(f'Total Executed: {total_execution_time} times.')

    # Missing captions show up as None/NaN under DataFrame.apply; skip them.
    if not isinstance(text, str) or not text:
        return ""

    # Used both as the character width of each chunk and as the token cap
    # applied when measuring the prompt + chunk length.
    max_input_length = 512
    max_output_length = 512

    chunks = [text[i:i + max_input_length] for i in range(0, len(text), max_input_length)]

    summaries = []
    for i, chunk in enumerate(chunks):
        print(f'[CHUNK] {i + 1} / {len(chunks)}')
        input_text = f"{prompt}\n\n{chunk}"

        # Tokenize only to measure the (truncated) input length.
        token_ids = pipe.tokenizer(
            input_text, truncation=True, max_length=max_input_length
        )["input_ids"]

        # Budget what is left of the window for generation, but never ask for
        # zero (or negative) new tokens when the input already fills the cap.
        remaining = max_input_length - len(token_ids)
        max_new_tokens = max(1, min(max_output_length, remaining))

        outputs = pipe(
            input_text,
            do_sample=True,
            temperature=0.1,
            top_k=5,
            top_p=0.3,
            max_new_tokens=max_new_tokens,
            pad_token_id=pipe.tokenizer.eos_token_id,
            # Return only the newly generated text, not the prompt + chunk echo.
            return_full_text=False,
        )

        summaries.append(outputs[0]["generated_text"])

    return " ".join(summaries)


In [None]:
%%time

# Smoke test: summarize one randomly chosen transcript before the full run.
# Select the cc_content values of the rows whose id equals 1.
example = df[df['id'] == 1]['cc_content']

N = len(example)
if N == 0:
    raise ValueError("No rows with id == 1 in df; nothing to summarize.")

# randrange excludes N, so idx is always a valid position in example.values
# (random.randint(0, N) could return N and index past the end).
idx = random.randrange(N)

example_yt_cc = example.values[idx]

# Last expression of the cell, so the resulting summary is displayed.
summarize_example(example_yt_cc)

In [None]:
# Run summarize_example over every cc_content value, then attach the results
# to the frame as the 'summary' column.
summary_column = df['cc_content'].apply(summarize_example)
df['summary'] = summary_column