In [1]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers.optimization import AdamW

In [3]:

# Load the CSV file and create a dataset
from pandas import read_csv
corpus_file = 'sample_corpus_smartphones.csv'
corpus_data = read_csv(corpus_file)

source_texts = corpus_data['text'].tolist()
summaries = corpus_data['summary'].tolist()

In [4]:
# Initialize the BART tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# Tokenize and encode the source texts and summaries
source_encodings = tokenizer(source_texts, truncation=True, padding=True, max_length=512)
summary_encodings = tokenizer(summaries, truncation=True, padding=True, max_length=128)

# Convert the tokenized encodings to PyTorch tensors
source_ids = torch.tensor(source_encodings['input_ids'])
source_mask = torch.tensor(source_encodings['attention_mask'])
summary_ids = torch.tensor(summary_encodings['input_ids'])
summary_mask = torch.tensor(summary_encodings['attention_mask'])

# Create a PyTorch dataset
dataset = torch.utils.data.TensorDataset(source_ids, source_mask, summary_ids, summary_mask)

# Define the BART-like model architecture
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

In [5]:
# Create a data loader
batch_size = 8
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Set the device (GPU or CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Set the optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in dataloader:
        batch = [item.to(device) for item in batch]
        source_ids, source_mask, summary_ids, summary_mask = batch

        optimizer.zero_grad()

        outputs = model(input_ids=source_ids, attention_mask=source_mask, decoder_input_ids=summary_ids, decoder_attention_mask=summary_mask, labels=summary_ids)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss}")

# Save the trained model
output_dir = 'model/'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)



Epoch 1/10 - Average Loss: 9.676902770996094
Epoch 2/10 - Average Loss: 8.203815460205078
Epoch 3/10 - Average Loss: 7.764133930206299
Epoch 4/10 - Average Loss: 7.513242721557617
Epoch 5/10 - Average Loss: 6.850715160369873
Epoch 6/10 - Average Loss: 6.400395393371582
Epoch 7/10 - Average Loss: 6.245432376861572
Epoch 8/10 - Average Loss: 6.531487941741943
Epoch 9/10 - Average Loss: 6.158269882202148
Epoch 10/10 - Average Loss: 5.8197712898254395


('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.json',
 'model/merges.txt',
 'model/added_tokens.json')

In [6]:
model_path = '/content/model/'
tokenizer_path = '/content/model/'

model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = BartTokenizer.from_pretrained(tokenizer_path)

In [7]:
input_text = "The Pixel 7 is a flagship smartphone produced by Google. It boasts a sleek design with a vibrant OLED display and thin bezels, providing an immersive viewing experience. The device is powered by a high-performance processor and ample RAM, ensuring smooth multitasking and speedy performance. The Pixel 7 is known for its exceptional camera capabilities, capturing stunning photos with its advanced sensors and computational photography features. Additionally, it runs on the latest version of Android, offering a clean and intuitive user interface along with access to a wide range of apps and services from the Google Play Store."

input_encoding = tokenizer.encode_plus(input_text, truncation=True, padding=True, max_length=512, return_tensors="pt")
input_ids = input_encoding.input_ids.to(device)
attention_mask = input_encoding.attention_mask.to(device)


In [15]:
output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=18, num_beams=4, early_stopping=True)
summary = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Summary:", summary)

Generated Summary: The Pixel 7 is a flagship smartphone smartphone produced by Google. It is a
