In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback


# Load the dataset
data = pd.read_csv(r'keyword_label.csv')

# Drop rows with a missing keyword or label: a NaN keyword turns the whole
# prompt into NaN (which the tokenizer rejects), and a NaN label would be
# trained on as the literal string "nan".
data = data.dropna(subset=['keyword', 'label']).reset_index(drop=True)

# Format the dataset
# Keywords are cast to str the same way labels are, so numeric-looking
# keywords concatenate cleanly instead of raising on `str + int`.
# NOTE(review): the prompt uses <extra_id_1> while the target starts with
# <extra_id_0>; T5 pre-training pairs the same sentinel on both sides.
# Confirm this is intentional before changing it, because the inference code
# builds prompts with the same template.
data['input_text'] = 'topic of keyword: "' + data['keyword'].astype(str) + '" is "<extra_id_1>"'
data['label'] = '<extra_id_0> "' + data['label'].astype(str) + '"'  # Ensure labels are strings

# Hold out 20% of the rows for validation; fixed seed keeps the split reproducible
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['input_text'].tolist(),
    data['label'].tolist(),
    test_size=0.2,
    random_state=42
)

# Initialize the tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Tokenize the inputs and labels (each call pads to the longest sequence in its own batch)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors="pt")
train_labels_encodings = tokenizer(train_labels, truncation=True, padding=True, return_tensors="pt")
val_labels_encodings = tokenizer(val_labels, truncation=True, padding=True, return_tensors="pt")

class KeywordDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized prompts with tokenized target labels.

    Args:
        encodings: Mapping of field name (e.g. ``input_ids``,
            ``attention_mask``) to an indexable batch of model inputs.
        labels_encodings: Mapping for the tokenized targets. Must contain
            ``input_ids``; when it also contains ``attention_mask`` and the
            label ids are tensors, padded label positions are replaced by
            ``IGNORE_INDEX`` so they do not contribute to the loss.
    """

    # Target value skipped by PyTorch's cross-entropy loss (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, encodings, labels_encodings):
        self.encodings = encodings
        self.labels_encodings = labels_encodings

    def __getitem__(self, idx):
        """Return the model inputs for row ``idx`` plus a ``labels`` entry."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        labels = self.labels_encodings['input_ids'][idx]
        # Without this, the model is also trained to predict padding tokens.
        # masked_fill returns a new tensor, so the stored encodings stay intact.
        if torch.is_tensor(labels) and 'attention_mask' in self.labels_encodings:
            pad_positions = self.labels_encodings['attention_mask'][idx] == 0
            labels = labels.masked_fill(pad_positions, self.IGNORE_INDEX)
        item['labels'] = labels
        return item

    def __len__(self):
        """Number of examples, taken from the label batch."""
        return len(self.labels_encodings['input_ids'])

# Create datasets
train_dataset = KeywordDataset(train_encodings, train_labels_encodings)
val_dataset = KeywordDataset(val_encodings, val_labels_encodings)

# Load the T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Define training arguments
# NOTE(review): no eval_steps/save_steps are given, so evaluation presumably
# falls back to logging_steps (every 10 steps) while checkpoints use the
# library's default interval. With early-stopping patience 4 and 1000 warmup
# steps, training may stop very early - confirm the cadence is intended.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

# Train the model
trainer.train()

# Save the model and tokenizer
model_save_path = r'fine_tuned_T5'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

# Quick smoke test of the fine-tuned model.
# Switch to eval mode so dropout is disabled during generation.
model.eval()
# The prompt must follow the exact template used for the training inputs,
# and the ids must live on the same device the Trainer left the model on.
input_ids = tokenizer(
    'topic of keyword: "badminton" is "<extra_id_1>"', return_tensors="pt"
).input_ids.to(model.device)
with torch.no_grad():
    outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



In [None]:
import torch
import time
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5EncoderModel
import pandas as pd

start_time = time.time()

# Specify your fine-tuned model path
model_name = r'fine_tuned_T5'  # Change this to your fine-tuned model directory

# Load the fine-tuned T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained(model_name, cache_dir='./model_cache')
model = T5ForConditionalGeneration.from_pretrained(model_name, cache_dir='./model_cache')  # Generates the topic text
encoder_model = T5EncoderModel.from_pretrained(model_name, cache_dir='./model_cache')  # T5 Encoder for embeddings

# Read the dataset
df = pd.read_csv(r'data.csv')

generated_text = []  # To store generated outputs
generated_embeddings = []  # One mean-pooled encoder vector per keyword
line = 0
print(df)

# Set the device to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Both models receive tensors placed on `device`, so both must be moved there.
model.to(device)
encoder_model.to(device)
model.eval()
encoder_model.eval()

# Loop through each keyword
for line, item in enumerate(df['keyword'], start=1):
    print(line)
    # Format the input text; str() keeps non-string cells (numbers, NaN) from
    # breaking the concatenation and keeps one output row per input row.
    input_text = 'topic of keyword: "' + str(item) + '" is "<extra_id_1>"'

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).to(device)

    # Generate the output text
    with torch.no_grad():
        generated_ids = model.generate(inputs.input_ids, max_length=50, num_beams=5, early_stopping=True)

    # Decode the generated ids to text
    generated_output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(generated_output)
    # Store the generated output
    generated_text.append(generated_output)

    # Re-encode the generated text to obtain an embedding for it
    generated_inputs = tokenizer(generated_output, return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = encoder_model(**generated_inputs)

    # Use the last hidden state and average it to get a fixed-size vector for clustering
    event_embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy().reshape(-1)
    generated_embeddings.append(event_embedding)

# Save the result to a DataFrame
data = pd.DataFrame(generated_embeddings)
# Assign positionally: embeddings were appended in row order, so this does not
# depend on df's index labels matching the new frame's 0..n-1 index.
data['timestamp'] = df['date'].to_numpy()
data.to_csv(r'T5_EmbeddingText.csv', index=False)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")