<a href="https://colab.research.google.com/github/jbloewencolon/Psychedelic-Trip-Generator/blob/main/GPT_2_Text_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import math
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler
from nltk.tokenize import sent_tokenize
from joblib import load
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler

# Choose the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))


There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1070


In [None]:
# Load data
# Reads the modeled CSV into `df`, the frame every later cell works from.
# NOTE(review): the path is absolute and machine-specific; it must be adjusted
# before this notebook can run anywhere else.
df = pd.read_csv('D:/Cloud/Google Drive/Colab Notebooks/Data/modeled.csv')

In [None]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67516 entries, 0 to 67515
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              67516 non-null  object 
 1   drug               67516 non-null  object 
 2   dosage             67516 non-null  object 
 3   delivery           67516 non-null  object 
 4   weight             67516 non-null  float64
 5   year               67516 non-null  int64  
 6   gender             67516 non-null  object 
 7   report             67516 non-null  object 
 8   processed_report   67516 non-null  object 
 9   mixed              67516 non-null  int64  
 10  drug_category      67516 non-null  object 
 11  report_embeddings  67516 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 6.2+ MB


In [None]:
# Load the pretrained base GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-text token as the padding token so that padded encoding
# (padding="max_length" in GPT2Dataset) has a pad token to use.
# Presumably needed because the stock GPT-2 tokenizer defines none - confirm.
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained base GPT-2 language model and move it to the selected device.
# As the last expression of the cell, the call also displays the model architecture.
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Register one '<drug>' token per distinct drug so every report can be tagged
# with the substance it describes.
unique_drugs = df['drug'].unique()

# Create a dictionary mapping each drug to its corresponding special token
special_tokens_dict = {drug: f'<{drug}>' for drug in unique_drugs}

# The token list is derived from the mapping so the two can never drift apart.
special_tokens = list(special_tokens_dict.values())
tokenizer.add_tokens(special_tokens)

# Prefix each report with the token of its drug (vectorised instead of a
# row-wise apply). Rows that already start with their prefix are left alone,
# so re-running this cell does not stack a second token onto every report.
drug_prefix = df['drug'].map(special_tokens_dict) + ' '
already_tagged = pd.Series(
    [report.startswith(prefix) for report, prefix in zip(df['report'], drug_prefix)],
    index=df.index,
)
df['report'] = df['report'].where(already_tagged, drug_prefix + df['report'])

In [None]:
# Resize the model's token-embedding matrix to the tokenizer's current length,
# so the drug tokens added to the tokenizer get embedding rows of their own.
model.resize_token_embeddings(len(tokenizer))
# Keep the resulting vocabulary size around for later cells.
vocab_size = len(tokenizer)

In [None]:
class GPT2Dataset(Dataset):
    """Map-style dataset of pre-tokenized texts.

    Each text is tokenized once at construction time, truncated/padded to
    ``max_length``, and stored as a pair of tensors (token ids, attention mask).
    """

    def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
        """Tokenize every entry of ``txt_list`` with ``tokenizer``.

        ``gpt2_type`` is accepted but not used by this class.
        """
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        # Encode one text at a time so only a single raw encoding is alive at once.
        for text in txt_list:
            encoded = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
            ids, mask = (torch.tensor(encoded[key]) for key in ('input_ids', 'attention_mask'))
            self.input_ids.append(ids)
            self.attn_masks.append(mask)

    def __len__(self):
        """Number of encoded texts."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensors of item ``idx``."""
        sample = (self.input_ids[idx], self.attn_masks[idx])
        return sample

In [None]:
# Function to generate text
def generate_text(drug, length=500):
    """Sample one continuation of a fixed prompt about `drug` from the global model.

    Args:
        drug: Name inserted into the prompt and passed to
            `get_category_from_embedding` to obtain its category.
        length: Passed to `model.generate` as `max_length` (prompt tokens included).

    Returns:
        The decoded text of the single generated sequence (prompt included).

    The model is put in eval mode for the duration of the call, so dropout does
    not perturb sampling when this is called in the middle of training, and its
    previous train/eval mode is restored afterwards.
    """
    category = get_category_from_embedding(drug)  # Function to categorize the drug based on BigBird embeddings
    input_str = f"The experience of using {drug}, which belongs to the {category} category, is like..."
    inputs = tokenizer.encode(input_str, return_tensors='pt').to(device)
    # A single, unpadded prompt: every position is a real token.
    attention_mask = torch.ones_like(inputs)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                inputs,
                attention_mask=attention_mask,
                max_length=length,
                num_return_sequences=1,
                do_sample=True,
                # Explicit pad id avoids generate() having to guess one.
                pad_token_id=tokenizer.pad_token_id,
            )
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)
    return tokenizer.decode(outputs[0])

In [None]:
# Tokenize every report up front (truncated/padded to 768 tokens).
reports = df.report.values
dataset = GPT2Dataset(reports, tokenizer, max_length=768)

# 90/10 train/validation split
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split so the same rows land in train/validation on every run.
# Without it, re-running the notebook reshuffles membership, so validation
# numbers are not comparable between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

# NOTE(review): worker processes need to import/pickle the dataset class; if the
# loaders hang or fail inside a notebook, try num_workers = 0.
num_workers = 4
batch_size = 16

# Training batches are drawn in random order, validation batches in a fixed order.
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size, num_workers=num_workers)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size, num_workers=num_workers)

In [None]:
# Persist the tokenizer (including the added drug tokens) next to the data.
# As the last expression, the call displays the paths of the files it wrote.
tokenizer.save_pretrained('D:/Cloud/Google Drive/Colab Notebooks/Data/')

('D:/Cloud/Google Drive/Colab Notebooks/Data/tokenizer_config.json',
 'D:/Cloud/Google Drive/Colab Notebooks/Data/special_tokens_map.json',
 'D:/Cloud/Google Drive/Colab Notebooks/Data/vocab.json',
 'D:/Cloud/Google Drive/Colab Notebooks/Data/merges.txt',
 'D:/Cloud/Google Drive/Colab Notebooks/Data/added_tokens.json')

In [None]:
# Define number of epochs
epochs = 3

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Learning rate scheduler: linear decay to zero over all optimisation steps, no warm-up
total_batches = len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_batches * epochs)

# Report progress roughly every 10% of an epoch; max() keeps the modulus
# non-zero when the dataloader has fewer than 10 batches.
print_every = max(1, total_batches // 10)

# Training loop
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch_idx, batch in enumerate(train_dataloader):
        inputs, masks = batch
        inputs, masks = inputs.to(device), masks.to(device)

        # Padding reuses the EOS token id, so the ids alone cannot tell padding
        # from text. Label -100 at masked-out positions makes the loss ignore them.
        labels = inputs.masked_fill(masks == 0, -100)

        # Forward pass
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        running_loss += loss.item()

        # Print how much of the data has been processed every 10%
        if batch_idx % print_every == 0:
            processed_percentage = 100 * (batch_idx + 1) / total_batches
            print(f'\rProcessed: {processed_percentage:.2f}% of data in epoch {epoch + 1}', end='')

    # Finish the in-place progress line before printing the epoch summary.
    print()

    # Print training loss (mean over the epoch's batches) and generate sample text
    avg_train_loss = running_loss / max(1, total_batches)
    print(f'Epoch {epoch + 1}: Training Loss: {avg_train_loss}')
    # NOTE(review): generate_text treats its first argument as a drug name;
    # "Sample prompt" is presumably a placeholder - confirm a real drug is not needed here.
    sample_text = generate_text("Sample prompt", length=50)
    print(f"Generated Text: {sample_text}")

# Save the trained model
torch.save(model.state_dict(), 'D:/Cloud/Google Drive/Colab Notebooks/Data/trip_reports_model.pth')

Processed: 0.01% of data in epoch 1
Processed: 9.99% of data in epoch 1


KeyboardInterrupt: 

In [None]:
# Validation loop
model.eval()
total_eval_loss = 0
eval_steps = 0
with torch.no_grad():
    for batch in validation_dataloader:
        inputs, masks = batch
        inputs, masks = inputs.to(device), masks.to(device)
        # Exclude padding positions from the loss (label -100 is ignored), so the
        # reported loss/perplexity describe real text rather than padding.
        labels = inputs.masked_fill(masks == 0, -100)
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_eval_loss += loss.item()
        eval_steps += 1

if eval_steps == 0:
    raise ValueError("validation_dataloader yielded no batches; cannot compute a validation loss")

# Mean of the per-batch losses
avg_val_loss = total_eval_loss / eval_steps
perplexity = math.exp(avg_val_loss)  # Compute perplexity from the average loss

print(f'Validation Loss: {avg_val_loss}')
print(f'Validation Perplexity: {perplexity}')

In [None]:
def generate_report(drug, desired_length_min=300, desired_length_max=500):
    """Generate a report text for `drug` and trim it to a word budget.

    Args:
        drug: Key looked up in the stored BigBird embeddings and passed on to
            `generate_text`.
        desired_length_min: Whole sentences are collected until at least this
            many words are gathered (or the generated text runs out).
        desired_length_max: Upper bound on the number of words returned; a longer
            result is cut at this many words.

    Returns:
        The trimmed text as a single string.
    """
    import pickle  # function-scope import: pickle is not among the notebook's top-level imports

    # Step 1: Preprocess
    output_file = 'D:/Cloud/Google Drive/Colab Notebooks/Data/bigbird_embeddings.joblib'
    bigbird_embeddings = load(output_file)
    # Not used further yet. Presumably this lookup also rejects unknown drugs
    # (a dict would raise KeyError) - confirm against the stored object.
    embedding = bigbird_embeddings[drug]

    # Step 2: Generate text with GPT-2
    text = generate_text(drug)

    # Step 3: Evaluate with RFC (optional)
    # Load the trained classifier.
    # NOTE(review): the original comment said "Random Forest" but the file is
    # named xgb_model.pkl - confirm which model this is.
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # model files you produced yourself.
    with open("D:/Cloud/Google Drive/Colab Notebooks/Data/xgb_model.pkl", "rb") as f:
        rfc_model = pickle.load(f)
    vectorized_text = tfidf_vectorizer.transform([text])
    category_prediction = rfc_model.predict(vectorized_text)
    # Validate or modify text based on category prediction if needed

    # Step 4: Post-process text
    sentences = sent_tokenize(text)

    # Collect whole sentences until the desired minimum word count is reached
    kept_sentences = []
    word_count = 0
    for sentence in sentences:
        kept_sentences.append(sentence)
        word_count += len(sentence.split())
        if word_count >= desired_length_min:
            break
    processed_text = " ".join(kept_sentences)

    # If the collected text exceeds the desired maximum length, truncate it.
    # The cap is applied to the sentence-trimmed text, not the raw generation.
    words = processed_text.split()
    if len(words) > desired_length_max:
        processed_text = " ".join(words[:desired_length_max])

    return processed_text
