In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import torch
import gc
from transformers import GPT2Tokenizer, GPT2Model

def load_and_clean_data(file_path):
    """Read the CSV at ``file_path`` and median-impute its structured columns.

    Steps, in order:
      1. Load the file with ``pd.read_csv``.
      2. Replace ``+inf`` / ``-inf`` with ``NaN`` across the whole frame.
      3. Treat every column except ``'Combined_clean'`` (text) and
         ``'IVFLUIDS'`` (target) as a structured feature and fill its
         missing values with the column median via ``SimpleImputer``.
      4. Add an ``'ID'`` column holding the row position ``0 .. len(df) - 1``.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the input CSV.

    Returns:
        tuple: ``(df, structured_cols)`` -- the cleaned DataFrame and the list
        of structured feature column names. ``'ID'`` is added after the list
        is built, so it is not part of ``structured_cols``.
    """
    non_feature_cols = ('Combined_clean', 'IVFLUIDS')

    df = pd.read_csv(file_path)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Keep the original column order; only the text and target columns are skipped.
    structured_cols = []
    for name in df.columns:
        if name not in non_feature_cols:
            structured_cols.append(name)

    median_imputer = SimpleImputer(strategy='median')
    imputed_values = median_imputer.fit_transform(df[structured_cols])
    df[structured_cols] = imputed_values

    df['ID'] = np.arange(len(df))
    return df, structured_cols

def tokenize_texts(texts, tokenizer, max_length=128):
    """Tokenize ``texts`` into fixed-length PyTorch tensors.

    Args:
        texts: The text input handed to ``tokenizer`` as its first positional
            argument.
        tokenizer: Callable tokenizer; it is invoked with truncation enabled,
            ``padding='max_length'`` and ``return_tensors='pt'``.
        max_length: Length every sequence is truncated / padded to.

    Returns:
        Whatever ``tokenizer`` returns for these options.
    """
    encode_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    encoded = tokenizer(texts, **encode_options)
    return encoded

def get_gpt2_embeddings_in_batches(texts, model, tokenizer, batch_size=16):
    """Embed ``texts`` with a GPT-2 style model, one fixed-size batch at a time.

    Each text is represented by the attention-masked mean of the model's final
    hidden states. GPT-2 uses causal attention, so the hidden state at
    position 0 only ever sees the first token; pooling over all real
    (non-padding) tokens is what makes the vector depend on the whole text.

    Args:
        texts: Sequence of strings (must support ``len`` and slicing).
        model: Model whose call returns an object with ``last_hidden_state``
            of shape ``(batch, seq_len, hidden)``; it is run on whatever
            device its parameters already live on.
        tokenizer: Tokenizer forwarded to ``tokenize_texts``; its output must
            contain an ``'attention_mask'`` entry.
        batch_size: Number of texts per forward pass.

    Returns:
        np.ndarray of shape ``(len(texts), hidden)``. For empty ``texts`` a
        ``(0, model.config.hidden_size)`` array is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # np.vstack([]) raises, so answer the empty case explicitly.
    if len(texts) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Inputs must sit on the same device as the model weights (CPU or GPU).
    device = next(model.parameters()).device

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        tokenized = tokenize_texts(batch_texts, tokenizer)
        inputs = {key: value.to(device) for key, value in tokenized.items()}
        with torch.no_grad():
            outputs = model(**inputs)

        hidden = outputs.last_hidden_state
        # Zero out padding positions, then average over the real tokens only.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero when a text yields no tokens.
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = (hidden * mask).sum(dim=1) / token_counts

        # Move each batch to host memory right away so device memory stays bounded.
        embeddings.append(pooled.cpu().numpy())
    return np.vstack(embeddings)

# Load data from CSV.
file_path = '.../cleaned_ed_data.csv'
df, structured_cols = load_and_clean_data(file_path)

# Process structured data: standardize every structured feature column.
structured_data = df[structured_cols].values
scaler = StandardScaler()
structured_data_normalized = scaler.fit_transform(structured_data)

# Process text data and target labels.
# The text column is not imputed by load_and_clean_data, and read_csv turns
# empty cells into NaN (a float) that the tokenizer cannot handle. Coerce
# every entry to str, mapping missing values to the empty string.
text_data = df['Combined_clean'].fillna('').astype(str).tolist()
target = df['IVFLUIDS']  # Assume binary target (0 or 1)

# Initialize GPT-2 tokenizer and model (using distilgpt2 for efficiency).
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
# tokenize_texts pads to max_length, which needs a pad token; reuse EOS for it.
tokenizer.pad_token = tokenizer.eos_token
gpt2_model = GPT2Model.from_pretrained('distilgpt2')
gpt2_model.eval()  # inference only: switch off dropout

print("Extracting GPT-2 text embeddings...")
text_embeddings = get_gpt2_embeddings_in_batches(text_data, gpt2_model, tokenizer, batch_size=50)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Extracting GPT-2 text embeddings...


KeyboardInterrupt: 

In [None]:
import pickle

# Bundle the normalized structured features, the text embeddings and the
# target values into a single dict so they are written as one artifact.
data_to_save = dict(
    structured_data=structured_data_normalized,
    text_embeddings=text_embeddings,
    target=target.values,
)

# Serialize the bundle with pickle; the file is opened in binary write mode.
output_path = '.../model_input_data.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(data_to_save, f)

print(f"Saved processed data to: {output_path}")
