In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import torch
import gc
from transformers import GPT2Tokenizer, GPT2Model

def load_and_clean_data(file_path):
    """Read the CSV at ``file_path`` and median-impute its structured columns.

    Infinite values are converted to NaN first so the imputer treats them as
    missing. Every column except 'Combined_clean' (text) and 'IVFLUIDS'
    (target) is considered structured and is overwritten with the output of a
    median ``SimpleImputer`` fitted on the whole frame. An 'ID' column holding
    the row position is appended afterwards, so it is not part of the returned
    column list.

    Returns:
        tuple: (DataFrame, list of structured column names).
    """
    excluded = ('Combined_clean', 'IVFLUIDS')
    frame = pd.read_csv(file_path).replace([np.inf, -np.inf], np.nan)
    structured_cols = [name for name in frame.columns if name not in excluded]
    median_imputer = SimpleImputer(strategy='median')
    frame[structured_cols] = median_imputer.fit_transform(frame[structured_cols])
    frame['ID'] = np.arange(len(frame))
    return frame, structured_cols

def tokenize_texts(texts, tokenizer, max_length=128):
    """Call ``tokenizer`` on ``texts`` with fixed-length padding and truncation.

    Args:
        texts: Value passed straight through as the tokenizer's first argument.
        tokenizer: Callable accepting the keyword options set below.
        max_length: Length every sequence is truncated / padded to.

    Returns:
        Whatever ``tokenizer`` returns (PyTorch tensors are requested via
        ``return_tensors='pt'``).
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **tokenizer_options)

def get_gpt2_embeddings_in_batches(texts, model, tokenizer, batch_size=16):
    """Embed ``texts`` with a GPT-2 style model, one fixed-size batch at a time.

    Each document is represented by the mean of its last-layer hidden states
    over the real (non-padding) tokens. GPT-2 attention is causal, so the hidden
    state at position 0 can only see the first token and is not a usable
    sentence representation; pooling over all attended positions is.

    Args:
        texts: Sequence of documents. ``None`` / NaN entries are treated as "".
        model: Model whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer forwarded to ``tokenize_texts``.
        batch_size: Number of documents per forward pass.

    Returns:
        np.ndarray of shape (len(texts), hidden_size). Documents without any
        real token (e.g. empty strings) get an all-zero row.
    """
    # Missing values coming from pandas (None / float NaN) cannot be tokenized.
    clean_texts = ["" if (t is None or t != t) else str(t) for t in texts]

    if not clean_texts:
        # np.vstack([]) raises; return an empty matrix with the right width instead.
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Inputs must live on the same device as the model weights.
    device = next(model.parameters()).device

    embeddings = []
    for start in range(0, len(clean_texts), batch_size):
        batch_texts = clean_texts[start:start + batch_size]
        tokenized = tokenize_texts(batch_texts, tokenizer)
        tokenized = {key: value.to(device) for key, value in tokenized.items()}
        with torch.no_grad():
            outputs = model(**tokenized)

        hidden = outputs.last_hidden_state
        keep = tokenized['attention_mask'].unsqueeze(-1).bool()
        # masked_fill (rather than multiplying by the mask) also neutralises any
        # non-finite values the model may emit at padded positions.
        summed = hidden.masked_fill(~keep, 0.0).sum(dim=1)
        # clamp avoids 0/0 for documents that consist of padding only.
        token_counts = keep.sum(dim=1).clamp(min=1).to(hidden.dtype)
        embeddings.append((summed / token_counts).cpu().numpy())

    return np.vstack(embeddings)

# --- GPT-2 feature pipeline (top-level cell script) ---
# Loads the CSV, standardizes the structured columns and embeds the free text
# with distilgpt2. The names assigned here (structured_data_normalized,
# text_embeddings, target) are module-level globals read by the save cell below.

# Load data from CSV.
file_path = '.../cleaned_ed_data.csv'
df, structured_cols = load_and_clean_data(file_path)

# Process structured data.
# The scaler is fitted on every row of the file (no train/test split happens here).
structured_data = df[structured_cols].values
scaler = StandardScaler()
structured_data_normalized = scaler.fit_transform(structured_data)

# Process text data and target labels.
# NOTE(review): 'Combined_clean' is not imputed by load_and_clean_data, so it may
# still contain missing values when it reaches the tokenizer — confirm.
text_data = df['Combined_clean'].tolist()
target = df['IVFLUIDS']  # Assume binary target (0 or 1)

# Initialize GPT-2 tokenizer and model (using distilgpt2 for efficiency).
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
# tokenize_texts pads to a fixed length, so a pad token must be set; EOS is reused.
tokenizer.pad_token = tokenizer.eos_token
gpt2_model = GPT2Model.from_pretrained('distilgpt2')
# Inference only: switch the model to evaluation mode.
gpt2_model.eval()

print("Extracting GPT-2 text embeddings...")
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# text_embeddings may be undefined when the next cell is executed.
text_embeddings = get_gpt2_embeddings_in_batches(text_data, gpt2_model, tokenizer, batch_size=50)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Extracting GPT-2 text embeddings...


KeyboardInterrupt: 

In [None]:
import pickle

# Bundle the arrays produced by the GPT-2 cell above into one pickle file.
# Relies on the globals structured_data_normalized, text_embeddings and target
# already existing in the kernel; target is stored as a plain array (.values).
data_to_save = {
    'structured_data': structured_data_normalized,
    'text_embeddings': text_embeddings,
    'target': target.values
}

output_path = '.../model_input_data.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(data_to_save, f)

print(f"Saved processed data to: {output_path}")

## CountVectorizer

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
import re
import gc

def load_and_clean_data(file_path):
    """Load the CSV and fill gaps in its structured columns with column medians.

    Steps: read ``file_path``; turn +/-inf into NaN; treat every column other
    than 'Combined_clean' (text) and 'IVFLUIDS' (target) as structured and
    replace those columns with the output of a median ``SimpleImputer`` fitted
    on the full frame; finally add an 'ID' column with the row position.

    Returns:
        tuple: (DataFrame, list of structured column names). 'ID' is added after
        the list is built and is therefore not included in it.
    """
    df = pd.read_csv(file_path)
    # +/-inf become NaN so the imputer below handles them like any other gap.
    df = df.replace([np.inf, -np.inf], np.nan)
    non_structured = {'Combined_clean', 'IVFLUIDS'}
    structured_cols = [col for col in df.columns if col not in non_structured]
    df[structured_cols] = SimpleImputer(strategy='median').fit_transform(df[structured_cols])
    df['ID'] = np.arange(len(df))
    return df, structured_cols

def preprocess_text(text):
    """Normalize one raw text value for bag-of-words extraction.

    Missing values (per ``pd.isna``) yield "". Anything else is stringified and
    lower-cased, every character that is neither a word character nor
    whitespace becomes a space, and whitespace runs collapse to single spaces.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    return ' '.join(without_punctuation.split())

def extract_bow_features(texts, max_features=1000, ngram_range=(1, 2), min_df=2, max_df=0.95):
    """
    Extract Bag of Words features using CountVectorizer

    Parameters:
    - texts: Iterable of raw documents; each one is passed through preprocess_text
    - max_features: Maximum number of features to keep
    - ngram_range: Range of n-grams to consider (1,1) for unigrams, (1,2) for unigrams+bigrams
    - min_df: Ignore terms that appear in fewer than min_df documents
    - max_df: Ignore terms that appear in more than max_df proportion of documents

    Returns:
    - bow_features: Dense count matrix, one row per document
    - vectorizer: The fitted CountVectorizer
    - feature_names: Array of feature names in column order
    """
    print("Preprocessing text data...")
    processed_texts = [preprocess_text(text) for text in texts]

    print("Extracting BOW features with CountVectorizer...")
    print(f"Parameters: max_features={max_features}, ngram_range={ngram_range}")

    vectorizer = CountVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        stop_words='english',
        min_df=min_df,
        max_df=max_df,
        lowercase=True,
        token_pattern=r'\b[a-zA-Z]{2,}\b'  # Only words with 2+ letters
    )

    # Fit and transform, then densify for the downstream np.hstack.
    bow_matrix = vectorizer.fit_transform(processed_texts)
    bow_features = bow_matrix.toarray()

    # Get feature names for interpretability
    feature_names = vectorizer.get_feature_names_out()

    # feature_names follows column order, not frequency, so slicing the first
    # ten names does not give the most common terms. Rank by total count instead.
    term_totals = bow_features.sum(axis=0)
    top_columns = np.argsort(-term_totals, kind='stable')[:10]

    print(f"BOW feature matrix shape: {bow_features.shape}")
    print(f"Vocabulary size: {len(feature_names)}")
    print(f"Most common features: {feature_names[top_columns].tolist()}")

    # Memory cleanup: the sparse matrix is no longer needed once densified.
    del bow_matrix
    gc.collect()

    return bow_features, vectorizer, feature_names

# --- CountVectorizer feature pipeline (top-level cell script) ---
# Rebinds the same global names as the GPT-2 cell (df, structured_data_normalized,
# text_embeddings, target, ...) with bag-of-words features and pickles the result.

# Load data from CSV
file_path = 'cleaned_ed_data.csv'  # Update your path here
df, structured_cols = load_and_clean_data(file_path)

# Process structured data
# The scaler is fitted on every row of the file (no train/test split happens here).
structured_data = df[structured_cols].values
scaler = StandardScaler()
structured_data_normalized = scaler.fit_transform(structured_data)

# Process text data and target labels
text_data = df['Combined_clean'].tolist()
target = df['IVFLUIDS']  # Assume binary target (0 or 1)

print("Extracting CountVectorizer text features...")
print(f"Dataset shape: {df.shape}")
print(f"Number of text samples: {len(text_data)}")
print("-" * 50)

# Extract BOW features - you can adjust these parameters based on your needs
# The result is stored under the name text_embeddings although it holds raw counts.
text_embeddings, bow_vectorizer, feature_names = extract_bow_features(
    text_data,
    max_features=500,    # Adjust based on your needs (500-2000 is common)
    ngram_range=(1, 2),  # Include both unigrams and bigrams
    min_df=2,           # Must appear in at least 2 documents
    max_df=0.95         # Must appear in less than 95% of documents
)

print(f"Final text embeddings shape: {text_embeddings.shape}")
print(f"Structured data shape: {structured_data_normalized.shape}")

# Combine structured and text features
# Column-wise concatenation: scaled structured columns first, then BOW counts.
combined_features = np.hstack([structured_data_normalized, text_embeddings])
print(f"Combined feature matrix shape: {combined_features.shape}")

print("\nCountVectorizer feature extraction complete!")
print("Ready for model training...")

# Optional: Show most important features by frequency
print(f"\nTop 10 most frequent terms:")
feature_sums = np.sum(text_embeddings, axis=0)
# argsort is ascending: take the last ten columns and reverse for descending order.
top_indices = np.argsort(feature_sums)[-10:][::-1]
for i, idx in enumerate(top_indices):
    print(f"{i+1}. {feature_names[idx]}: {feature_sums[idx]:.0f} occurrences")

# Save processed data using pickle
import pickle

# combined_features is not stored; only its two parts are, together with the
# fitted vectorizer and scaler so the same transforms can be reapplied later.
data_to_save = {
    'structured_data': structured_data_normalized,
    'text_embeddings': text_embeddings,
    'target': target.values,
    'bow_vectorizer': bow_vectorizer,
    'scaler': scaler,
    'feature_names': feature_names,
    'structured_cols': structured_cols
}

output_path = 'countvectorizer_model_input_data.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(data_to_save, f)

print(f"\nSaved CountVectorizer processed data to: {output_path}")
print("Saved components:")
print("- structured_data: Normalized structured features")
print("- text_embeddings: CountVectorizer BOW features")
print("- target: Target variable")
print("- bow_vectorizer: Fitted CountVectorizer model")
print("- scaler: Fitted StandardScaler")
print("- feature_names: BOW feature names")
print("- structured_cols: Original structured column names")

Extracting CountVectorizer text features...
Dataset shape: (13115, 44)
Number of text samples: 13115
--------------------------------------------------
Preprocessing text data...
Extracting BOW features with CountVectorizer...
Parameters: max_features=500, ngram_range=(1, 2)
BOW feature matrix shape: (13115, 500)
Vocabulary size: 500
Most common features: ['abdominal', 'abdominal pain', 'abnormal', 'abnormal drug', 'abnormal pigmentation', 'abnormal pulsations', 'abnormal sensation', 'abnormalities', 'abuse', 'accident']
Final text embeddings shape: (13115, 500)
Structured data shape: (13115, 41)
Combined feature matrix shape: (13115, 541)

CountVectorizer feature extraction complete!
Ready for model training...

Top 10 most frequent terms:
1. pain: 8691 occurrences
2. soreness: 3277 occurrences
3. ache: 3034 occurrences
4. pain ache: 3034 occurrences
5. ache soreness: 2965 occurrences
6. nos: 2531 occurrences
7. unspecified: 2521 occurrences
8. discomfort: 2247 occurrences
9. soreness

## Word2Vec

In [None]:
# %pip installs into the running kernel's environment; the version is pinned to
# the release this notebook was last executed with so re-runs stay reproducible.
%pip install -q gensim==4.3.3

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import re
import gc
import multiprocessing

def load_and_clean_data(file_path):
    """Read the CSV, neutralise infinities and median-impute structured columns.

    'Combined_clean' (text) and 'IVFLUIDS' (target) are left untouched; every
    other column is replaced by the output of a median ``SimpleImputer`` fitted
    on the full frame. An 'ID' column with the row position is added last, so
    it does not appear in the returned column list.

    Returns:
        tuple: (DataFrame, list of structured column names).
    """
    raw = pd.read_csv(file_path)
    cleaned = raw.replace([np.inf, -np.inf], np.nan)

    structured_cols = []
    for column in cleaned.columns:
        if column in ('Combined_clean', 'IVFLUIDS'):
            continue
        structured_cols.append(column)

    imputed_values = SimpleImputer(strategy='median').fit_transform(cleaned[structured_cols])
    cleaned[structured_cols] = imputed_values
    cleaned['ID'] = np.arange(len(cleaned))
    return cleaned, structured_cols

def preprocess_text_for_w2v(text):
    """Normalize one raw text value before Word2Vec tokenization.

    Missing values (per ``pd.isna``) yield "". Otherwise the value is
    stringified and lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, and whitespace runs are
    collapsed to single spaces. Digits are deliberately left in place here.
    """
    if pd.isna(text):
        return ""
    spaced = re.sub(r'[^\w\s]', ' ', str(text).lower())
    pieces = spaced.split()
    return ' '.join(pieces)

def extract_word2vec_features(texts, vector_size=100, window=5, min_count=2,
                             epochs=10, sg=0, negative=5):
    """
    Extract Word2Vec features by training model and averaging word embeddings

    Parameters:
    - texts: Iterable of raw documents; each one is passed through preprocess_text_for_w2v
    - vector_size: Dimensionality of the word vectors (100-300 is common)
    - window: Maximum distance between current and predicted word
    - min_count: Ignores words that appear fewer than this many times
    - epochs: Number of training epochs
    - sg: Training algorithm (0=CBOW, 1=Skip-gram)
    - negative: Number of negative samples for negative sampling

    Returns:
    - w2v_features: Array with one row per input document, in input order.
      Documents with no in-vocabulary token get an all-zero row.
    - w2v_model: The trained Word2Vec model
    """
    print("Preprocessing text data for Word2Vec...")

    # Preprocess texts
    processed_texts = [preprocess_text_for_w2v(text) for text in texts]

    # Tokenize every document exactly once and keep the result aligned with the
    # input order, so each embedding row can be matched to its own tokens.
    document_tokens = [simple_preprocess(text, deacc=True, min_len=2, max_len=50)
                       for text in processed_texts]

    # Empty documents are excluded from training only; they still get an output row.
    training_corpus = [tokens for tokens in document_tokens if tokens]

    print("Training Word2Vec model...")
    print(f"Parameters: vector_size={vector_size}, window={window}, min_count={min_count}")
    print(f"Number of documents: {len(training_corpus)}")

    # Calculate total vocabulary for info
    unique_words = len({word for tokens in training_corpus for word in tokens})
    print(f"Total unique words before filtering: {unique_words}")

    # Train Word2Vec model
    # NOTE(review): gensim's documentation says a fixed seed is only fully
    # reproducible with a single worker thread — confirm whether that matters here.
    cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec(
        sentences=training_corpus,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=cores,
        sg=sg,  # 0 = CBOW, 1 = Skip-gram
        epochs=epochs,
        negative=negative,
        seed=42,
        compute_loss=True
    )

    print(f"Vocabulary size after filtering: {len(w2v_model.wv.key_to_index)}")
    print(f"Training loss: {w2v_model.get_latest_training_loss():.2f}")

    def get_document_vector(tokens, model, vector_size):
        """Average the vectors of all in-vocabulary tokens; zeros if there are none."""
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            return np.mean(vectors, axis=0)
        return np.zeros(vector_size)

    print("Generating document embeddings...")

    # One row per original document. The previous index-vs-filtered-length check
    # zeroed out the last documents of the dataset regardless of their content.
    w2v_features = np.array([
        get_document_vector(tokens, w2v_model, vector_size)
        for tokens in document_tokens
    ])

    print(f"Word2Vec feature matrix shape: {w2v_features.shape}")

    # Show some model statistics
    if len(w2v_model.wv.key_to_index) > 0:
        # Most similar words to common medical terms (if they exist in vocabulary)
        test_words = ['patient', 'pain', 'blood', 'treatment', 'hospital', 'diagnosis']
        print("\nSample word similarities:")
        for word in test_words:
            if word in w2v_model.wv:
                try:
                    similar = w2v_model.wv.most_similar(word, topn=3)
                    print(f"Words similar to '{word}': {[w for w, s in similar]}")
                except Exception as exc:
                    # Purely informational output: report the problem, keep going.
                    print(f"Could not compute similarities for '{word}': {exc}")
                break  # Just show one example

    # Memory cleanup
    gc.collect()

    return w2v_features, w2v_model

# --- Word2Vec feature pipeline (top-level cell script) ---
# Rebinds the shared global names (df, structured_data_normalized,
# text_embeddings, target, ...) with Word2Vec document vectors and pickles them.

# Load data from CSV
file_path = 'cleaned_ed_data.csv'  # Update your path here
df, structured_cols = load_and_clean_data(file_path)

# Process structured data
# The scaler is fitted on every row of the file (no train/test split happens here).
structured_data = df[structured_cols].values
scaler = StandardScaler()
structured_data_normalized = scaler.fit_transform(structured_data)

# Process text data and target labels
text_data = df['Combined_clean'].tolist()
target = df['IVFLUIDS']  # Assume binary target (0 or 1)

print("Extracting Word2Vec text features...")
print(f"Dataset shape: {df.shape}")
print(f"Number of text samples: {len(text_data)}")
print("-" * 50)

# Extract Word2Vec features - you can adjust these parameters
text_embeddings, w2v_model = extract_word2vec_features(
    text_data,
    vector_size=100,    # Dimensionality (50-300 common, 100 is good balance)
    window=5,          # Context window size
    min_count=2,       # Minimum word frequency
    epochs=10,         # Training epochs (more = better quality but slower)
    sg=0,             # 0=CBOW (faster), 1=Skip-gram (better for rare words)
    negative=5        # Negative sampling parameter
)

print(f"Final text embeddings shape: {text_embeddings.shape}")
print(f"Structured data shape: {structured_data_normalized.shape}")

# Combine structured and text features
# Column-wise concatenation: scaled structured columns first, then document vectors.
combined_features = np.hstack([structured_data_normalized, text_embeddings])
print(f"Combined feature matrix shape: {combined_features.shape}")

print("\nWord2Vec feature extraction complete!")
print("Ready for model training...")

# Optional: Save the trained Word2Vec model for future use
# w2v_model.save("medical_word2vec_model.bin")
# print("Word2Vec model saved!")

# Optional: Analyze embedding quality
# Row-wise L2 norms; rows that are exactly zero are counted separately below.
print(f"\nEmbedding statistics:")
print(f"Mean embedding magnitude: {np.mean(np.linalg.norm(text_embeddings, axis=1)):.3f}")
print(f"Std embedding magnitude: {np.std(np.linalg.norm(text_embeddings, axis=1)):.3f}")
print(f"Zero embeddings (empty docs): {np.sum(np.all(text_embeddings == 0, axis=1))}")

# Save processed data using pickle
import pickle

# combined_features is not stored; only its two parts are. The trained model is
# pickled inside this dict and additionally saved on its own further down.
data_to_save = {
    'structured_data': structured_data_normalized,
    'text_embeddings': text_embeddings,
    'target': target.values,
    'w2v_model': w2v_model,
    'scaler': scaler,
    'structured_cols': structured_cols
}

output_path = 'word2vec_model_input_data.pkl'
with open(output_path, 'wb') as f:
    pickle.dump(data_to_save, f)

print(f"\nSaved Word2Vec processed data to: {output_path}")
print("Saved components:")
print("- structured_data: Normalized structured features")
print("- text_embeddings: Word2Vec document embeddings")
print("- target: Target variable")
print("- w2v_model: Trained Word2Vec model")
print("- scaler: Fitted StandardScaler")
print("- structured_cols: Original structured column names")

# Alternative: Save Word2Vec model separately for reuse
w2v_model.save("medical_word2vec_model.bin")
print(f"\nAlso saved Word2Vec model separately to: medical_word2vec_model.bin")

Extracting Word2Vec text features...
Dataset shape: (13115, 44)
Number of text samples: 13115
--------------------------------------------------
Preprocessing text data for Word2Vec...
Training Word2Vec model...
Parameters: vector_size=100, window=5, min_count=2
Number of documents: 13104
Total unique words before filtering: 1096
Vocabulary size after filtering: 955
Training loss: 558055.69
Generating document embeddings...
Word2Vec feature matrix shape: (13115, 100)

Sample word similarities:
Words similar to 'patient': ['spokesperson', 'refused', 'care']
Final text embeddings shape: (13115, 100)
Structured data shape: (13115, 41)
Combined feature matrix shape: (13115, 141)

Word2Vec feature extraction complete!
Ready for model training...

Embedding statistics:
Mean embedding magnitude: 4.456
Std embedding magnitude: 1.014
Zero embeddings (empty docs): 22

Saved Word2Vec processed data to: word2vec_model_input_data.pkl
Saved components:
- structured_data: Normalized structured featur

In [None]:
import pickle
import os
import numpy as np
from pathlib import Path

def save_data_robust(data_dict, output_path, backup=True):
    """
    Robust data saving with error checking and backup

    The payload is pickled to a temporary sibling file and read back to verify
    it. Only after that succeeds is an existing ``output_path`` moved to
    ``<name>_backup<ext>`` (when ``backup`` is true) and the temporary file
    moved into place, so a failed save never destroys the previous file.

    Args:
        data_dict: Picklable object to store.
        output_path: Destination path (str or path-like), any extension.
        backup: Keep the previous file as ``<name>_backup<ext>`` if it exists.

    Returns:
        bool: True on success, False if anything went wrong (the error is printed).
    """
    output_path = os.fspath(output_path)
    # splitext instead of str.replace('.pkl', ...): works for any extension and
    # can never produce a backup path equal to the target itself.
    root, ext = os.path.splitext(output_path)
    backup_path = f"{root}_backup{ext}"
    tmp_path = f"{output_path}.tmp"

    try:
        # Save with explicit protocol and error checking
        print(f"Saving data to: {output_path}")
        with open(tmp_path, 'wb') as f:
            pickle.dump(data_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

        # Test loading to verify integrity before touching the existing file
        print("Verifying file integrity...")
        with open(tmp_path, 'rb') as f:
            pickle.load(f)

        # Create backup if file exists (os.replace overwrites an older backup)
        if backup and os.path.exists(output_path):
            os.replace(output_path, backup_path)
            print(f"Created backup: {backup_path}")

        os.replace(tmp_path, output_path)

        file_size = os.path.getsize(output_path)
        print(f"File saved successfully. Size: {file_size / (1024*1024):.2f} MB")
        print("✓ File verification successful!")
        return True

    except Exception as e:
        print(f"Error saving data: {str(e)}")
        # Best-effort cleanup of the partial temporary file.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass
        return False

def load_data_robust(file_path):
    """
    Robust data loading with error handling

    Tries ``file_path`` first. If it is missing or cannot be unpickled, falls
    back to ``<name>_backup<ext>`` next to it (the name save_data_robust uses
    for ``.pkl`` files).

    SECURITY: pickle.load can execute arbitrary code. Only use this on files
    produced by this notebook, never on files from an untrusted source.

    Args:
        file_path: Path (str or path-like) of the pickle to load.

    Returns:
        The unpickled object, or None if neither file could be loaded.
    """
    file_path = os.fspath(file_path)
    root, ext = os.path.splitext(file_path)
    backup_path = f"{root}_backup{ext}"

    if os.path.exists(file_path):
        try:
            file_size = os.path.getsize(file_path)
            print(f"Loading file: {file_path} (Size: {file_size / (1024*1024):.2f} MB)")

            with open(file_path, 'rb') as f:
                data = pickle.load(f)

            print("✓ Data loaded successfully!")
            # Only dict payloads have keys; anything else is still a valid load.
            if isinstance(data, dict):
                print(f"Keys in loaded data: {list(data.keys())}")

            return data

        except Exception as e:
            print(f"Error loading data: {str(e)}")
    else:
        print(f"File not found: {file_path}")

    # Try backup file if it exists. This also covers a missing primary file,
    # e.g. after an interrupted save left only the backup behind.
    if os.path.exists(backup_path):
        print(f"Trying backup file: {backup_path}")
        try:
            with open(backup_path, 'rb') as f:
                data = pickle.load(f)
            print("✓ Backup file loaded successfully!")
            return data
        except Exception as e2:
            print(f"Backup file also corrupted: {str(e2)}")

    return None

# For CountVectorizer implementation
def save_countvectorizer_data(structured_data_normalized, text_embeddings, target,
                             bow_vectorizer, scaler, feature_names, structured_cols):
    """Bundle the CountVectorizer artefacts and persist them via save_data_robust.

    The payload holds the two feature matrices, the target (unwrapped through
    ``.values`` when available), the fitted vectorizer and scaler, the feature
    and column names, plus a small 'metadata' dict with shapes and counts. It
    is written to 'countvectorizer_model_input_data.pkl'.

    Returns:
        The success flag returned by save_data_robust.
    """
    target_values = target.values if hasattr(target, 'values') else target
    metadata = {
        'structured_shape': structured_data_normalized.shape,
        'text_shape': text_embeddings.shape,
        'n_samples': len(target),
        'vocab_size': len(feature_names)
    }
    data_to_save = {
        'structured_data': structured_data_normalized,
        'text_embeddings': text_embeddings,
        'target': target_values,
        'bow_vectorizer': bow_vectorizer,
        'scaler': scaler,
        'feature_names': feature_names,
        'structured_cols': structured_cols,
        'metadata': metadata
    }

    success = save_data_robust(data_to_save, 'countvectorizer_model_input_data.pkl')
    if not success:
        return success

    # Summary is printed only after a confirmed save.
    print("\n✓ CountVectorizer data saved successfully!")
    print("Saved components:")
    print(f"- structured_data: {metadata['structured_shape']}")
    print(f"- text_embeddings: {metadata['text_shape']}")
    print(f"- target: {metadata['n_samples']} samples")
    print(f"- vocabulary: {metadata['vocab_size']} features")
    return success

# For Word2Vec implementation
def save_word2vec_data(structured_data_normalized, text_embeddings, target,
                      w2v_model, scaler, structured_cols):
    """Bundle the Word2Vec artefacts and persist them via save_data_robust.

    The payload holds the two feature matrices, the target (unwrapped through
    ``.values`` when available), the trained model, the scaler, the structured
    column names and a 'metadata' dict with shapes and vocabulary size. It is
    written to 'word2vec_model_input_data.pkl'. After a successful save the
    model is also stored on its own as 'medical_word2vec_model.bin'; a failure
    of that second step only prints a warning.

    Returns:
        The success flag returned by save_data_robust.
    """
    target_values = target.values if hasattr(target, 'values') else target
    metadata = {
        'structured_shape': structured_data_normalized.shape,
        'text_shape': text_embeddings.shape,
        'n_samples': len(target),
        'vector_size': text_embeddings.shape[1],
        'vocab_size': len(w2v_model.wv.key_to_index)
    }
    data_to_save = {
        'structured_data': structured_data_normalized,
        'text_embeddings': text_embeddings,
        'target': target_values,
        'w2v_model': w2v_model,
        'scaler': scaler,
        'structured_cols': structured_cols,
        'metadata': metadata
    }

    success = save_data_robust(data_to_save, 'word2vec_model_input_data.pkl')
    if not success:
        return success

    print("\n✓ Word2Vec data saved successfully!")
    print("Saved components:")
    print(f"- structured_data: {metadata['structured_shape']}")
    print(f"- text_embeddings: {metadata['text_shape']}")
    print(f"- target: {metadata['n_samples']} samples")
    print(f"- w2v vocabulary: {metadata['vocab_size']} words")

    # Secondary, best-effort copy of the model in gensim's own format.
    try:
        w2v_model.save("medical_word2vec_model.bin")
        print("- w2v_model: Also saved separately as medical_word2vec_model.bin")
    except Exception as e:
        print(f"Warning: Could not save separate W2V model: {e}")

    return success

# Loading functions
def load_countvectorizer_data():
    """Return the CountVectorizer bundle loaded by load_data_robust (None on failure)."""
    bundle_path = 'countvectorizer_model_input_data.pkl'
    return load_data_robust(bundle_path)

def load_word2vec_data():
    """Return the Word2Vec bundle loaded by load_data_robust (None on failure)."""
    bundle_path = 'word2vec_model_input_data.pkl'
    return load_data_robust(bundle_path)

# Example usage:
# Smoke test for the two loaders: load each bundle and report what it contains.
# cv_data and w2v_data are assigned at module level when this guard is true.
if __name__ == "__main__":
    # Example for loading data
    print("Attempting to load CountVectorizer data...")
    cv_data = load_countvectorizer_data()

    # load_data_robust signals failure by returning None rather than raising.
    if cv_data is not None:
        print("CountVectorizer data loaded successfully!")
        print(f"Available keys: {list(cv_data.keys())}")
        # 'metadata' is only written by the save_*_data helpers in this cell;
        # bundles pickled directly by the pipeline cells do not contain it.
        if 'metadata' in cv_data:
            print(f"Metadata: {cv_data['metadata']}")
    else:
        print("Failed to load CountVectorizer data.")

    print("\n" + "="*50)

    print("Attempting to load Word2Vec data...")
    w2v_data = load_word2vec_data()

    if w2v_data is not None:
        print("Word2Vec data loaded successfully!")
        print(f"Available keys: {list(w2v_data.keys())}")
        if 'metadata' in w2v_data:
            print(f"Metadata: {w2v_data['metadata']}")
    else:
        print("Failed to load Word2Vec data.")

Attempting to load CountVectorizer data...
Loading file: countvectorizer_model_input_data.pkl (Size: 54.25 MB)
✓ Data loaded successfully!
Keys in loaded data: ['structured_data', 'text_embeddings', 'target', 'bow_vectorizer', 'scaler', 'feature_names', 'structured_cols']
CountVectorizer data loaded successfully!
Available keys: ['structured_data', 'text_embeddings', 'target', 'bow_vectorizer', 'scaler', 'feature_names', 'structured_cols']

Attempting to load Word2Vec data...
Loading file: word2vec_model_input_data.pkl (Size: 14.98 MB)
✓ Data loaded successfully!
Keys in loaded data: ['structured_data', 'text_embeddings', 'target', 'w2v_model', 'scaler', 'structured_cols']
Word2Vec data loaded successfully!
Available keys: ['structured_data', 'text_embeddings', 'target', 'w2v_model', 'scaler', 'structured_cols']


In [None]:
import pickle
import numpy as np

# Solution 1: Check what keys are actually available
# Read-only inspection of the CountVectorizer bundle; any failure is printed, not raised.
print("Checking available keys in your saved data...")

try:
    with open("countvectorizer_model_input_data.pkl", "rb") as f:
        data = pickle.load(f)

    print("Available keys:", list(data.keys()))
    print("\nData shapes:")
    for key, value in data.items():
        # Arrays report their shape, other sized objects their length,
        # everything else (e.g. fitted estimators) just its type.
        if hasattr(value, 'shape'):
            print(f"- {key}: {value.shape}")
        elif hasattr(value, '__len__'):
            print(f"- {key}: length {len(value)}")
        else:
            print(f"- {key}: {type(value)}")

except Exception as e:
    print(f"Error loading file: {e}")

# Solution 2: Load data without IDs (create them manually)
print("\n" + "="*50)
print("Loading data and creating IDs manually...")

try:
    # The same file is read a second time so this section can run on its own.
    with open("countvectorizer_model_input_data.pkl", "rb") as f:
        data = pickle.load(f)

    # These assignments rebind the notebook-level globals of the same names;
    # target is now whatever was stored under 'target' in the file.
    structured_data_normalized = data["structured_data"]
    text_embeddings = data["text_embeddings"]
    target = data["target"]

    # Create IDs manually based on the number of samples
    n_samples = len(target)
    id_array = np.arange(n_samples)

    print("✓ Data loaded successfully!")
    print(f"- Structured data shape: {structured_data_normalized.shape}")
    print(f"- Text embeddings shape: {text_embeddings.shape}")
    print(f"- Target shape: {target.shape}")
    print(f"- Created IDs: {id_array.shape} (0 to {n_samples-1})")

    # Optional: Save the data again with IDs included
    print("\nAdding IDs to saved data...")
    data["ids"] = id_array

    # NOTE(review): this overwrites the original pickle in place, without a
    # backup and with pickle's default protocol.
    with open("countvectorizer_model_input_data.pkl", "wb") as f:
        pickle.dump(data, f)

    print("✓ Updated pickle file with IDs included")

except Exception as e:
    print(f"Error: {e}")

# Solution 3: Updated saving function that includes IDs
def save_countvectorizer_data_with_ids(structured_data_normalized, text_embeddings, target,
                                      bow_vectorizer, scaler, feature_names, structured_cols, ids=None):
    """Save CountVectorizer processed data including IDs

    Args:
        structured_data_normalized: Array of scaled structured features.
        text_embeddings: Array of text features.
        target: Target values; ``.values`` is stored when the object has it.
        bow_vectorizer: Fitted vectorizer, stored as-is.
        scaler: Fitted scaler, stored as-is.
        feature_names: Sequence of text feature names.
        structured_cols: Sequence of structured column names.
        ids: Optional sequence with one id per sample (list or array).
            Defaults to 0..n_samples-1.

    Returns:
        bool: True if 'countvectorizer_model_input_data.pkl' was written,
        False otherwise (the reason is printed).
    """
    n_samples = len(target)

    # Create IDs if not provided; otherwise normalise to an array so that both
    # lists and arrays are accepted and ``ids.shape`` below is always valid.
    ids = np.arange(n_samples) if ids is None else np.asarray(ids)

    # Refuse to write a bundle whose ids cannot be matched row-for-row.
    if ids.ndim == 0 or ids.shape[0] != n_samples:
        print(f"Error saving data: expected {n_samples} ids, got shape {ids.shape}")
        return False

    data_to_save = {
        'structured_data': structured_data_normalized,
        'text_embeddings': text_embeddings,
        'target': target.values if hasattr(target, 'values') else target,
        'ids': ids,  # Include IDs
        'bow_vectorizer': bow_vectorizer,
        'scaler': scaler,
        'feature_names': feature_names,
        'structured_cols': structured_cols,
        'metadata': {
            'structured_shape': structured_data_normalized.shape,
            'text_shape': text_embeddings.shape,
            'n_samples': n_samples,
            'vocab_size': len(feature_names)
        }
    }

    output_path = 'countvectorizer_model_input_data.pkl'

    try:
        with open(output_path, 'wb') as f:
            pickle.dump(data_to_save, f, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        print(f"Error saving data: {e}")
        return False

    # Reporting happens outside the try block so that a problem while printing
    # can no longer turn a successful save into a False return value.
    print(f"✓ Data saved successfully to {output_path}")
    print("Saved components:")
    print(f"- structured_data: {data_to_save['metadata']['structured_shape']}")
    print(f"- text_embeddings: {data_to_save['metadata']['text_shape']}")
    print(f"- target: {data_to_save['metadata']['n_samples']} samples")
    print(f"- ids: {ids.shape}")
    print(f"- vocabulary: {data_to_save['metadata']['vocab_size']} features")

    return True

# Solution 4: Load data with proper error handling
def load_data_with_ids(file_path):
    """Unpickle ``file_path`` and make sure the result carries an 'ids' entry.

    When the loaded mapping has no 'ids' key, one is added holding
    0..len(data['target'])-1. The file on disk is not modified.

    SECURITY: pickle.load can execute arbitrary code; only use trusted files.

    Returns:
        The loaded mapping, or None if loading or id creation raised
        (the error is printed).
    """
    try:
        with open(file_path, "rb") as handle:
            payload = pickle.load(handle)

        if 'ids' in payload:
            return payload

        # Older bundles were written without ids: derive positional ones.
        print("Warning: 'ids' not found in saved data. Creating them...")
        sample_count = len(payload['target'])
        payload['ids'] = np.arange(sample_count)
        print(f"Created IDs: 0 to {sample_count-1}")
        return payload

    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# Example usage:
# Loads the CountVectorizer bundle through load_data_with_ids and rebinds the
# notebook-level globals (structured_data_normalized, text_embeddings, target,
# id_array) from it.
print("\n" + "="*50)
print("Example: Loading data with automatic ID creation")

data = load_data_with_ids("countvectorizer_model_input_data.pkl")

# load_data_with_ids returns None on any failure instead of raising.
if data is not None:
    structured_data_normalized = data["structured_data"]
    text_embeddings = data["text_embeddings"]
    target = data["target"]
    id_array = data["ids"]

    print("✓ All data loaded successfully including IDs!")
    print(f"ID range: {id_array.min()} to {id_array.max()}")
else:
    print("Failed to load data")

Checking available keys in your saved data...
Available keys: ['structured_data', 'text_embeddings', 'target', 'bow_vectorizer', 'scaler', 'feature_names', 'structured_cols']

Data shapes:
- structured_data: (13115, 41)
- text_embeddings: (13115, 500)
- target: (13115,)
- bow_vectorizer: <class 'sklearn.feature_extraction.text.CountVectorizer'>
- scaler: <class 'sklearn.preprocessing._data.StandardScaler'>
- feature_names: (500,)
- structured_cols: length 41

Loading data and creating IDs manually...
✓ Data loaded successfully!
- Structured data shape: (13115, 41)
- Text embeddings shape: (13115, 500)
- Target shape: (13115,)
- Created IDs: (13115,) (0 to 13114)

Adding IDs to saved data...
✓ Updated pickle file with IDs included

Example: Loading data with automatic ID creation
✓ All data loaded successfully including IDs!
ID range: 0 to 13114


In [None]:
import pickle
import numpy as np

# Solution 1: Check what keys are actually available in Word2Vec data
print("Checking available keys in Word2Vec saved data...")

try:
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only open pickle files that you produced yourself.
    with open("word2vec_model_input_data.pkl", "rb") as f:
        data = pickle.load(f)

    print("Available keys:", list(data.keys()))
    print("\nData shapes:")
    for key, value in data.items():
        if hasattr(value, 'shape'):
            # Array-like objects: report their shape.
            print(f"- {key}: {value.shape}")
        elif hasattr(value, '__len__'):
            try:
                print(f"- {key}: length {len(value)}")
            except Exception:
                # len() can still fail on objects that merely expose __len__;
                # fall back to the type. `Exception` (not a bare except) so
                # KeyboardInterrupt / SystemExit are not swallowed.
                print(f"- {key}: {type(value)}")
        else:
            # Opaque objects (models, scalers, ...): report the type only.
            print(f"- {key}: {type(value)}")

except Exception as e:
    print(f"Error loading Word2Vec file: {e}")

# Solution 2: Load Word2Vec data without IDs (create them manually)
# This script loads the Word2Vec pickle, derives positional IDs from the
# target length, reports the shapes, and then rewrites the same pickle file
# with an 'ids' entry added. Any failure is printed instead of raised.
print("\n" + "="*50)
print("Loading Word2Vec data and creating IDs manually...")

try:
    with open("word2vec_model_input_data.pkl", "rb") as f:
        data = pickle.load(f)

    # Pull the saved pieces into notebook-level names.
    structured_data_normalized = data["structured_data"]
    text_embeddings = data["text_embeddings"]
    target = data["target"]

    # Create IDs manually based on the number of samples
    n_samples = len(target)
    id_array = np.arange(n_samples)

    print("✓ Word2Vec data loaded successfully!")
    print(f"- Structured data shape: {structured_data_normalized.shape}")
    print(f"- Text embeddings shape: {text_embeddings.shape}")
    print(f"- Target shape: {target.shape}")
    print(f"- Created IDs: {id_array.shape} (0 to {n_samples-1})")

    # Optional: Save the data again with IDs included
    # Note: this assigns data["ids"] unconditionally, so an 'ids' entry that
    # was already in the file is replaced by the positional IDs.
    print("\nAdding IDs to Word2Vec saved data...")
    data["ids"] = id_array

    # Opening with "wb" overwrites the file that was just read.
    with open("word2vec_model_input_data.pkl", "wb") as f:
        pickle.dump(data, f)

    print("✓ Updated Word2Vec pickle file with IDs included")

except Exception as e:
    # Best-effort cell: report the problem and let the notebook continue.
    print(f"Error: {e}")

# Solution 3: Updated Word2Vec saving function that includes IDs
def save_word2vec_data_with_ids(structured_data_normalized, text_embeddings, target,
                               w2v_model, scaler, structured_cols, ids=None,
                               output_path='word2vec_model_input_data.pkl',
                               model_path="medical_word2vec_model.bin"):
    """Pickle the Word2Vec model inputs, including per-sample IDs.

    Args:
        structured_data_normalized: Array exposing ``.shape``; stored as
            'structured_data'.
        text_embeddings: 2-D array exposing ``.shape``; stored as
            'text_embeddings' (``shape[1]`` is recorded as 'vector_size').
        target: Labels; if the object has a ``.values`` attribute that is
            stored, otherwise the object itself.
        w2v_model: Object exposing ``wv.key_to_index`` (used for the vocabulary
            size) and, optionally, ``save(path)``.
        scaler: Stored as-is under 'scaler'.
        structured_cols: Stored as-is under 'structured_cols'.
        ids: Optional sample IDs. Any array-like is accepted and converted with
            ``np.asarray``; when ``None``, ``0..len(target)-1`` is used.
        output_path: Destination of the pickle file. Defaults to the file name
            that was previously hard-coded.
        model_path: Destination for the separately saved Word2Vec model.
            Defaults to the file name that was previously hard-coded.

    Returns:
        ``True`` if the pickle file was written, ``False`` otherwise (the error
        is printed). Failing to save the separate model only prints a warning.
    """
    # Coerce to an ndarray so list/tuple IDs work too. Previously a list made
    # the summary print fail on `ids.shape` *after* the file had been written,
    # so the function reported an error and returned False despite saving.
    ids = np.arange(len(target)) if ids is None else np.asarray(ids)

    data_to_save = {
        'structured_data': structured_data_normalized,
        'text_embeddings': text_embeddings,
        'target': target.values if hasattr(target, 'values') else target,
        'ids': ids,  # Include IDs
        'w2v_model': w2v_model,
        'scaler': scaler,
        'structured_cols': structured_cols,
        'metadata': {
            'structured_shape': structured_data_normalized.shape,
            'text_shape': text_embeddings.shape,
            'n_samples': len(target),
            'vector_size': text_embeddings.shape[1],
            'vocab_size': len(w2v_model.wv.key_to_index)
        }
    }

    try:
        with open(output_path, 'wb') as f:
            pickle.dump(data_to_save, f, protocol=pickle.HIGHEST_PROTOCOL)

        metadata = data_to_save['metadata']
        print(f"✓ Word2Vec data saved successfully to {output_path}")
        print("Saved components:")
        print(f"- structured_data: {metadata['structured_shape']}")
        print(f"- text_embeddings: {metadata['text_shape']}")
        print(f"- target: {metadata['n_samples']} samples")
        print(f"- ids: {ids.shape}")
        print(f"- w2v vocabulary: {metadata['vocab_size']} words")

        # Also save Word2Vec model separately; this is optional, so a failure
        # here is downgraded to a warning and does not affect the return value.
        try:
            w2v_model.save(model_path)
            print(f"- w2v_model: Also saved separately as {model_path}")
        except Exception as e:
            print(f"Warning: Could not save separate W2V model: {e}")

        return True

    except Exception as e:
        print(f"Error saving Word2Vec data: {e}")
        return False

# Solution 4: Load Word2Vec data with proper error handling
def load_word2vec_data_with_ids(file_path):
    """Unpickle Word2Vec model inputs and make sure an 'ids' entry exists.

    If the loaded dict has no 'ids' key, positional IDs ``0..n-1`` are built
    from ``len(data['target'])`` and added to the returned dict. The file on
    disk is left as it is.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The loaded dict with an 'ids' key, or ``None`` when any step fails
        (the error is printed instead of raised).
    """
    try:
        with open(file_path, "rb") as source:
            loaded = pickle.load(source)

        # Already complete: hand the dict back untouched.
        if 'ids' in loaded:
            return loaded

        print("Warning: 'ids' not found in Word2Vec saved data. Creating them...")
        n_samples = len(loaded['target'])
        loaded['ids'] = np.arange(n_samples)
        print(f"Created IDs: 0 to {n_samples-1}")
        return loaded

    except Exception as e:
        print(f"Error loading Word2Vec data: {e}")
        return None

# Solution 5: Universal function for both CountVectorizer and Word2Vec
def load_any_model_data_with_ids(file_path):
    """Universal loader that works for both CountVectorizer and Word2Vec data.

    Loads a pickled dict from ``file_path``. If it has no 'ids' key, positional
    IDs ``0..n-1`` are created from ``len(data['target'])`` and the file is
    updated on disk so later loads find them.

    The write-back is best-effort and atomic: the new content goes to a
    temporary sibling file which then replaces the original in one step, so a
    failed write can no longer truncate or corrupt the source file. A failed
    write-back only prints a warning; the loaded data (with IDs) is still
    returned.

    Args:
        file_path: Path of the pickle file to read (and possibly update).

    Returns:
        The loaded dict with an 'ids' key, or ``None`` if the file could not
        be loaded or lacks a 'target' entry (the error is printed).
    """
    import os
    import shutil

    try:
        # NOTE(security): pickle.load can execute arbitrary code while loading;
        # only use this on files you produced yourself.
        with open(file_path, "rb") as f:
            data = pickle.load(f)

        print(f"✓ Loaded {file_path}")
        print(f"Available keys: {list(data.keys())}")

        # Check if IDs exist, create if not
        if 'ids' not in data:
            print("Warning: 'ids' not found. Creating them...")
            n_samples = len(data['target'])
            data['ids'] = np.arange(n_samples)
            print(f"Created IDs: 0 to {n_samples-1}")

            # Save back with IDs via a temp file in the same directory, so the
            # final os.replace is a same-filesystem rename.
            base_path = os.fspath(file_path)
            tmp_path = base_path + (b".tmp" if isinstance(base_path, bytes) else ".tmp")
            try:
                with open(tmp_path, "wb") as f:
                    pickle.dump(data, f)
                # Keep the original file's permission bits on the replacement.
                shutil.copymode(base_path, tmp_path)
                os.replace(tmp_path, base_path)
                print(f"✓ Updated {file_path} with IDs")
            except Exception as e:
                # The data itself loaded fine; failing to persist the IDs must
                # not turn a successful load into a None result.
                print(f"Warning: could not update {file_path} with IDs: {e}")
                try:
                    os.remove(tmp_path)
                except OSError:
                    # Temp file was never created or is already gone.
                    pass

        return data

    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None

# Example usage for Word2Vec:
# Banner and headline are emitted together; the output is the same two lines.
print("\n" + "="*50, "Example: Loading Word2Vec data with automatic ID creation", sep="\n")

w2v_data = load_word2vec_data_with_ids("word2vec_model_input_data.pkl")

# The loader signals failure by returning None, so handle that case first.
if w2v_data is None:
    print("Failed to load Word2Vec data")
else:
    # Unpack the saved pieces into notebook-level names.
    structured_data_normalized = w2v_data["structured_data"]
    text_embeddings = w2v_data["text_embeddings"]
    target = w2v_data["target"]
    id_array = w2v_data["ids"]
    w2v_model = w2v_data["w2v_model"]

    print("✓ All Word2Vec data loaded successfully including IDs!")
    print(f"ID range: {id_array.min()} to {id_array.max()}")
    print(f"Word2Vec vocabulary size: {len(w2v_model.wv.key_to_index)}")

# Quick fix for immediate use:
# The snippet below is only printed for copy-pasting; it is not executed here.
print("\n" + "="*50, "QUICK FIX - Use this code directly:", sep="\n")
print("""
# For Word2Vec data:
import pickle
import numpy as np

with open("word2vec_model_input_data.pkl", "rb") as f:
    data = pickle.load(f)

structured_data_normalized = data["structured_data"]
text_embeddings = data["text_embeddings"]
target = data["target"]
id_array = np.arange(len(target))  # Create IDs manually

print("✓ Word2Vec data loaded with manually created IDs!")
""")

Checking available keys in Word2Vec saved data...
Available keys: ['structured_data', 'text_embeddings', 'target', 'w2v_model', 'scaler', 'structured_cols']

Data shapes:
- structured_data: (13115, 41)
- text_embeddings: (13115, 100)
- target: (13115,)
- w2v_model: <class 'gensim.models.word2vec.Word2Vec'>
- scaler: <class 'sklearn.preprocessing._data.StandardScaler'>
- structured_cols: length 41

Loading Word2Vec data and creating IDs manually...
✓ Word2Vec data loaded successfully!
- Structured data shape: (13115, 41)
- Text embeddings shape: (13115, 100)
- Target shape: (13115,)
- Created IDs: (13115,) (0 to 13114)

Adding IDs to Word2Vec saved data...
✓ Updated Word2Vec pickle file with IDs included

Example: Loading Word2Vec data with automatic ID creation
✓ All Word2Vec data loaded successfully including IDs!
ID range: 0 to 13114
Word2Vec vocabulary size: 955

QUICK FIX - Use this code directly:

# For Word2Vec data:
import pickle
import numpy as np

with open("word2vec_model_inp