In [None]:
import numpy as np
import scanpy as sc
from sklearn.model_selection import train_test_split
from geneformer import TranscriptomeTokenizer
import scipy.sparse as sp
import datetime
from geneformer import Classifier
from collections import Counter
import os
import matplotlib.pyplot as plt
import pandas as pd
import os
import urllib.request
import pickle



In [None]:
import os
import requests

# Source URL of the pretrained weights and the local destination.
# NOTE(review): only model.safetensors is fetched here; loading this directory
# as a model usually also needs its config.json — confirm it is present.
url = "https://huggingface.co/ctheodoris/Geneformer/resolve/main/gf-6L-30M-i2048/model.safetensors"
output_dir = "./gf-6L-30M-i2048/"
output_file = os.path.join(output_dir, "model.safetensors")

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Download only when the final file is missing.
if not os.path.exists(output_file):
    print(f"Downloading {output_file}...")
    # Write to a side file first so an interrupted download never leaves a
    # truncated model.safetensors that a later run would mistake for complete.
    partial_file = output_file + ".part"
    # The context manager releases the connection; the timeout prevents the
    # cell from hanging forever on a stalled connection.
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code == 200:
            with open(partial_file, "wb") as f:
                # Stream in 1 MiB chunks instead of buffering the whole
                # weights file in memory via response.content.
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
            os.replace(partial_file, output_file)
            print("Download completed.")
        else:
            print(f"Failed to download file: HTTP {response.status_code}")
else:
    print(f"File already exists: {output_file}")


In [None]:
# Remote folder holding the 30M-series gene dictionaries and the files to fetch.
base_url = "https://huggingface.co/ctheodoris/Geneformer/resolve/main/geneformer/gene_dictionaries_30m/"
files = [
    "ensembl_mapping_dict_gc30M.pkl",
    "gene_median_dictionary_gc30M.pkl",
    "gene_name_id_dict_gc30M.pkl",
    "token_dictionary_gc30M.pkl"
]

output_dir = "./gene_dictionaries_30m"
os.makedirs(output_dir, exist_ok=True)

for file in files:
    output_file = os.path.join(output_dir, file)
    if os.path.exists(output_file):
        print(f"{file} already exists.")
        continue

    print(f"Downloading {file}...")
    # Download to a side file and rename on success, so an interrupted
    # transfer cannot leave a truncated pickle that later runs would skip
    # as "already exists".
    partial_file = output_file + ".part"
    urllib.request.urlretrieve(base_url + file, partial_file)
    os.replace(partial_file, output_file)
    print(f"Downloaded {file}")


In [None]:
from collections import Counter
import matplotlib.pyplot as plt

def plot_cell_type_distribution(cell_types):
    """Plot the share of each class as a bar chart and print the raw counts.

    Args:
        cell_types: Iterable of hashable class labels, one entry per cell.

    Returns:
        None. The figure is shown with ``plt.show()`` and one
        ``"<label>: <count>"`` line per class is printed.
    """
    counts = Counter(cell_types)
    n_total = sum(counts.values())

    # Bar positions and heights, in the Counter's insertion order.
    labels = []
    fractions = []
    for label, count in counts.items():
        labels.append(label)
        fractions.append(count / n_total)

    plt.figure(figsize=(12, 6))
    plt.bar(labels, fractions)

    plt.xlabel('Cell Types', fontsize=12)
    plt.ylabel('Relative Frequency', fontsize=12)
    plt.title('Cell Type Distribution (Relative Frequencies)', fontsize=14)

    # Slanted tick labels keep long class names legible.
    plt.xticks(rotation=45, ha='right', fontsize=10)

    # Absolute counts (not the plotted fractions) are printed for verification.
    for label, count in counts.items():
        print(f"{label}: {count}")

    plt.tight_layout()
    plt.show()


In [None]:
# Read the pickled container from disk and take its first (flattened) element.
# SECURITY: allow_pickle=True can execute arbitrary code while loading; only
# use it on files from a trusted source.
cell_file = "data/cells.npy"
cells_container = np.load(cell_file, allow_pickle=True)
cells = cells_container.ravel()[0]

# Pull out the three fields used by the rest of the notebook.
umi_matrix = cells["UMI"]  # presumably a scipy sparse matrix (it exposes .toarray())
expressions = umi_matrix.toarray()  # dense expression matrix (n_cells x n_genes per the original note)
gene_names, cell_types = cells["gene_ids"], cells["classes"]

# Overview of the class balance before any subsampling.
plot_cell_type_distribution(cell_types)

In [None]:
# Fraction of cells kept by the stratified subsample.
# NOTE(review): the earlier comments in this cell said 10%, but the value in
# use was 0.01 (1%). The runtime value is kept unchanged; set it to 0.1 if
# 10% was the intent.
SUBSAMPLE_FRACTION = 0.01

# Create a DataFrame for stratified sampling
cell_df = pd.DataFrame({"cell_types": cell_types})

# The "test" side of the split is the subsample; the "train" side is discarded.
_, subsample_indices = train_test_split(
    np.arange(len(cell_types)),  # Split row indices rather than the data itself
    test_size=SUBSAMPLE_FRACTION,  # Share of cells to keep
    stratify=cell_df["cell_types"],  # Preserve the cell-type proportions
    random_state=42  # For reproducibility
)

print(f"Original dataset size: {len(cell_types)}")

# Subset the data based on sampled indices.
# CAUTION: this overwrites `expressions` and `cell_types`, so re-running the
# cell without re-running the loading cell subsamples the subsample again.
expressions = expressions[subsample_indices, :]  # Subset expression matrix
cell_types = cell_types[subsample_indices]  # Subset cell types
# Output sizes for verification
print(f"Subsampled dataset size: {len(cell_types)}")

plot_cell_type_distribution(cell_types)


In [None]:
from sklearn.preprocessing import LabelEncoder
import scipy.sparse as sp
import scanpy as sc

# Assemble an AnnData object from the expression matrix, the per-cell labels
# and the gene identifiers, then write it to disk for the tokenizer.
adata = sc.AnnData(X=expressions)
adata.obs["cell_types"] = cell_types
adata.var_names = gene_names
adata.var["ensembl_id"] = gene_names
# Total count per cell. The row sums are flattened explicitly: a dense array
# already yields a 1-D result, but a scipy sparse matrix returns an (n, 1)
# matrix, which is not a valid obs column.
adata.obs["n_counts"] = np.asarray(adata.X.sum(axis=1)).ravel()
adata.obs["cell_id"] = adata.obs_names.values
# Integer class codes derived from the categorical encoding of the cell types.
adata.obs["label"] = adata.obs["cell_types"].astype("category").cat.codes

# Convert matrix to sparse format if not already
if not sp.issparse(adata.X):
    adata.X = sp.csr_matrix(adata.X)

# Save the AnnData object
adata.write_h5ad("data/adata.h5ad")



In [None]:
from geneformer import TranscriptomeTokenizer

# Tokenizer settings, collected first and then unpacked into the constructor.
tokenizer_options = dict(
    # obs column -> name of the column in the tokenized dataset
    custom_attr_name_dict={"cell_types": "cell_types", "cell_id": "cell_id", "label": "class_id"},
    model_input_size=2048,  # For 30M model series
    special_token=False,  # 30M models require this to be False
    gene_median_file="./gene_dictionaries_30m/gene_median_dictionary_gc30M.pkl",
    token_dictionary_file="./gene_dictionaries_30m/token_dictionary_gc30M.pkl",
    gene_mapping_file="./gene_dictionaries_30m/ensembl_mapping_dict_gc30M.pkl",
)
tokenizer = TranscriptomeTokenizer(**tokenizer_options)

# Tokenize the h5ad files found in ./data and write the result under
# ./tokenized_data with the "my_dataset" prefix.
tokenize_options = {
    "data_directory": "./data",
    "output_directory": "./tokenized_data",
    "output_prefix": "my_dataset",
    "file_format": "h5ad",
    "use_generator": False,
}
tokenizer.tokenize_data(**tokenize_options)


# Map numeric class ids back to cell-type names.
# The ids must follow the categorical *category order*, because that is how
# adata.obs["label"] (exported to the tokenized dataset as "class_id") was
# encoded via .astype("category").cat.codes. Enumerating .unique() would
# instead follow order of first appearance and can pair ids with the wrong
# names.
cell_types_v2 = list(adata.obs["cell_types"].astype("category").cat.categories)
id_class_dict = dict(enumerate(cell_types_v2))

# Save the dictionary
with open("./tokenized_data/my_dataset_id_class_dict.pkl", "wb") as f:
    pickle.dump(id_class_dict, f)


In [None]:
# NOTE(review): everything below is disabled code kept as a bare string
# literal; none of it executes (the notebook only echoes the string).
# It reads adata.obs["class_id"], but the AnnData object in this notebook
# defines obs["label"] — "class_id" is only the column name used in the
# tokenized dataset — so it would need adjusting before being re-enabled.
"""from datasets import Dataset

# Load the tokenized dataset
tokenized_data = Dataset.load_from_disk("./tokenized_data/my_dataset.dataset")

# Map the original labels back to the tokenized dataset
labels = adata.obs["class_id"].values
tokenized_data = tokenized_data.map(lambda x, idx: {"class_id": labels[idx]}, with_indices=True)

# Save back the tokenized dataset with the added labels
tokenized_data.save_to_disk("./tokenized_data/my_dataset_with_labels.dataset")
"""

In [None]:
# Timestamp strings for this run: YYMMDDHHMMSS and the date-only YYMMDD.
current_date = datetime.datetime.now()
datestamp = current_date.strftime("%y%m%d%H%M%S")
datestamp_min = current_date.strftime("%y%m%d")

# Every run writes into its own timestamped folder.
output_prefix = "cm_classifier_test"
output_dir = "output_directory/" + datestamp
os.makedirs(output_dir, exist_ok=True)

In [None]:
# All cell types present in the data. Defined but not passed to the
# Classifier below, which receives filter_data=None.
filter_data_dict = {"cell_types": list(adata.obs["cell_types"].unique())}

# Fine-tuning hyperparameters handed to the Classifier as training_args.
training_args = dict(
    num_train_epochs=0.9,
    learning_rate=0.000804,
    lr_scheduler_type="polynomial",
    warmup_steps=1812,
    weight_decay=0.258828,
    per_device_train_batch_size=12,
    seed=73,
)

classifier_options = dict(
    classifier="cell",
    cell_state_dict={"state_key": "cell_types", "states": "all"},
    filter_data=None,  # None = fine-tune with all input data
    training_args=training_args,
    max_ncells=None,
    # NOTE(review): the original note said this freezes the *last* 2 layers;
    # Geneformer's Classifier documentation describes freeze_layers=2 as
    # freezing the first two layers — confirm.
    freeze_layers=2,
    num_crossval_splits=1,  # single train/eval split, no cross-validation
    forward_batch_size=200,
    nproc=1,
    # rare_threshold=.05 (left disabled, as in the original call)
)
cc = Classifier(**classifier_options)

In [None]:
# Step 1: Split row positions into train (70%) and temp (30%).
# The split is not stratified (stratify=None), so class proportions may
# differ between the partitions.
train_indices, temp_indices = train_test_split(
    np.arange(len(cell_types)), test_size=0.3, random_state=42, stratify=None
)

# Step 2: Split temp (30%) into validation (15%) and test (15%)
eval_indices, test_indices = train_test_split(
    temp_indices, test_size=0.5, random_state=42, stratify=None
)

# Look up the cell IDs for these row positions.
# .iloc makes the positional lookup explicit: plain [] with integer arrays on
# this string-indexed Series relies on a deprecated positional fallback.
cell_ids = adata.obs["cell_id"]
train_ids = cell_ids.iloc[train_indices].tolist()
eval_ids = cell_ids.iloc[eval_indices].tolist()
test_ids = cell_ids.iloc[test_indices].tolist()

# Output the sizes
print(f"Total samples: {len(cell_types)}")
print(f"Training samples: {len(train_ids)}")
print(f"Validation samples: {len(eval_ids)}")
print(f"Test samples: {len(test_ids)}")


In [None]:
# Held-out test cells versus everything else, matched on the "cell_id"
# attribute. The training and validation IDs are pooled into "train" here.
train_test_id_split_dict = dict(
    attr_key="cell_id",
    train=[*train_ids, *eval_ids],
    test=test_ids,
)

# Example input_data_file: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset
cc.prepare_data(
    input_data_file="tokenized_data/my_dataset.dataset/",
    output_directory=output_dir,
    output_prefix=output_prefix,
    split_id_dict=train_test_id_split_dict,
)

In [None]:
from datasets import load_from_disk

# Location of the tokenized dataset; same path that is passed to
# cc.prepare_data as input_data_file elsewhere in this notebook.
dataset_path = "./tokenized_data/my_dataset.dataset"

# Read the dataset back from disk so it can be inspected in the notebook.
tokenized_data = load_from_disk(dataset_path)


In [None]:
# Bare expression: the notebook renders the dataset's repr as the cell output.
tokenized_data

In [None]:
# Results of this attempt go to results/<test_number>.
test_number = "5"
os.makedirs(f"results/{test_number}", exist_ok=True)

# Train/eval partition for fine-tuning. train_ids and eval_ids hold "cell_id"
# values, so the split must be keyed on "cell_id"; keying on "class_id" (as
# before) compares cell IDs with class codes and selects the wrong rows or
# none at all.
train_valid_id_split_dict = {"attr_key": "cell_id", "train": train_ids, "eval": eval_ids}

# Validate on the outputs of cc.prepare_data rather than the raw tokenized
# dataset: per Geneformer's cell-classification example, prepare_data writes
# "<output_prefix>_labeled_train.dataset" (with the label column the trainer
# needs) and the matching "<output_prefix>_id_class_dict.pkl" into its output
# directory. NOTE(review): confirm these file names against the installed
# Geneformer version.
# 6 layer Geneformer: https://huggingface.co/ctheodoris/Geneformer/blob/main/model.safetensors
all_metrics = cc.validate(
    model_directory="gf-6L-30M-i2048",
    prepared_input_data_file=f"{output_dir}/{output_prefix}_labeled_train.dataset",
    id_class_dict_file=f"{output_dir}/{output_prefix}_id_class_dict.pkl",
    output_directory=f"./results/{test_number}",
    output_prefix="my_fine_tuned_model",
    split_id_dict=train_valid_id_split_dict,
)
# to optimize hyperparameters, set n_hyperopt_trials=100 (or alternative desired # of trials)