In [None]:
import pandas as pd
import numpy as np

EMBEDDING_DIM = 1024  # width of one embedding vector (second axis of the reshape)

df = pd.read_parquet("./data/parquet/clinical.parquet")
embeddings_series = df["clinical_embedding"]

# Iterate over the stored values directly. `embeddings_series[i]` is a *label*
# lookup, so driving it with a positional counter only works while the index
# happens to be 0..n-1. Each entry is converted to a numeric block of shape
# (k, EMBEDDING_DIM) instead of a boxed object-dtype array.
# NOTE(review): float32 assumes single-precision embeddings -- confirm.
embedding_rows = [
    np.asarray(embedding.tolist(), dtype=np.float32).reshape(-1, EMBEDDING_DIM)
    for embedding in embeddings_series
]
# vstack rejects an empty list, so keep the "no rows" case explicit.
all_embeddings = (
    np.vstack(embedding_rows)
    if embedding_rows
    else np.empty((0, EMBEDDING_DIM), dtype=np.float32)
)
print(all_embeddings.shape)

In [None]:
import pandas as pd
import numpy as np

df = pd.read_parquet("./data/parquet/pathology_reports.parquet")
embeddings_series = df["report_embedding"]

# Build one object-dtype array per report and echo its shape as it is collected.
all_embeddings = []
for i in range(len(embeddings_series)):
    raw_embedding = embeddings_series[i]
    embeddings_objects = np.array(raw_embedding.tolist(), dtype=object)
    print(embeddings_objects.shape)
    all_embeddings.append(embeddings_objects)
# The per-report arrays are left as a plain list; the stacking step is disabled:
# all_embeddings = np.array(all_embeddings).reshape(-1, 1024)

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import seaborn as sns

EMBEDDING_DIM = 1024  # width of one embedding vector (second axis of the reshape)

df = pd.read_parquet("./data/parquet/clinical.parquet")
embeddings_series = df["clinical_embedding"]

# Iterate over the stored values directly. `embeddings_series[i]` is a *label*
# lookup, so driving it with a positional counter only works while the index
# happens to be 0..n-1. Each entry is converted to a numeric block of shape
# (k, EMBEDDING_DIM) instead of a boxed object-dtype array.
# NOTE(review): float32 assumes single-precision embeddings -- confirm.
embedding_rows = [
    np.asarray(embedding.tolist(), dtype=np.float32).reshape(-1, EMBEDDING_DIM)
    for embedding in embeddings_series
]
# vstack rejects an empty list, so keep the "no rows" case explicit.
all_embeddings = (
    np.vstack(embedding_rows)
    if embedding_rows
    else np.empty((0, EMBEDDING_DIM), dtype=np.float32)
)
print(all_embeddings.shape)

# Project the embeddings to two dimensions (fixed seed for repeatable layouts).
tsne = TSNE(n_components=2, random_state=0)
tsne_obj = tsne.fit_transform(all_embeddings)

# One row per embedding: both t-SNE coordinates plus the diagnosis year.
x_coords = tsne_obj[:, 0]
y_coords = tsne_obj[:, 1]
tsne_df = pd.DataFrame(
    {"X": x_coords, "Y": y_coords, "label": df["year_of_diagnosis"]}
)


def _label_sort_key(label):
    """Order labels by (is the text all digits?, the label itself)."""
    return (str(label).isdigit(), label)


# Distinct non-missing labels, sorted, become the category order.
valid_labels = tsne_df["label"].dropna().unique()
sorted_labels = sorted(valid_labels, key=_label_sort_key)

# Replace the raw column with an ordered categorical built from that order.
tsne_df["label"] = pd.Categorical(
    tsne_df["label"], categories=sorted_labels, ordered=True
)

# Apply the seaborn theme BEFORE the figure is created. Style settings only
# affect figures created afterwards, so setting it after plotting (as before)
# left the first run unstyled and made the look depend on whether the cell had
# already been executed in this kernel.
sns.set_style("whitegrid")

# Scatter plot of the two t-SNE dimensions, coloured by diagnosis year.
plt.figure(figsize=(14, 10))
sns.scatterplot(
    x="X",
    y="Y",
    data=tsne_df,
    hue="label",
    legend="full",
    palette="Spectral",
    alpha=1,
    marker="o",
    s=150,
)
plt.title(
    "t-SNE of TCGA-LUAD Patients Clinical Embeddings", fontsize=24
)  # Larger title font size
plt.xlabel("t-SNE Dimension 1", fontsize=24)  # X-axis label with font size
plt.ylabel("t-SNE Dimension 2", fontsize=24)  # Y-axis label with font size
plt.xticks(fontsize=24)  # Larger x-tick labels
plt.yticks(fontsize=24)  # Larger y-tick labels
plt.grid(True)  # Add gridlines
# Legend sits outside the axes, anchored to the upper-right corner of the plot.
plt.legend(
    title="Year of Diagnosis",
    title_fontsize=16,  # numeric size, consistent with `fontsize` below
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
    fontsize=16,
)
plt.tight_layout()
plt.show()

In [1]:
from datasets import Dataset
import numpy as np
import pandas as pd
import random
from dotenv import load_dotenv

# Load environment settings via python-dotenv before anything else runs.
load_dotenv()

max_patch_size = 10
fake_categories = ["A", "B", "C", "D"]  # stand-in class names for the labels

embeddings, ids, labels = [], [], []

for i in range(585):
    # Number of "real" patches for this sample: 1..max_patch_size inclusive.
    random_patch_size = np.random.randint(1, max_patch_size + 1)
    patches = np.random.rand(random_patch_size, 7, 7, 2048).astype(np.float32)

    # Zero-filled buffer of the full size; rows past `random_patch_size`
    # are never written and therefore act as the padding.
    data = np.zeros((max_patch_size, 7, 7, 2048), dtype=np.float32)
    data[:random_patch_size] = patches

    embeddings.append(data)
    ids.append(f"ID_{i}")
    labels.append(random.choice(fake_categories))

# NOTE(review): dtype=object boxes every element of the stacked array; kept
# as-is because the dataset builder below is fed exactly this object array.
embeddings = np.array(embeddings, dtype=object)

# Tabular view of the fake ids and labels.
df = pd.DataFrame({"id": ids, "label": labels})

# Combine ids, labels and embeddings into one dataset with torch formatting.
ds = Dataset.from_dict(
    {
        "id": df["id"].tolist(),
        "label": df["label"].tolist(),
        "data": embeddings,
    }
).with_format("torch")

# Round-trip through disk to check what actually gets persisted.
ds.save_to_disk("./testdata")

dataset = Dataset.load_from_disk("./testdata")
print(dataset)
print(dataset[0])


Saving the dataset (0/10 shards):   0%|          | 0/585 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'label', 'data'],
    num_rows: 585
})
{'id': 'ID_0', 'label': 'D', 'data': tensor([[[[0.8399, 0.5693, 0.3032,  ..., 0.1226, 0.7706, 0.5793],
          [0.0378, 0.8379, 0.2756,  ..., 0.2207, 0.0626, 0.0852],
          [0.3668, 0.1106, 0.3341,  ..., 0.1359, 0.2585, 0.7778],
          ...,
          [0.3451, 0.2968, 0.0931,  ..., 0.7341, 0.1598, 0.4960],
          [0.1226, 0.0465, 0.3575,  ..., 0.9395, 0.4018, 0.9023],
          [0.8692, 0.8688, 0.4421,  ..., 0.3952, 0.2855, 0.4411]],

         [[0.4677, 0.9430, 0.1870,  ..., 0.0442, 0.4276, 0.9601],
          [0.0126, 0.1896, 0.3887,  ..., 0.4643, 0.5717, 0.8621],
          [0.1299, 0.0903, 0.7232,  ..., 0.1370, 0.1709, 0.2838],
          ...,
          [0.5206, 0.3902, 0.4402,  ..., 0.0113, 0.9322, 0.8847],
          [0.6994, 0.8671, 0.0250,  ..., 0.7464, 0.6148, 0.8130],
          [0.9712, 0.1913, 0.2831,  ..., 0.1075, 0.5943, 0.0333]],

         [[0.3705, 0.5122, 0.6124,  ..., 0.4444, 0.8500, 0.8437],
 

In [None]:
from datasets import load_dataset
import numpy as np

dataset = load_dataset("json", data_files="testdata.json")
# Index the row first, then the column: `["data"][0]` would materialise the
# entire "data" column just to read its first element.
data0 = np.array(dataset["train"][0]["data"])
print(data0.shape)

(10, 7, 7, 2048)


In [None]:
from datasets import load_dataset
import numpy as np

dataset = load_dataset("arrow", data_files="testdata/data-00000-of-00001.arrow")
# Index the row first, then the column: `["data"][0]` would materialise the
# entire "data" column just to read its first element.
data0 = np.array(dataset["train"][0]["data"])
print(data0.shape)

Generating train split: 0 examples [00:00, ? examples/s]

(10, 7, 7, 2048)


In [43]:
from datasets import load_dataset
import numpy as np

dataset = load_dataset("Aakash-Tripathi/TCGA-LUAD-minds", "clinical")
# Row-first access reads a single record instead of the whole embedding column.
print(np.array(dataset["train"][0]["clinical_embedding"]).shape)

(1, 1024)


In [45]:
from datasets import load_dataset
import torch

dataset = load_dataset("Aakash-Tripathi/TCGA-LUAD-minds", "test")
# Row-first access reads a single record instead of the whole "data" column.
data0 = dataset["train"][0]["data"]

# convert to torch tensor
data0 = torch.tensor(data0)
print(data0.shape)

torch.Size([10, 7, 7, 2048])


---

In [None]:
import pandas as pd
from typing import List
import wget
import os
import zipfile
import pandas as pd
from ollama import Client


def download_wikipedia_data(
    data_path: str = "../../data/",
    download_path: str = "./",
    file_name: str = "vector_database_wikipedia_articles_embedded",
) -> None:
    """Make sure the Wikipedia embeddings CSV is present under ``data_path``.

    If ``<data_path>/<file_name>.csv`` already exists nothing is done.
    Otherwise the zip archive ``<download_path>/<file_name>.zip`` is downloaded
    (unless it is already there), extracted into ``data_path`` and deleted.

    Args:
        data_path: Directory the archive is extracted into.
        download_path: Directory that holds (or receives) the zip archive.
        file_name: Base name, without extension, of both the zip and the CSV.

    Returns:
        None. The function only has file-system side effects.
    """
    data_url = "https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip"

    csv_file_path = os.path.join(data_path, file_name + ".csv")
    zip_file_path = os.path.join(download_path, file_name + ".zip")

    if os.path.isfile(csv_file_path):
        print("File Downloaded")
        return

    if os.path.isfile(zip_file_path):
        print("Zip downloaded but not unzipped, unzipping now...")
    else:
        print("File not found, downloading now...")
        # Download straight to the path that is opened below, so a custom
        # `file_name` / `download_path` cannot drift from the saved file.
        wget.download(data_url, out=zip_file_path)

    # Unzip the data
    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
        zip_ref.extractall(data_path)

    # Remove the archive that was just extracted (previously a hard-coded
    # name relative to the current directory was removed instead).
    os.remove(zip_file_path)
    print(f"File downloaded to {data_path}")


def read_wikipedia_data(
    data_path: str = "../../data/",
    file_name: str = "vector_database_wikipedia_articles_embedded",
) -> pd.DataFrame:
    """Load the Wikipedia CSV and recompute its embeddings with Ollama.

    Any existing ``title_vector``, ``content_vector`` and ``vector_id`` columns
    are dropped and only the first 500 rows are kept. The ``title`` and ``text``
    of each row are then embedded with the ``all-minilm`` model, and the result
    is written back to the CSV it was read from.

    Args:
        data_path: Directory containing the CSV.
        file_name: Base name of the CSV, without extension.

    Returns:
        The truncated frame with fresh ``title_vector`` / ``content_vector``
        columns, each cell holding the embedding returned by the client.
    """
    csv_file_path = os.path.join(data_path, file_name + ".csv")
    data = pd.read_csv(csv_file_path)

    # Drop whichever of the old vector columns are present.
    data = data.drop(
        columns=["title_vector", "content_vector", "vector_id"], errors="ignore"
    )

    # Keep only the first 500 rows; copy so the new columns are added to an
    # independent frame rather than to a slice of the one just read.
    data = data.head(500).copy()

    client = Client(host="http://localhost:11434")
    client.pull("all-minilm")

    def _embed(prompt):
        """Return the embedding for one piece of text."""
        return client.embeddings(model="all-minilm", prompt=prompt)["embedding"]

    # Collect the vectors in plain lists and attach them once, instead of
    # writing list objects cell-by-cell into string-initialised columns.
    title_vectors = []
    content_vectors = []
    for title, text in zip(data["title"], data["text"]):
        title_vectors.append(_embed(title))
        content_vectors.append(_embed(text))
    data["title_vector"] = title_vectors
    data["content_vector"] = content_vectors

    # NOTE: this overwrites the source CSV with the 500-row subset.
    # Write to the same joined path that was read above; plain string
    # concatenation pointed at a different file whenever `data_path` had no
    # trailing separator.
    data.to_csv(csv_file_path, index=False)
    return data


# Fetch the archive if the CSV is missing, rebuild the embeddings with the
# default paths, then preview the first rows of the resulting frame.
download_wikipedia_data()
data = read_wikipedia_data()
data.head()

In [None]:
import redis
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from redis.commands.search.field import TextField, VectorField


# Pathology-report table that the rest of this section indexes.
data = pd.read_csv(r"F:\Projects\HoneyBee\data\dataframe\Pathology Report.csv")

# Placeholder (empty-string) columns for the extracted text and its embedding.
data = data.assign(extracted_text="", extracted_text_embedding="")


# Connection settings
REDIS_HOST = "10.0.1.16"
REDIS_PORT = 6379
REDIS_PASSWORD = ""  # default for passwordless Redis

# Open the connection and issue a PING so connection problems show up here.
redis_client = redis.Redis(
    host=REDIS_HOST,
    port=REDIS_PORT,
    password=REDIS_PASSWORD,
)
redis_client.ping()

# Index parameters
VECTOR_DIM = 1024  # length of the vectors
VECTOR_NUMBER = len(data)  # initial number of vectors
INDEX_NAME = "embeddings-index"  # name of the search index
PREFIX = "pathology_report"  # prefix for the document keys
DISTANCE_METRIC = "COSINE"  # distance metric for the vectors (ex. COSINE, IP, L2)

# # Define RediSearch fields for each of the columns in the dataset
# title = TextField(name="title")
# url = TextField(name="url")
# text = TextField(name="extracted_text")
# extracted_text_embedding = VectorField(
#     "extracted_text_embedding",
#     "FLAT",
#     {
#         "TYPE": "FLOAT32",
#         "DIM": VECTOR_DIM,
#         "DISTANCE_METRIC": DISTANCE_METRIC,
#         "INITIAL_CAP": VECTOR_NUMBER,
#     },
# )
# fields = [title, url, text, extracted_text_embedding]

# Schema for the search index, derived from the columns of `data`.
# Every ordinary column becomes a TextField; the two special columns are
# appended afterwards so they always come last.
special_columns = ["extracted_text", "extracted_text_embedding"]
fields = [
    TextField(name=column)
    for column in data.columns
    if column not in special_columns
]

# Free-text field for the extracted report text.
fields.append(TextField(name="extracted_text"))

# Flat (brute-force) vector field for the embedding of that text.
vector_field_options = {
    "TYPE": "FLOAT32",
    "DIM": VECTOR_DIM,
    "DISTANCE_METRIC": DISTANCE_METRIC,
    "INITIAL_CAP": VECTOR_NUMBER,
}
fields.append(
    VectorField("extracted_text_embedding", "FLAT", vector_field_options)
)

# (Re)create the search index from scratch.
index = redis_client.ft(INDEX_NAME)
try:
    # info() raises a ResponseError when the index does not exist.
    index.info()
except redis.exceptions.ResponseError:
    # Nothing to drop. Other failures (e.g. connection errors) are no longer
    # swallowed by a bare `except:` and will surface here.
    pass
else:
    print("Index already exists")
    # Drop only this index and the documents it indexed. The previous
    # flushall() erased every key on the Redis server.
    index.dropindex(delete_documents=True)
    print("Index dropped")

# Create RediSearch Index
index.create_index(
    fields=fields,
    definition=IndexDefinition(prefix=[PREFIX], index_type=IndexType.HASH),
)
print("Index created")

## Load documents into the index

In [None]:
def index_documents(client: redis.Redis, prefix: str, documents: pd.DataFrame):
    """Store every row of ``documents`` as a Redis hash under ``<prefix>:<id>``.

    Args:
        client: Redis connection used for the writes.
        prefix: Key prefix; the hash key is ``f"{prefix}:{row['id']}"``.
        documents: Frame to store. It must contain an ``id`` column; every
            column becomes a field of the hash.

    Notes:
        If a row's ``extracted_text_embedding`` is a list, tuple or NumPy
        array it is serialised to float32 bytes before being stored. Other
        values (such as an empty-string placeholder) are written unchanged.
        All writes are queued on one non-transactional pipeline and sent
        together, instead of one round trip per document.
    """
    import numpy as np  # local import so the function does not rely on an earlier cell

    pipeline = client.pipeline(transaction=False)
    for doc in documents.to_dict("records"):
        key = f"{prefix}:{str(doc['id'])}"

        # replace a list of floats with its byte-vector representation
        embedding = doc.get("extracted_text_embedding")
        if isinstance(embedding, (list, tuple, np.ndarray)):
            doc["extracted_text_embedding"] = np.asarray(
                embedding, dtype=np.float32
            ).tobytes()

        pipeline.hset(key, mapping=doc)
    pipeline.execute()


# Push the frame into Redis, then report how many keys db0 now holds.
index_documents(redis_client, PREFIX, data)
loaded_key_count = redis_client.info()["db0"]["keys"]
print(
    f"Loaded {loaded_key_count} documents in Redis search index with name: {INDEX_NAME}"
)

## Simple Vector Search Queries with Ollama Query Embeddings

In [None]:
from ollama import Client

# Module-level Ollama client; search_redis below reads it to embed the query.
client = Client(host="http://localhost:11434")
# Pull the embedding model before the first query is issued.
client.pull("all-minilm")


def search_redis(
    redis_client: redis.Redis,
    user_query: str,
    index_name: str = "embeddings-index",
    vector_field: str = "title_vector",
    return_fields: list = None,
    hybrid_fields="*",
    k: int = 20,
    print_results: bool = True,
) -> List[dict]:
    """Run a KNN vector search against a RediSearch index.

    The query text is embedded with the module-level Ollama ``client``
    (model ``all-minilm``) and compared against ``vector_field``.

    Args:
        redis_client: Redis connection that owns the index.
        user_query: Free-text query to embed and search for.
        index_name: Name of the search index.
        vector_field: Vector field the KNN clause is run against.
        return_fields: Fields to return for each hit. ``None`` (the default)
            means ``["title", "url", "text", "vector_score"]``.
        hybrid_fields: Pre-filter expression placed before the KNN clause.
        k: Number of neighbours to request and to page.
        print_results: When true, print one ranked line per hit.

    Returns:
        The documents of the search result, ordered by ``vector_score``.
    """
    # None sentinel instead of a mutable list default shared between calls.
    if return_fields is None:
        return_fields = ["title", "url", "text", "vector_score"]

    # Creates embedding vector from user query
    embedded_query = client.embeddings(
        prompt=user_query,
        model="all-minilm",
    )["embedding"]

    # Prepare the Query
    base_query = f"{hybrid_fields}=>[KNN {k} @{vector_field} $vector AS vector_score]"
    query = (
        Query(base_query)
        .return_fields(*return_fields)
        .sort_by("vector_score")
        .paging(0, k)
        .dialect(2)
    )
    params_dict = {
        "vector": np.array(embedded_query).astype(dtype=np.float32).tobytes()
    }

    # perform vector search
    results = redis_client.ft(index_name).search(query, params_dict)
    if print_results:
        for i, article in enumerate(results.docs):
            # The printed score is one minus the returned vector_score.
            score = 1 - float(article.vector_score)
            # Fall back to the document id when "title" was not requested, so
            # printing cannot fail for custom `return_fields`.
            heading = getattr(article, "title", article.id)
            print(f"{i}. {heading} (Score: {round(score ,3) })")
    return results.docs


# Example query: ask for the 10 nearest neighbours of "Art", leaving the index
# name, vector field and return fields at search_redis' defaults.
results = search_redis(redis_client, "Art", k=10)
print(results)

In [None]:
import pandas as pd
import numpy as np
import ast

# load F:\Projects\HoneyBee\pathology_report.csv
df = pd.read_csv(r"F:\Projects\HoneyBee\pathology_report.csv")


def _parse_embedding(cell):
    """Turn a stringified literal into a NumPy array; pass other values through."""
    if not isinstance(cell, str):
        return cell
    return np.array(ast.literal_eval(cell))


# Embeddings that come out of the CSV as text are parsed back into arrays.
df["report_embedding"] = df["report_embedding"].apply(_parse_embedding)

In [None]:
# Print the shape of every parsed report embedding, one per line.
report_embeddings = df["report_embedding"]
for i in range(len(df)):
    print(report_embeddings[i].shape)

---

In [None]:
# Sample radiology report text (header sections of a CT chest/abdomen/pelvis
# report). The leading whitespace on the continuation lines is part of the string.
text_input = """EXAMINATION: CT TAP  w/Cont (Thorax, Abdomen,Pelvis)
                CLINICAL HISTORY:   with provided history of "Ovarian cancer, monitor"
                PRIOR EXAM(s): Comparison with CT 2/28/2019 and abdomen MRI 10/21/2021.
                TECHNIQUE: CT TAP  w/Cont (Thorax, Abdomen,Pelvis). CT imaging of the chest, abdomen and pelvis was performed after intravenous administration of contrast.
                Note: This report may employ disease-specific or contextual approach to highlight not just descriptive details, but relevant observations of the patient's oncologic imaging status for informed decision-making. Additional findings not pertinent to oncologic assessment may have been previously described and are discussed when clinically relevant.
                """

In [None]:
from typing import List
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.chat_models import ChatOllama
from langchain.output_parsers import OutputFixingParser
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

# Chat model (Ollama, "llama2") that the chain below sends the prompt to.
model = ChatOllama(model="llama2")


# Output schema given to JsonOutputParser: one string field named "summary".
# NOTE(review): documented with comments rather than a class docstring, since a
# docstring may end up in the schema text injected into the prompt -- confirm.
class Patient(BaseModel):
    summary: str = Field(description="summary:")


# Report text passed to the chain as the `query` input. The leading whitespace
# on the continuation lines is part of the string.
patient_query = """EXAMINATION: CT TAP  w/Cont (Thorax, Abdomen,Pelvis)
                CLINICAL HISTORY:   with provided history of "Ovarian cancer, monitor"
                PRIOR EXAM(s): Comparison with CT 2/28/2019 and abdomen MRI 10/21/2021.
                TECHNIQUE: CT TAP  w/Cont (Thorax, Abdomen,Pelvis). CT imaging of the chest, abdomen and pelvis was performed after intravenous administration of contrast.
                Note: This report may employ disease-specific or contextual approach to highlight not just descriptive details, but relevant observations of the patient's oncologic imaging status for informed decision-making. Additional findings not pertinent to oncologic assessment may have been previously described and are discussed when clinically relevant.
                """

# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=Patient)

# The parser's format instructions are baked in; only `query` is supplied per call.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# prompt -> chat model -> JSON parser
chain = prompt | model | parser
print(chain)

try:
    formatted_output = chain.invoke({"query": patient_query})
    print(formatted_output)
except Exception as e:
    print(f"Error: {e}")
    # Retry with a parser that asks an LLM to repair malformed output.
    # Reuse the configured `model` rather than a fresh default ChatOllama(),
    # so changing the model above also changes the one used for fixing.
    new_parser = OutputFixingParser.from_llm(parser=parser, llm=model)
    chain = prompt | model | new_parser
    misformatted = chain.invoke({"query": patient_query})
    print(misformatted)