In [37]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
import requests
import json

In [39]:
def query_pdc(query):

    url = 'https://pdc.cancer.gov/graphql'

    print('Sending query.')

    pdc_response = requests.post(url, json={'query': query})

    if pdc_response.ok:
        return pdc_response.json()
    else:
        return pdc_response.raise_for_status()

In [40]:
study_metadata_query = '''
{
  programsProjectsStudies {
    program_id
    program_submitter_id
    name
    sponsor
    start_date
    end_date
    program_manager
    projects {
      project_id
      project_submitter_id
      name
      studies {
        study_id
        study_submitter_id
        submitter_id_name
        study_name
        program_name
        project_name
        program_id
        project_id
        project_submitter_id
        disease_type
        primary_site
        analytical_fraction
        experiment_type
        acquisition_type
        cases_count
        aliquots_count
      }
    }
  }
}'''

In [41]:
study_mdata = query_pdc(study_metadata_query)

Sending query.


In [42]:
formatted = json.dumps(study_mdata, indent=2)
print(formatted[0:3000])

{
  "data": {
    "programsProjectsStudies": [
      {
        "program_id": "029f67dd-5cc8-11ea-bf04-0ef15c86e253",
        "program_submitter_id": "Georgetown Proteomics Research Program",
        "name": "Georgetown Proteomics Research Program",
        "sponsor": null,
        "start_date": "2020-03-11",
        "end_date": null,
        "program_manager": "Ratna Thangudu",
        "projects": [
          {
            "project_id": "8ffb768a-648c-11ea-b1fd-0aad30af8a83",
            "project_submitter_id": "Georgetown Lung Cancer Proteomics Study",
            "name": "Georgetown Lung Cancer Proteomics Study",
            "studies": [
              {
                "study_id": "17d5bccf-d028-11ea-b1fd-0aad30af8a83",
                "study_submitter_id": "Georgetown Lung Cancer Proteomics Study",
                "submitter_id_name": "Georgetown Lung Cancer Proteomics Study",
                "study_name": "Georgetown Lung Cancer Proteomics Study",
                "program_name": nu

In [43]:
output_file = 'studymdata.json'
with open(output_file, 'w') as outfile:
    outfile.write(formatted)
print(f"Study metadata saved to {output_file}")

Study metadata saved to studymdata.json


In [44]:
import csv

def flatten_data(data):
    flattened_data = []

    programs_projects_studies = data.get('data', {}).get('programsProjectsStudies', [])
    for program in programs_projects_studies:
        for project in program.get('projects', []):
            for study in project.get('studies', []):
                # Construct a flat dictionary for each study
                row = {
                    'Program Name': program['name'],
                    'Project Name': project['name'],
                    'Study Name': study['study_name'],
                    'Cases Count': study['cases_count'],
                    'Disease Type': study['disease_type'],
                    'Primary Site': study['primary_site'],
                    'Experiment Type': study['experiment_type']
                }
                flattened_data.append(row)
    return flattened_data


In [45]:
def write_to_csv(data, filename='/content/drive/MyDrive/Capstone/output.csv'):
    if not data:
        print("No data provided to write to CSV.")
        return

    try:
        fieldnames = data[0].keys()
        with open(filename, 'w', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)
        print(f"Data successfully saved to {filename}.")
    except IOError as e:
        print(f"An error occurred while writing to the file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


In [46]:
!pip install datasets
from datasets import load_dataset

data = load_dataset('csv', data_files='/content/drive/MyDrive/Capstone/output.csv')



In [47]:
pip install transformers datasets torch



In [48]:
!pip install weaviate-client



In [49]:
import weaviate

def connect_to_weaviate(weaviate_url, api_key):
    client = weaviate.Client(
    url= "https://pp769-l99q1fyt.weaviate.network",
    auth_client_secret=weaviate.AuthApiKey(api_key="zvDbZFC0SwRBPqy94OdlmvpLR5QjuO8FogFv")
    )
    return client

client = connect_to_weaviate("https://pp769-l99q1fyt.weaviate.network", "zvDbZFC0SwRBPqy94OdlmvpLR5QjuO8FogFv" )

if client.is_ready():
    print("Connection successful!")
else:
    print("Connection failed. Please check your URL and API key.")

Connection successful!


In [50]:
def get_weaviate_version(client):
    if client.is_ready():
        meta_info = client.get_meta()
        return meta_info.get('version', 'Version not found')
    else:
        return "Connection failed. Please check your URL and API key."

version = get_weaviate_version(client)
print("Weaviate version:", version)


Weaviate version: 1.25.4


In [52]:
def create_schema(client):
    if client is None:
        print("Client is not initialized.")
        return
    study_schema = {
        "class": "Study",
        "properties": [
            {"name": "study_id", "dataType": ["string"]},
            {"name": "study_submitter_id", "dataType": ["string"]},
            {"name": "submitter_id_name", "dataType": ["string"]},
            {"name": "study_name", "dataType": ["string"]},
            {"name": "program_name", "dataType": ["string"]},
            {"name": "project_name", "dataType": ["string"]},
            {"name": "program_id", "dataType": ["string"]},
            {"name": "project_id", "dataType": ["string"]},
            {"name": "project_submitter_id", "dataType": ["string"]},
            {"name": "disease_type", "dataType": ["string"]},
            {"name": "primary_site", "dataType": ["string"]},
            {"name": "analytical_fraction", "dataType": ["string"]},
            {"name": "experiment_type", "dataType": ["string"]},
            {"name": "acquisition_type", "dataType": ["string"]},
            {"name": "cases_count", "dataType": ["int"]},
            {"name": "aliquots_count", "dataType": ["int"]}
        ]
    }

    publication_schema = {
        "class": "Publication",
        "properties": [
            {"name": "publicationId", "dataType": ["string"]},
            {"name": "pubmedId", "dataType": ["string"]},
            {"name": "doi", "dataType": ["string"]},
            {"name": "author", "dataType": ["string[]"]},
            {"name": "title", "dataType": ["text"], "indexInverted": True},
            {"name": "journal", "dataType": ["string"]},
            {"name": "journalUrl", "dataType": ["string"]},
            {"name": "year", "dataType": ["int"]},
            {"name": "abstract", "dataType": ["text"], "indexInverted": True},
            {"name": "citation", "dataType": ["string"]},
            {"name": "diseaseTypes", "dataType": ["string[]"]},
            {"name": "relatedStudies", "dataType": ["Study"], "cardinality": "hasMany"}
        ]
    }
    try:
        existing_classes = client.schema.get()['classes']
        existing_class_names = [cls['class'] for cls in existing_classes]

        if "Study" not in existing_class_names:
            client.schema.create_class(study_schema)
            print("Study schema created successfully.")
        else:
            print("Study class already exists.")

        if "Publication" not in existing_class_names:
            client.schema.create_class(publication_schema)
            print("Publication schema created successfully.")
        else:
            print("Publication class already exists.")

    except Exception as e:
        print(f"Failed to check or create schema: {e}")

In [53]:
def fetch_and_insert_publications(client, url, graphql_query):
    headers = {"Content-Type": "application/json"}
    response = requests.post(url, json={'query': graphql_query}, headers=headers)
    if response.status_code == 200:
        data = response.json()['data']['getPaginatedUIPublication']['uiPublication']
        print(f"Retrieved {len(data)} publications.")

        for pub in data:
            authors = [author.strip() for author in pub.get('author', '').split(',')] if pub.get('author') else []
            print(f"Processing publication: {pub['title']} with {len(pub.get('studies', []))} studies")
            publication_object = {
                "publicationId": pub['publication_id'],
                "pubmedId": pub['pubmed_id'],
                "doi": pub['doi'],
                "author": [author.strip() for author in pub.get('author', '').split(',')] if pub.get('author') else [],
                "title": pub['title'],
                "journal": pub['journal'],
                "journalUrl": pub['journal_url'],
                "year": int(pub.get('year', 0)),
                "abstract": pub.get('abstract', ""),
                "citation": pub.get('citation', ""),
                "diseaseTypes": pub.get('disease_types', [])
            }
            try:
                client.data_object.create(publication_object, "Publication")
                print(f"Publication Abstract: {pub.get('abstract', '')}")
            except Exception as e:
                print(f"Failed to insert publication: {pub.get('abstract', '')} - {str(e)}")
    else:
        print(f"Failed to fetch publications: {response.status_code} - {response.text}")

In [54]:
def fetch_and_insert_studies(client, url, graphql_query):
    headers = {"Content-Type": "application/json"}
    response = requests.post(url, json={'query': graphql_query}, headers=headers)
    if response.status_code == 200:
        programs = response.json()['data']['programsProjectsStudies']
        for program in programs:
            for project in program['projects']:
                for study in project['studies']:
                    study_object = {
                        "study_id": study['study_id'],
                        "study_submitter_id": study.get('study_submitter_id', ""),
                        "submitter_id_name": study.get('submitter_id_name', ""),
                        "study_name": study['study_name'],
                        "program_name": study['program_name'],
                        "project_name": study['project_name'],
                        "program_id": study['program_id'],
                        "project_id": study['project_id'],
                        "project_submitter_id": study.get('project_submitter_id', ""),
                        "disease_type": study['disease_type'],
                        "primary_site": study['primary_site'],
                        "analytical_fraction": study.get('analytical_fraction', ""),
                        "experiment_type": study.get('experiment_type', ""),
                        "acquisition_type": study.get('acquisition_type', ""),
                        "cases_count": int(study.get('cases_count', 0)),
                        "aliquots_count": int(study.get('aliquots_count', 0))
                    }
                    try:
                        client.data_object.create(study_object, "Study")
                        print(f"Inserted study: {study['study_name']} linked to project {project['name']}")
                    except Exception as e:
                        print(f"Failed to insert study {study['study_name']}: {str(e)}")
    else:
        print(f"Failed to fetch studies: {response.status_code} - {response.text}")

In [55]:
if __name__ == "__main__":
    client = connect_to_weaviate(weaviate_url, api_key)
    weaviate_url = 'https://pp769-l99q1fyt.weaviate.network'
    api_key = 'zvDbZFC0SwRBPqy94OdlmvpLR5QjuO8FogFv'

    fetch_and_insert_publications(client=client, url=graphql_url, graphql_query=publication_query)
    fetch_and_insert_studies(client, graphql_url, study_metadata_query)

Retrieved 44 publications.
Processing publication: Integrated proteogenomic characterization of glioblastoma evolution with 2 studies
Publication Abstract: The evolutionary trajectory of glioblastoma (GBM) is a multifaceted biological process that extends beyond genetic alterations alone. Here, we perform an integrative proteogenomic analysis of 123 longitudinal glioblastoma pairs and identify a highly proliferative cellular state at diagnosis and replacement by activation of neuronal transition and synaptogenic pathways in recurrent tumors. Proteomic and phosphoproteomic analyses reveal that the molecular transition to neuronal state at recurrence is marked by post-translational activation of the wingless-related integration site (WNT)/ planar cell polarity (PCP) signaling pathway and BRAF protein kinase. Consistently, multi-omic analysis of patient-derived xenograft (PDX) models mirror similar patterns of evolutionary trajectory. Inhibition of B-raf proto-oncogene (BRAF) kinas

In [56]:
def query_publications_by_year(client, year):
    gql_query = f"""
    {{
        Get {{
            Publication(
                where: {{
                    path: ["year"],
                    operator: Equal,
                    valueInt: {year}
                }}
            ) {{
                publicationId
                title
                year
            }}
        }}
    }}
    """
    return client.query.raw(gql_query)

def semantic_search_publications(client, keyword):
    gql_query = f"""
    {{
        Get {{
            Publication(
                where: {{
                    path: ["abstract"],
                    operator: Like,
                    valueString: "{keyword}"
                }}
            ) {{
                publicationId
                title
                abstract
            }}
        }}
    }}
    """
    return client.query.raw(gql_query)

def find_similar_publications(client, abstract):
    gql_query = f"""
    {{
        Get {{
            Publication(
                nearText: {{
                    concepts: ["{abstract}"],
                    certainty: 0.7
                }}
            ) {{
                publicationId
                title
                abstract
            }}
        }}
    }}
    """
    return client.query.raw(gql_query)

def main():
    client = connect_to_weaviate("https://pp769-l99q1fyt.weaviate.network", "zvDbZFC0SwRBPqy94OdlmvpLR5QjuO8FogFv")

    if client.is_ready():
        print("Connection successful!")

        print("\nQuerying all publications from the year 2020...")
        publications_from_2020 = query_publications_by_year(client, 2020)
        print(publications_from_2020)

        print("\nQuerying publications related to 'Cancer'...")
        cancer_related_publications = semantic_search_publications(client, "Cancer")
        print(cancer_related_publications)

        print("\nFinding similar publications to a given abstract...")
        similar_publications = find_similar_publications(client, "example abstract here")
        print(similar_publications)

    else:
        print("Connection failed. Please check your URL and API key.")

if __name__ == "__main__":
    main()


Connection successful!

Querying all publications from the year 2020...
{'data': {'Get': {'Publication': [{'publicationId': '90f114bd-2b57-11eb-93d9-0adc6d0c0251', 'title': 'Proteogenomic Landscape of Breast Cancer Tumorigenesis and Targeted Therapy', 'year': 2020}, {'publicationId': '161370ab-4f57-11ea-b5ed-0a7ef35af225', 'title': 'Proteogenomic Characterization of Endometrial Carcinoma', 'year': 2020}, {'publicationId': 'fe52a762-74ad-11ec-bde9-0a4e2186f121', 'title': 'Deep Proteomics Using Two Dimensional Data Independent Acquisition Mass Spectrometry', 'year': 2020}, {'publicationId': '245451cc-c519-11ea-9ab4-0ae52bb11cb7', 'title': 'Proteogenomics of Non-smoking Lung Cancer in East Asia Delineates Molecular Signatures of Pathogenesis and Progression', 'year': 2020}, {'publicationId': '24548009-c519-11ea-9ab4-0ae52bb11cb7', 'title': 'Proteogenomic Characterization Reveals Therapeutic Vulnerabilities in Lung Adenocarcinoma', 'year': 2020}, {'publicationId': '3c25e6ab-1507-11eb-8dc2-

In [58]:
!pip install transformers



In [64]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the tokenizer and model from Hugging Face's Transformers
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Function to generate embeddings
def get_embedding_function(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().tolist()
    return embeddings

text = "Example abstract text for embedding"
# Generate embeddings
embeddings = get_embedding_function(text)

publication_object = {
    "pubmed_id": "example_pub",
    "abstract": text,
    "embeddings": embeddings
}
print("Generated Embeddings:", embeddings)

try:
    client.data_object.create(data_object=publication_object, class_name="Publication")
    print("Data inserted successfully.")
except Exception as e:
    print(f"Failed to insert data into Weaviate: {str(e)}")


Generated Embeddings: [-0.004773537162691355, -0.07366238534450531, -0.24704734981060028, -0.22698190808296204, 0.030707087367773056, -0.2608434855937958, -0.024355215951800346, 0.2256229668855667, 0.15915220975875854, -0.17719651758670807, -0.19198530912399292, 0.06828570365905762, -0.13849997520446777, 0.006158149801194668, -0.2831863462924957, 0.1900382786989212, -0.17914994060993195, 0.1722506582736969, 0.051133424043655396, -0.007557927630841732, -0.10050210356712341, -0.0022351464722305536, -0.49910974502563477, 0.16080442070960999, 0.6187970042228699, -0.2574285566806793, 0.003770755371078849, 0.04784826934337616, -0.5063055157661438, 0.030827246606349945, 0.06813422590494156, 0.25597453117370605, 0.008969760499894619, -0.2160501778125763, -0.20582297444343567, 0.029385603964328766, 0.19034753739833832, -0.09060962498188019, -0.20969301462173462, 0.1414690613746643, -0.4959445595741272, -0.331805557012558, 0.4533627927303314, -0.0029274059925228357, 0.023881515488028526, -0.4721