In [3]:
import requests
import pandas as pd
import matplotlib.pyplot as plt

def load_gene_synonyms(file_path):
    """
    Load gene synonyms from a TSV file into a dictionary.
    """
    df = pd.read_csv(file_path, sep="\t")
    gene_synonym_dict = {}
    for _, row in df.iterrows():
        gene = row['Gene']
        synonyms = row['Gene synonym'].split(', ') if pd.notna(row['Gene synonym']) else []
        gene_synonym_dict[gene] = synonyms
    return gene_synonym_dict

def fetch_studies(gene_synonym_dict, genes_of_interest):
    """
    Fetch clinical studies for genes of interest, then subset based on whether the gene or its synonyms
    are found in the title, brief description, or detailed description.
    """
    base_url = 'https://clinicaltrials.gov/api/v2/studies'
    all_studies = []

    for gene in genes_of_interest:
        params = {
            'query.term': gene,
            'pageSize': 1000,
            'format': 'json'
        }
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            studies = data.get('studies', [])
            synonyms = gene_synonym_dict.get(gene, [])
            search_terms = [gene] + synonyms  # Combine gene and its synonyms

            for study in studies:
                protocol_section = study.get('protocolSection', {})
                identification_module = protocol_section.get('identificationModule', {})
                status_module = protocol_section.get('statusModule', {})
                description_module = protocol_section.get('descriptionModule', {})
                conditions_module = protocol_section.get('conditionsModule', {})
                design_module = protocol_section.get('designModule', {})
                oversight_module = protocol_section.get('oversightModule', {})
                outcome_module = protocol_section.get('outcomesModule', {})
                
                brief_summary = description_module.get('briefSummary', 'N/A')
                detailed_description = description_module.get('detailedDescription', 'N/A')
                
                interventions = []
                intervention_module = protocol_section.get('armsInterventionsModule', {})
                for intervention in intervention_module.get('interventions', []):
                    interventions.append(intervention.get('name', 'N/A'))
                
                study_info = {
                    'Gene': gene,
                    'NCTId': identification_module.get('nctId', 'N/A'),
                    'Title': identification_module.get('briefTitle', 'N/A'),
                    'BriefSummary': brief_summary,
                    'DetailedDescription': detailed_description,
                    'Overall_Status': status_module.get('overallStatus', 'N/A'),
                    'Conditions': ', '.join(conditions_module.get('conditions', [])),
                    'Interventions': ', '.join(interventions),
                    'StudyType': design_module.get('studyType', 'N/A'),
                    'Phase': design_module.get('phases', ['N/A'])[0] if design_module.get('phases') else 'N/A',
                    'Verified_Date': status_module.get('statusVerifiedDate', 'N/A'),
                    'IsFDARegulatedDrug': oversight_module.get('isFdaRegulatedDrug', 'N/A'),
                    'IsFDARegulatedDevice': oversight_module.get('isFdaRegulatedDevice', 'N/A'),
                    'IsUnapprovedDevice': oversight_module.get('isUnapprovedDevice', 'N/A'),
                    'Keyword': ', '.join(conditions_module.get('keywords', [])),
                    'DesignInterventionModelDescription': design_module.get('designInfo', {}).get('interventionModelDescription', 'N/A'),
                    'PrimaryOutcomes': outcome_module.get('primaryOutcomes', 'N/A'),
                    'SecondaryOutcomes': outcome_module.get('secondaryOutcomes', 'N/A'),
                    'WhyStopped': status_module.get('whyStopped', 'N/A'),
                    'LastKnownStatus': status_module.get('lastKnownStatus', 'N/A'),
                }

                # Check if the gene or its synonyms appear in the study details
                if any(term.lower() in (study_info['Title'].lower() + brief_summary.lower() + detailed_description.lower()) for term in search_terms):
                    all_studies.append(study_info)
        else:
            print(f"Failed to fetch data for gene: {gene}")

    return pd.DataFrame(all_studies)


genes_of_interest = pd.read_csv("/data/ep924610/project_nb/paper_code/heatmap_results/skin_results.csv")["Gene"].to_list()

gene_synonym_file = "/data/ep924610/project_nb/paper_code/proteinatlas_4ef89daa.tsv"
gene_synonym_dict = load_gene_synonyms(gene_synonym_file)


df_studies = fetch_studies(gene_synonym_dict, genes_of_interest)


df_studies.to_csv('/data/ep924610/project_nb/paper_code/clinical_trials/clinical_trials_studies.csv', index=False)

In [4]:
print(df_studies.info())
print(df_studies.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54649 entries, 0 to 54648
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Gene                                54649 non-null  object
 1   NCTId                               54649 non-null  object
 2   Title                               54649 non-null  object
 3   BriefSummary                        54649 non-null  object
 4   DetailedDescription                 54649 non-null  object
 5   Overall_Status                      54649 non-null  object
 6   Conditions                          54649 non-null  object
 7   Interventions                       54649 non-null  object
 8   StudyType                           54649 non-null  object
 9   Phase                               54649 non-null  object
 10  Verified_Date                       54649 non-null  object
 11  IsFDARegulatedDrug                  54649 non-null  ob

In [5]:
gene_counts = df_studies['Gene'].value_counts()

# Print the counts for each score
print("Counts for each score:")
print(gene_counts)

Counts for each score:
Gene
AR        972
HLA-B     955
ERBB2     850
ENG       789
ADM       784
         ... 
SORL1       1
TCF20       1
ZNF382      1
IKBKG       1
NFKBIA      1
Name: count, Length: 2132, dtype: int64


In [6]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os
import pandas as pd

# Establish Spark connection
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Path to the directory containing JSON files
json_dir = "/data/ep924610/project_nb/paper_code/clinical_trials/targets"

# List all JSON files in the directory
json_files = [os.path.join(json_dir, f) for f in os.listdir(json_dir) if f.endswith('.json')]

# Read JSON files into a DataFrame
df = spark.read.json(json_files)

# Extract approvedName and tractability information
df_tractability = df.select(
    F.col("approvedSymbol").alias("Gene_Name"),
    F.explode("tractability").alias("tractability")
).select(
    "Gene_Name",
    F.col("tractability.modality").alias("modality"),
    # Cast boolean value to integer (1 for true, 0 for false)
    F.col("tractability.value").cast("integer").alias("value")
)

# Map modalities to scores
df_tractability = df_tractability.withColumn(
    "Modality_Score",
    F.when(F.col("modality").contains("AB"), 10)  # Antibody
    .when(F.col("modality").contains("SM"), 5)      # Small molecule
    .when(F.col("modality").contains("PR") | F.col("modality").contains("OC"), 1)  # PR or OC
    .otherwise(0)  # Nothing
)

# Calculate modality scores
df_modality_scores = df_tractability.groupBy("Gene_Name").agg(
    F.max(F.when(F.col("modality").contains("AB"), 10).otherwise(0)).alias("Antibody"),
    F.max(F.when(F.col("modality").contains("SM"), 5).otherwise(0)).alias("Small_molecule"),
    F.max(F.when(F.col("modality").contains("PR") | F.col("modality").contains("OC"), 1).otherwise(0)).alias("PR_or_OC")
)

# Calculate the total modality score
df_modality_scores = df_modality_scores.withColumn(
    "Modality_Score",
    F.col("Antibody") + F.col("Small_molecule") + F.col("PR_or_OC")
)

# Convert to Pandas DataFrame
modality_scores_pd = df_modality_scores.toPandas()

# Display the results
print(modality_scores_pd.head())

# Save the results to a CSV file
#modality_scores_pd.to_csv('/data/ep924610/project_nb/paper_code/clinical_trials/modality_scores_by_gene_name.csv', index=False)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/03 11:52:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

  Gene_Name  Antibody  Small_molecule  PR_or_OC  Modality_Score
0     CHRM1        10               5         1              16
1     ZGPAT        10               5         1              16
2    IFIT1B        10               5         1              16
3     H2AZ2        10               5         1              16
4     OR1F1        10               5         1              16


In [7]:
import pandas as pd

# Define Phase and Status priority scores
phase_priority = {
    'PHASE4': 6,
    'PHASE3': 5,
    'PHASE2': 4,
    'PHASE1': 3,
    'EARLY_PHASE1': 2,
    'N/A': 0,
    'NA': 0
}

status_priority = {
    'COMPLETED': 3,
    'ACTIVE_NOT_RECRUITING': 2,
    'RECRUITING': 2,
    'NOT_YET_RECRUITING': 2,
    'UNKNOWN': 1,
    'TERMINATED': 0,
    'WITHDRAWN': 0,
    'SUSPENDED': 0
}

# List of cancer-related terms
cancer_terms = [
    'cancer', 'melanoma', 'hcc', 'carcinoma', 'sarcoma', 'leukemia', 'lymphoma', 
    'tumor', 'neoplasm', 'oncology', 'metastatic', 'malignant'
]

def calculate_scores(row, modality_scores):
    # Assign phase score based on the phase of the current row
    phase_score = phase_priority.get(row['Phase'], 0)
    
    # Assign status score based on the status of the current row
    status_score = status_priority.get(row['Overall_Status'], 0)
    
    # Assign cancer score
    conditions_lower = row['Conditions'].lower()
    cancer_score = 10 if any(term in conditions_lower for term in cancer_terms) else 0
    
    # Get the Modality_Score for the gene
    modality_score = modality_scores.get(row['Gene'], 0)
    
    # Combine all scores
    total_score = phase_score + status_score + cancer_score + modality_score
    
    # Return a dictionary of individual scores and the total score
    return {
        'Phase_Score': phase_score,
        'Status_Score': status_score,
        'Cancer_Score': cancer_score,
        'Modality_Score': modality_score,
        'Total_Score': total_score
    }

# Filter for INTERVENTIONAL studies
df_studies = df_studies[df_studies["StudyType"] == "INTERVENTIONAL"]

# Convert modality_scores to a dictionary for easy lookup
modality_scores_dict = modality_scores_pd.set_index('Gene_Name')['Modality_Score'].to_dict()

# Add scores to the DataFrame
scores_df = df_studies.apply(lambda row: calculate_scores(row, modality_scores_dict), axis=1, result_type='expand')
df_studies = pd.concat([df_studies, scores_df], axis=1)

# Function to get the row with the highest phase and status for each gene
def get_highest_phase_status(group):
    # Get the highest phase for the group
    highest_phase = group['Phase'].apply(lambda x: phase_priority.get(x, 0)).idxmax()
    highest_phase_row = group.loc[highest_phase]
    
    # Filter rows with the same highest phase
    same_phase_rows = group[group['Phase'] == highest_phase_row['Phase']]
    
    # Get the row with the highest status score within the highest phase
    highest_status_row = same_phase_rows.loc[same_phase_rows['Overall_Status'].apply(lambda x: status_priority.get(x, 0)).idxmax()]
    
    return highest_status_row

# Apply the function to each gene group
gene_scores = df_studies.groupby('Gene').apply(get_highest_phase_status).reset_index(drop=True)

# Count the number of trials for each phase for each gene
phase_counts = df_studies.groupby(['Gene', 'Phase']).size().unstack(fill_value=0)

# Add the number of trials for each phase
gene_scores = gene_scores.merge(phase_counts, on='Gene', how='left')

# Add missing genes with a score of 0
missing_genes = set(genes_of_interest) - set(gene_scores['Gene'])
for gene in missing_genes:
    new_row = pd.DataFrame([{
        'Gene': gene,
        'Total_Score': 0,
        'Phase_Score': 0,
        'Status_Score': 0,
        'Cancer_Score': 0,
        'Modality_Score': 0,
        'Phase': 'N/A',
        'Overall_Status': 'N/A',
        'Conditions': 'N/A'
    }])
    gene_scores = pd.concat([gene_scores, new_row], ignore_index=True)

    # Add zero counts for all phases
    for phase in phase_priority.keys():
        gene_scores.loc[gene_scores['Gene'] == gene, phase] = 0



In [8]:
# Remove unwanted columns
gene_scores = gene_scores.drop(columns=['Title', 'BriefSummary', 'DetailedDescription', 'Interventions', "Verified_Date", "IsFDARegulatedDrug", "IsFDARegulatedDevice", "IsUnapprovedDevice", 'Keyword',"StudyType", 'DesignInterventionModelDescription', 'PrimaryOutcomes',
       'SecondaryOutcomes', 'WhyStopped', 'LastKnownStatus'], errors='ignore')

In [9]:
gene_scores.columns

Index(['Gene', 'NCTId', 'Overall_Status', 'Conditions', 'Phase', 'Phase_Score',
       'Status_Score', 'Cancer_Score', 'Modality_Score', 'Total_Score',
       'EARLY_PHASE1', 'N/A', 'NA', 'PHASE1', 'PHASE2', 'PHASE3', 'PHASE4'],
      dtype='object')

In [10]:
gene_scores.sort_values("Total_Score", ascending=False).head(25)

Unnamed: 0,Gene,NCTId,Overall_Status,Conditions,Phase,Phase_Score,Status_Score,Cancer_Score,Modality_Score,Total_Score,EARLY_PHASE1,N/A,NA,PHASE1,PHASE2,PHASE3,PHASE4
429,DPYD,NCT01641458,COMPLETED,Colorectal Cancer,PHASE4,6,3,10,16,35,0.0,0.0,7.0,8.0,9.0,0.0,2.0
671,HMGCR,NCT04776889,COMPLETED,Prostate Cancer Metastatic,PHASE4,6,3,10,16,35,0.0,0.0,0.0,0.0,1.0,0.0,2.0
351,CTH,NCT02713087,COMPLETED,Brain Tumor,PHASE4,6,3,10,16,35,0.0,0.0,4.0,2.0,0.0,0.0,1.0
588,GEM,NCT01664975,COMPLETED,Peripheral T-cell Lymphoma,PHASE4,6,3,10,16,35,0.0,0.0,53.0,53.0,108.0,24.0,4.0
1101,PIK3CA,NCT02688881,COMPLETED,Refractory Solid Tumors,PHASE4,6,3,10,16,35,1.0,0.0,6.0,67.0,77.0,18.0,2.0
287,CISH,NCT01501487,COMPLETED,Breast Cancer,PHASE4,6,3,10,16,35,0.0,0.0,1.0,4.0,2.0,0.0,1.0
61,AIDA,NCT00465933,COMPLETED,Acute Promyelocytic Leukemia,PHASE4,6,3,10,16,35,0.0,0.0,9.0,0.0,2.0,2.0,2.0
1164,PREP,NCT01169220,COMPLETED,"Gastrointestinal Hemorrhage, Colon Cancer, Div...",PHASE4,6,3,10,16,35,5.0,0.0,435.0,34.0,45.0,58.0,82.0
1557,TYMS,NCT00138060,COMPLETED,Metastatic Colorectal Cancer,PHASE4,6,3,10,16,35,1.0,0.0,1.0,20.0,31.0,5.0,1.0
68,AKT1,NCT01206764,COMPLETED,Renal Cell Carcinoma,PHASE4,6,3,10,16,35,1.0,0.0,16.0,23.0,25.0,4.0,1.0


In [11]:
# Save the results
gene_scores.to_csv('/data/ep924610/project_nb/paper_code/clinical_trials/clinical_trial_scores.csv', index=False)