## Merge all the trials

In [64]:
import pandas as pd
import os
import json

def merge_csv_by_columns(folder_path, output_file):
    """Concatenate every CSV in a folder into one de-duplicated DataFrame.

    Files are stacked row-wise with ``pd.concat``; columns are aligned by
    name, so a column missing from one file is filled with NaN for that
    file's rows. Exact duplicate rows are dropped, the result is written to
    ``output_file`` (without the index) and also returned.

    Parameters
    ----------
    folder_path : str
        Directory whose ``*.csv`` files are merged (non-recursive).
    output_file : str
        Path of the CSV file to write the merged result to.

    Returns
    -------
    pandas.DataFrame
        The merged, de-duplicated frame with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``folder_path`` contains no CSV files to merge.
    """
    # If the output is written into the source folder, skip it on later runs
    # so a re-run does not ingest its own previous result.
    output_abs = os.path.abspath(output_file)

    # os.listdir returns names in arbitrary order; sort them so the merged
    # row order is reproducible across runs and machines.
    csv_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith('.csv')
        and os.path.abspath(os.path.join(folder_path, name)) != output_abs
    )
    if not csv_files:
        raise ValueError(f"No CSV files found in {folder_path!r}")

    dataframes = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_files]

    # Stack all frames; sort=False keeps the column order of first appearance.
    merged_df = pd.concat(dataframes, ignore_index=True, sort=False)

    # Remove rows where 'location' is empty
    # merged_df = merged_df.dropna(subset=['Locations'])

    # Drop exact duplicate rows and renumber so the index has no gaps.
    merged_df = merged_df.drop_duplicates(ignore_index=True)

    merged_df.to_csv(output_file, index=False)

    return merged_df

# Example usage: merge every CSV under ./trials/ and write the combined
# table to merged_output.csv in the current working directory.
# NOTE: the result variable is spelled `clinical_trails_data` (sic); later
# cells reference it by that exact name.
folder_path = './trials/'
output_file = 'merged_output.csv'
clinical_trails_data = merge_csv_by_columns(folder_path, output_file)

In [65]:
# Preview the first five merged rows as a quick sanity check of the load.
clinical_trails_data.head(5)

Unnamed: 0,NCT Number,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,Primary Outcome Measures,...,Study Design,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents
0,NCT03334747,Safety of KAE609 in Adults With Uncomplicated ...,https://clinicaltrials.gov/study/NCT03334747,,COMPLETED,KAE609 will be evaluated primarily for hepatic...,YES,Malaria,DRUG: KAE609|DRUG: Coartem,Number of Participants With at Least 2 CTCAE G...,...,Allocation: RANDOMIZED|Intervention Model: PAR...,CKAE609A2202|207813/Z/17/Z,2017-11-16,2019-11-23,2019-11-23,2017-11-07,2020-09-03,2021-10-11,"Novartis Investigative Site, Lambarene, Gabon|...","Study Protocol, https://storage.googleapis.com..."
1,NCT01190202,Epidemiology Study of Malaria Transmission Int...,https://clinicaltrials.gov/study/NCT01190202,,COMPLETED,The aim of this epidemiology study is to chara...,YES,Malaria|Malaria Vaccines,PROCEDURE: Blood sampling|PROCEDURE: Assessmen...,Number of Subjects With Plasmodium Falciparum ...,...,Allocation: NA|Intervention Model: SINGLE_GROU...,114001,2011-03-14,2013-12-20,2013-12-20,2010-08-27,2017-09-29,2020-08-18,"GSK Investigational Site, Ouagadougou 01, Burk...",
2,NCT01955382,Evaluation of Oral Activated Charcoal on Antim...,https://clinicaltrials.gov/study/NCT01955382,,COMPLETED,Background:\n\n- Malaria is caused by small pa...,YES,Malaria|Severe Malaria,DRUG: Actidose Aqua|DRUG: Artesunate|DRUG: Amo...,"Parasite Clearance Half-life, To compare paras...",...,Allocation: RANDOMIZED|Intervention Model: PAR...,999913209|13-I-N209|NCT01955382,2013-09,2015-07,2015-07,2013-10-07,2017-04-20,2018-02-06,"Universite des Sciencies, Techniques et Techno...","Study Protocol and Statistical Analysis Plan, ..."
3,NCT01465048,Optimisation of Controlled Human Malaria Infec...,https://clinicaltrials.gov/study/NCT01465048,,COMPLETED,"This is an open label, human pilot study to op...",YES,Malaria|Plasmodium Falciparum,BIOLOGICAL: Plasmodium falciparum sporozoites ...,"Number of Participants Infected, To determine ...",...,Allocation: NON_RANDOMIZED|Intervention Model:...,VAC049,2011-10,2012-02,2013-02,2011-11-04,2013-04-08,2013-06-24,Centre for Clinical Vaccinology and Tropical M...,
4,NCT02458092,Evaluate the Safety and Efficacy of Plasmodium...,https://clinicaltrials.gov/study/NCT02458092,,COMPLETED,The purpose of this study is to determine whet...,YES,Malaria,BIOLOGICAL: Plasmodium falciparum Malaria Prot...,Number of Participants With Solicited Adverse ...,...,Allocation: RANDOMIZED|Intervention Model: PAR...,A-14620.b|WRAIR 1417,2008-04,2008-12,2009-06,2015-05-29,2018-11-07,2023-06-28,,


## Preprocess the data

In [67]:
def preprocess(merged_df):
    """Return a normalised copy of the trials table.

    Text values are lower-cased and every missing value is replaced by an
    empty string. The input frame is left untouched, so re-running the cell
    always starts from the same data.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        The merged clinical-trials table.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``merged_df``.
    """
    # Work on a copy: the original body referenced an undefined global `df`
    # instead of the `merged_df` argument and mutated it in place.
    cleaned = merged_df.copy()
    for col in cleaned.columns:
        column = cleaned[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            # Lower-case only real strings; `.str.lower()` would turn any
            # non-string value in a mixed column into NaN (and raise on an
            # object column holding no strings at all).
            column = column.map(lambda value: value.lower() if isinstance(value, str) else value)
        cleaned[col] = column.fillna('')
    return cleaned

In [69]:
# Run the preprocessing step on the merged trials and show the first rows
# of the result (head() with no argument returns five rows).
preprocessed_trials = preprocess(clinical_trails_data)
preprocessed_trials.head()

Unnamed: 0,NCT Number,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,Primary Outcome Measures,...,Study Design,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents
0,nct03334747,safety of kae609 in adults with uncomplicated ...,https://clinicaltrials.gov/study/nct03334747,,completed,kae609 will be evaluated primarily for hepatic...,yes,malaria,drug: kae609|drug: coartem,number of participants with at least 2 ctcae g...,...,allocation: randomized|intervention model: par...,ckae609a2202|207813/z/17/z,2017-11-16,2019-11-23,2019-11-23,2017-11-07,2020-09-03,2021-10-11,"novartis investigative site, lambarene, gabon|...","study protocol, https://storage.googleapis.com..."
1,nct01190202,epidemiology study of malaria transmission int...,https://clinicaltrials.gov/study/nct01190202,,completed,the aim of this epidemiology study is to chara...,yes,malaria|malaria vaccines,procedure: blood sampling|procedure: assessmen...,number of subjects with plasmodium falciparum ...,...,allocation: na|intervention model: single_grou...,114001,2011-03-14,2013-12-20,2013-12-20,2010-08-27,2017-09-29,2020-08-18,"gsk investigational site, ouagadougou 01, burk...",
2,nct01955382,evaluation of oral activated charcoal on antim...,https://clinicaltrials.gov/study/nct01955382,,completed,background:\n\n- malaria is caused by small pa...,yes,malaria|severe malaria,drug: actidose aqua|drug: artesunate|drug: amo...,"parasite clearance half-life, to compare paras...",...,allocation: randomized|intervention model: par...,999913209|13-i-n209|nct01955382,2013-09,2015-07,2015-07,2013-10-07,2017-04-20,2018-02-06,"universite des sciencies, techniques et techno...","study protocol and statistical analysis plan, ..."
3,nct01465048,optimisation of controlled human malaria infec...,https://clinicaltrials.gov/study/nct01465048,,completed,"this is an open label, human pilot study to op...",yes,malaria|plasmodium falciparum,biological: plasmodium falciparum sporozoites ...,"number of participants infected, to determine ...",...,allocation: non_randomized|intervention model:...,vac049,2011-10,2012-02,2013-02,2011-11-04,2013-04-08,2013-06-24,centre for clinical vaccinology and tropical m...,
4,nct02458092,evaluate the safety and efficacy of plasmodium...,https://clinicaltrials.gov/study/nct02458092,,completed,the purpose of this study is to determine whet...,yes,malaria,biological: plasmodium falciparum malaria prot...,number of participants with solicited adverse ...,...,allocation: randomized|intervention model: par...,a-14620.b|wrair 1417,2008-04,2008-12,2009-06,2015-05-29,2018-11-07,2023-06-28,,


## Convert the CSV data to a JSON-like structure (Original Code)

In [70]:
# import pandas as pd
# import json
 
# # Function to convert each row to a JSON-like structure based on the prompt
# def row_to_json(row):
#     trial_info = {
#         "Trial ID": row["NCT Number"],
#         "Title": row["Study Title"],
#         "URL": row["Study URL"],
#         "Status": row["Study Status"],
#         "Brief Summary": row["Brief Summary"],
#         "Phase": row["Phases"],
#         "Study Start Date": row["Start Date"],
#         "Expected Completion Date": row["Completion Date"]
    
#     }
 
#     # Assuming 'Interventions' field contains drug names
#     drug_info = {
#         "Drug Name": row["Interventions"]
#     }
 
#     # Assuming 'Conditions' field contains disease names
#     disease_info = {
#         "Disease Name": row["Conditions"]
#     }

#     locations_info = {
#         "Location Name": row["Locations"]
#     }
 
#     # Structuring the JSON object
#     json_structure = {
#         "Trial": trial_info,
#         "Drug/Intervention": drug_info,
#         "Disease/Condition": disease_info,
#         # "Sponsor": sponsor_info,
#         "Locations": locations_info
#         # "Outcome Measures": outcome_info,
#         # "Participants/Patients": participants_info
#     }
 
#     return json_structure
 
# # Convert each row of the dataframe to JSON structure and store in a list
# json_data = [row_to_json(row) for index, row in preprocessed_df.iterrows()]
 
# # Convert the list to JSON string for display
# json_string = json.dumps(json_data, indent=4)
 
# # Display the first few JSON structures as a sample
# print(json_string[:5000])  # Display only the first 5000 characters for brevity
 


[
    {
        "Trial": {
            "Trial ID": "nct03334747",
            "Title": "safety of kae609 in adults with uncomplicated plasmodium falciparum malaria.",
            "URL": "https://clinicaltrials.gov/study/nct03334747",
            "Status": "completed",
            "Brief Summary": "kae609 will be evaluated primarily for hepatic safety of single and multiple doses in sequential cohorts with increasing doses.this study aims to determine the maximum safe dose of the investigational drug kae609 in malaria patients.",
            "Phase": "phase2",
            "Study Start Date": "2017-11-16",
            "Expected Completion Date": "2019-11-23"
        },
        "Drug/Intervention": {
            "Drug Name": "drug: kae609|drug: coartem"
        },
        "Disease/Condition": {
            "Disease Name": "malaria"
        },
        "Locations": {
            "Location Name": "novartis investigative site, lambarene, gabon|novartis investigative site, kintampo, ghana|nova

## Convert the CSV data to a JSON-like structure (Updated Code)

In [98]:
# Re-importing necessary libraries as the execution state was reset
import pandas as pd
import json


# Function to reduce a string to plain ASCII (e.g. "\u00e9" becomes "e")
def clean_unicode(text):
    """Return ``text`` reduced to ASCII; non-string values pass through unchanged.

    Accented characters are first decomposed (NFKD) into a base letter plus
    combining marks, so encoding to ASCII drops only the marks and keeps the
    letter ("lambar\u00e9n\u00e9" -> "lambarene"). Characters that have no ASCII
    decomposition are still removed.

    Parameters
    ----------
    text : Any
        Value to clean; only ``str`` instances are modified.

    Returns
    -------
    Any
        The ASCII-only string, or ``text`` itself when it is not a string.
    """
    import unicodedata  # local import keeps this block self-contained

    if not isinstance(text, str):
        return text
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')

# Intervention type prefixes recognised in the "Interventions" column. Each
# one becomes a capitalised key in the output (e.g. "drug" -> "Drug").
_INTERVENTION_TYPES = ["drug", "procedure", "device", "behavioral", "combination_product", "biological", "radiation", "other"]


def _split_pipe_field(value):
    """Split a '|'-delimited cell into stripped, non-empty items.

    Missing values (NaN/None) and blank strings both yield an empty list, so
    cells that were filled with '' during preprocessing do not produce
    phantom entries.
    """
    if not isinstance(value, str):
        return []
    return [item.strip() for item in value.split('|') if item.strip()]


def _parse_interventions(value):
    """Group 'type: name' interventions into a dict keyed by capitalised type."""
    grouped = {key.capitalize(): [] for key in _INTERVENTION_TYPES}
    for entry in _split_pipe_field(value):
        prefix, separator, name = entry.partition(':')
        type_key = prefix.strip().lower()  # case-insensitive match on the type prefix
        if separator and type_key in _INTERVENTION_TYPES:
            grouped[type_key.capitalize()].append(clean_unicode(name.strip()))
        else:
            # Unknown or missing type prefix: keep the full text under "Other"
            # rather than silently dropping the intervention.
            grouped["Other"].append(clean_unicode(entry))
    return grouped


def _parse_locations(value):
    """Turn 'institution, city, ..., country' entries into a list of dicts."""
    locations = []
    for entry in _split_pipe_field(value):
        parts = [clean_unicode(part.strip()) for part in entry.split(',')]
        # NOTE(review): fields are taken by position, so an institution name
        # that itself contains a comma shifts "City" - confirm against the data.
        locations.append({
            "Institution": parts[0],
            "City": parts[1] if len(parts) > 1 else None,
            # With a single part there is no separate country to report.
            "Country": parts[-1] if len(parts) > 1 else None,
        })
    return locations


def row_to_json(row):
    """Convert one trials row into the nested JSON-like dict used downstream.

    Parameters
    ----------
    row : Mapping or pandas.Series
        Must provide the keys "NCT Number", "Study Title", "Study URL",
        "Study Status", "Brief Summary", "Phases", "Start Date",
        "Completion Date", "Interventions", "Conditions" and "Locations".

    Returns
    -------
    dict
        ``{"Trial": {...}, "Drug/Intervention": {type: [names]},
        "Disease/Condition": {"Disease Name": [...]}, "Locations": [...]}``.
        Blank or missing pipe-delimited cells yield empty lists.

    Raises
    ------
    KeyError
        If ``row`` lacks one of the required keys.
    """
    trial_info = {
        "Trial ID": clean_unicode(row["NCT Number"]),
        "Title": clean_unicode(row["Study Title"]),
        "URL": clean_unicode(row["Study URL"]),
        "Status": clean_unicode(row["Study Status"]),
        "Brief Summary": clean_unicode(row["Brief Summary"]),
        "Phase": clean_unicode(row["Phases"]),
        "Study Start Date": clean_unicode(row["Start Date"]),
        "Expected Completion Date": clean_unicode(row["Completion Date"])
    }

    disease_info = {
        "Disease Name": [clean_unicode(disease) for disease in _split_pipe_field(row["Conditions"])]
    }

    # Structuring the JSON object
    json_structure = {
        "Trial": trial_info,
        "Drug/Intervention": _parse_interventions(row["Interventions"]),
        "Disease/Condition": disease_info,
        "Locations": _parse_locations(row["Locations"])
    }

    return json_structure

# Convert each row of the preprocessed dataframe to the nested JSON structure
# and collect the results in a list (one dict per trial).
json_data = [row_to_json(row) for index, row in preprocessed_trials.iterrows()]

# Serialise the whole list to an indented JSON string for display
json_string = json.dumps(json_data, indent=4)

# Display the first few JSON structures as a sample
print(json_string[:10000])  # Display only the first 10000 characters for brevity


[
    {
        "Trial": {
            "Trial ID": "nct03334747",
            "Title": "safety of kae609 in adults with uncomplicated plasmodium falciparum malaria.",
            "URL": "https://clinicaltrials.gov/study/nct03334747",
            "Status": "completed",
            "Brief Summary": "kae609 will be evaluated primarily for hepatic safety of single and multiple doses in sequential cohorts with increasing doses.this study aims to determine the maximum safe dose of the investigational drug kae609 in malaria patients.",
            "Phase": "phase2",
            "Study Start Date": "2017-11-16",
            "Expected Completion Date": "2019-11-23"
        },
        "Drug/Intervention": {
            "Drug": [
                "kae609",
                "coartem"
            ],
            "Procedure": [],
            "Device": [],
            "Behavioral": [],
            "Combination_product": [],
            "Biological": [],
            "Radiation": [],
            "Other": 