In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from tabulate import tabulate

# Input files, resolved relative to the notebook's working directory.
BREAST_CANCER_CSV = 'breast-cancer.csv'
COVID_CSV = 'covid.csv'

# One DataFrame per dataset; every later cell reads from these two frames.
breast_cancer_df = pd.read_csv(BREAST_CANCER_CSV)
covid_df = pd.read_csv(COVID_CSV)

In [None]:
## Tally how often each value of the 'Sex' column occurs in each dataset.
# dropna=False keeps a separate bucket for rows whose 'Sex' is missing.
# NOTE: despite the `_df` suffix, value_counts() returns a pandas Series.
breast_cancer_sex_df, covid_sex_df = (
    trials['Sex'].value_counts(dropna=False)
    for trials in (breast_cancer_df, covid_df)
)

In [None]:
## Sex distribution charts

def plot_sex_counts(counts, title, color='hotpink'):
    """Draw a bar chart of per-sex counts with the count written above each bar.

    Parameters
    ----------
    counts : pandas.Series
        Counts indexed by sex category (e.g. the result of ``value_counts``).
    title : str
        Title shown above the chart.
    color : str, optional
        Bar colour passed through to pandas' plot call.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the chart was drawn on.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    counts.plot(kind='bar', color=color, ax=ax)

    # Bars sit at integer x positions 0..n-1, so the enumerate index is the
    # x coordinate and the count itself is the y coordinate of the label.
    for position, count in enumerate(counts.values):
        ax.text(position, count, str(count), ha='center', va='bottom')

    ax.set_xlabel('Sex')
    ax.set_ylabel('Count')
    ax.set_title(title)
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()
    plt.show()
    return ax


plot_sex_counts(breast_cancer_sex_df, 'Distribution of sex in the Breast Cancer DataFrame')
plot_sex_counts(covid_sex_df, 'Distribution of sex in the COVID DataFrame')

## Side by side
# Aligning the two Series on their index leaves NaN where a category is
# present in only one dataset; show those as a zero-height bar.
sex_comparison_df = pd.DataFrame(
    {'Breast Cancer': breast_cancer_sex_df, 'COVID': covid_sex_df}
).fillna(0)

comparison_ax = sex_comparison_df.plot(kind='bar', figsize=(12, 6), color=['green', 'orange'])
comparison_ax.set_title('Comparison between Counts of sex in Breast Cancer and COVID Trials')
comparison_ax.set_ylabel('Number of Occurrences')
comparison_ax.set_xlabel('Sex')

plt.show()


In [None]:
## Function that gets the age groups
def get_groups(df):
    """Explode the comma-separated 'Age' column into one row per age group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'Age' column holding strings such as
        ``'ADULT, OLDER_ADULT'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``trial_id`` (the index label of the source row) and ``Type``
        (one age group). The columns are present even when no row produced a
        group, so callers can always select ``['Type']``.

    Notes
    -----
    Rows whose 'Age' is missing are skipped instead of raising, and
    surrounding whitespace is stripped from every group.
    """
    records = []

    for trial_id, age in df['Age'].items():
        # A missing value is a float NaN (or None) and has no .split().
        if pd.isna(age):
            continue

        for group_type in str(age).split(','):
            group_type = group_type.strip()
            if group_type:
                records.append({'trial_id': trial_id, 'Type': group_type})

    return pd.DataFrame(records, columns=['trial_id', 'Type'])
# Long-format age-group tables (one row per trial/age-group pair) for each dataset.
breast_cancer_groupings_df = breast_cancer_df.pipe(get_groups)
covid_groupings_df = covid_df.pipe(get_groups)

In [None]:
## Bar plot ##

def plot_age_groupings(df, title):
    """Show a bar chart of how often each value of ``df['Type']`` occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Type' column (one age group per row).
    title : str
        Title shown above the chart.
    """
    group_counts = df['Type'].value_counts()

    plt.figure(figsize=(10, 6))
    axes = sns.barplot(x=group_counts.index, y=group_counts.values, alpha=0.8)

    # Write each count just above its bar; bar i sits at x position i.
    for position, count in enumerate(group_counts.to_numpy()):
        axes.text(position, count, count, ha='center', va='bottom')

    axes.set_title(title)
    axes.set_ylabel('Number of Occurrences', fontsize=12)
    axes.set_xlabel('Age Groups', fontsize=12)

    # Vertical tick labels so long group names do not overlap.
    plt.xticks(rotation=90)

    plt.show()
# One age-group chart per dataset, breast cancer first.
for groupings, heading in (
    (breast_cancer_groupings_df, 'Age Groups in Breast Cancer Trials'),
    (covid_groupings_df, 'Age Groups in COVID Trials'),
):
    plot_age_groupings(groupings, heading)


In [None]:
## Side-by-side bar plot of age-group counts ##

# Tally how many times each age group occurs in each dataset.
breast_cancer_GroupsType_count = breast_cancer_groupings_df['Type'].value_counts()
covid_GroupsType_count = covid_groupings_df['Type'].value_counts()

# Align the two tallies on age group; a group absent from one dataset is NaN.
df = pd.DataFrame({
    'Breast Cancer': breast_cancer_GroupsType_count,
    'COVID': covid_GroupsType_count,
})
print(df)

# Order by the breast-cancer count (largest first), then turn NaN into 0.
df = df.sort_values(by='Breast Cancer', ascending=False).fillna(0)

# pandas creates the figure and returns the axes the bars were drawn on.
comparison_ax = df.plot(kind='bar', figsize=(12, 6))

comparison_ax.set_title('Comparison between Age Groupings in Breast Cancer and COVID Trials')
comparison_ax.set_ylabel('Number of Occurrences')
comparison_ax.set_xlabel('Age Groupings')

plt.show()

In [None]:
## Study-design dataframe cleaning
## Function that gets the designs
def get_designs(df, verbose=True):
    """Explode the 'Study Design' column into one row per design attribute.

    Each cell is expected to look like
    ``'Allocation: RANDOMIZED|Masking: NONE'``: attributes separated by ``|``
    and each attribute written as ``<type>: <name>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Study Design' column.
    verbose : bool, optional
        When True (the default) the resulting frame is printed before it is
        returned. Pass False to suppress the printout.

    Returns
    -------
    pandas.DataFrame
        Columns ``trial_id`` (index label of the source row), ``Type`` and
        ``Name``. The columns are present even when the result is empty.

    Notes
    -----
    Rows with a missing 'Study Design' are skipped, and entries without a
    ``:`` are reported and skipped instead of raising. Only the first ``:``
    separates type from name, so a name that itself contains a colon is kept
    whole.
    """
    records = []

    for trial_id, study_design in df['Study Design'].items():
        # A missing value is a float NaN (or None) and has no .split().
        if pd.isna(study_design):
            continue

        for design in str(study_design).split('|'):
            design_type, separator, design_name = design.partition(':')
            if not separator:
                print(f"Skipping invalid design format: {design}")
                continue

            records.append({
                'trial_id': trial_id,
                'Type': design_type.strip(),
                'Name': design_name.strip(),
            })

    cleaned_df = pd.DataFrame(records, columns=['trial_id', 'Type', 'Name'])
    if verbose:
        print(cleaned_df)
    return cleaned_df

# Long-format study-design tables (one row per trial/design-attribute pair).
breast_cancer_design_df = breast_cancer_df.pipe(get_designs)
covid_design_df = covid_df.pipe(get_designs)


In [None]:
def plot_design_type_counts(design_df, title, color='hotpink'):
    """Draw a horizontal bar chart of how often each design 'Type' occurs.

    Parameters
    ----------
    design_df : pandas.DataFrame
        Frame with a 'Type' column (one design attribute per row).
    title : str
        Title shown above the chart.
    color : str, optional
        Bar colour passed through to pandas' plot call.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the chart was drawn on.
    """
    # Computed once and reused for both the bars and their labels so the two
    # can never disagree; ascending order puts the largest bar at the top.
    type_counts = design_df['Type'].value_counts().sort_values()

    fig, ax = plt.subplots(figsize=(8, 6))
    type_counts.plot(kind='barh', color=color, ax=ax)

    # Bars sit at integer y positions 0..n-1; place each count at the bar end.
    for position, count in enumerate(type_counts):
        ax.text(count, position, str(count), ha='left', va='center')

    ax.set_xlabel('Count')
    ax.set_ylabel('Designs')
    ax.set_title(title)
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()
    plt.show()
    return ax


plot_design_type_counts(breast_cancer_design_df,
                        'Distribution of Designs in the Breast Cancer DataFrame')
##----
# The title now names the dataset actually plotted (it previously repeated
# the breast-cancer title on the COVID chart).
plot_design_type_counts(covid_design_df,
                        'Distribution of Designs in the COVID DataFrame')

In [None]:
def count_design_names(design_df, design_type, strip_parenthetical=False):
    """Count the 'Name' values of one design 'Type'.

    Parameters
    ----------
    design_df : pandas.DataFrame
        Frame with 'Type' and 'Name' columns.
    design_type : str
        Value of 'Type' to keep (e.g. ``'Masking'``).
    strip_parenthetical : bool, optional
        When True, remove any ``(...)`` part from each name, strip the
        whitespace left behind and drop names that end up empty.

    Returns
    -------
    pandas.Series
        Counts indexed by name, sorted by index.
    """
    # .loc with a boolean mask yields a new Series, so the transformations
    # below never write into a slice of design_df.
    names = design_df.loc[design_df['Type'] == design_type, 'Name']
    if strip_parenthetical:
        names = names.str.replace(r'\(.*\)', '', regex=True).str.strip()
        names = names[names != '']
    return names.value_counts().sort_index()


def plot_design_comparison(design_type, title, category_label, strip_parenthetical=False):
    """Plot breast-cancer vs COVID counts for one design attribute.

    Reads the notebook-level ``breast_cancer_design_df`` and
    ``covid_design_df`` frames.

    Parameters
    ----------
    design_type : str
        Value of 'Type' to compare (e.g. ``'Allocation'``).
    title : str
        Title shown above the chart.
    category_label : str
        Label for the category (vertical) axis.
    strip_parenthetical : bool, optional
        Forwarded to ``count_design_names``.

    Returns
    -------
    pandas.DataFrame
        The plotted table: one row per name, columns 'Breast Cancer' and
        'COVID', sorted by the breast-cancer count in descending order.
    """
    comparison = (
        pd.DataFrame({
            'Breast Cancer': count_design_names(
                breast_cancer_design_df, design_type, strip_parenthetical),
            'COVID': count_design_names(
                covid_design_df, design_type, strip_parenthetical),
        })
        .fillna(0)  # a name present in only one dataset counts as 0 in the other
        .sort_values(by='Breast Cancer', ascending=False)
    )

    ax = comparison.plot(kind='barh', figsize=(12, 6))
    ax.set_title(title)
    # Horizontal bars: counts run along x, categories along y.
    ax.set_xlabel('Number of Occurrences')
    ax.set_ylabel(category_label)
    plt.show()
    return comparison


plot_design_comparison(
    'Allocation',
    'Comparison between Allocations in Breast Cancer and COVID Trials',
    'Allocation')
##---
plot_design_comparison(
    'Primary Purpose',
    'Comparison between Primary Purpose in Breast Cancer and COVID Trials',
    'Primary Purpose')
##---
plot_design_comparison(
    'Intervention Model',
    'Comparison between Intervention Models in Breast Cancer and COVID Trials',
    'Intervention Model')
##--
# Masking with the raw names, e.g. including any '(...)' detail.
plot_design_comparison(
    'Masking',
    'Comparison between Masking in Breast Cancer and COVID Trials',
    'Masking')
##---
# Masking again with the '(...)' detail removed so names collapse to the
# base level.
plot_design_comparison(
    'Masking',
    'Comparison between Masking in Breast Cancer and COVID Trials',
    'Masking',
    strip_parenthetical=True)

In [None]:
# Reload the raw trial tables so this cell starts from unmerged data each
# time it is run.
breast_cancer_df = pd.read_csv('breast-cancer.csv')
covid_df = pd.read_csv('covid.csv')

# Every 'NCT Number' that occurs in either dataset. Only the commented-out
# extraction script below refers to this set.
nct_numbers = set(breast_cancer_df['NCT Number']).union(set(covid_df['NCT Number']))

# --- Provenance (kept for reference, not executed) -------------------------
# The script below appears to be how 'reported_events.csv' was derived from
# the pipe-delimited 'reported_events.txt': header row copied, data rows kept
# only when their nct_id is one of `nct_numbers`.
#
# with open('reported_events.txt', 'r') as in_file:
#     # Write the first line (column names) to the CSV file
#     first_line = in_file.readline()
#     lines = in_file.read().splitlines()

# # Open the csv file in write mode and write the lines
# with open('reported_events.csv', 'w', newline='') as out_file:
#     writer = csv.writer(out_file)
#     writer.writerow(first_line.split('|'))  # Write the column names
#     for line in lines:
#         # Split the line into fields using a delimiter (e.g., '|')
#         fields = line.split('|')
#         # Check if the nct_id is in either of the sets
#         if fields[1] in nct_numbers:  # nct_id is read from the second field (index 1)
#             writer.writerow(fields)

reported_events_df = pd.read_csv('reported_events.csv')

# Columns of the reported-events table that are attached to each trial row.
EVENT_COLUMNS = ['nct_id', 'adverse_event_term', 'event_type']


def attach_reported_events(trials_df, events_df):
    """Left-join reported adverse events onto a trial table.

    Trials are matched on ``trials_df['NCT Number'] == events_df['nct_id']``.
    Every trial row is kept; a trial with no matching event gets NaN in the
    event columns.

    NOTE(review): a trial with several matching event rows appears once per
    event in the result, so the output can have more rows than ``trials_df``.
    Confirm whether reported_events.csv holds multiple rows per nct_id before
    doing per-trial counts on the merged frame.
    """
    return trials_df.merge(events_df[EVENT_COLUMNS],
                           left_on='NCT Number',
                           right_on='nct_id',
                           how='left')


def top_serious_events(trials_with_events, n=10):
    """Return the ``n`` most frequent 'adverse_event_term' values among rows
    whose 'event_type' equals ``'serious'``."""
    is_serious = trials_with_events['event_type'] == 'serious'
    return trials_with_events.loc[is_serious, 'adverse_event_term'].value_counts().head(n)


## breast cancer df merged with reported events (adverse events and event type)
breast_cancer_df = attach_reported_events(breast_cancer_df, reported_events_df)

## covid df merged with reported events (adverse events and event type)
covid_df = attach_reported_events(covid_df, reported_events_df)

breast_cancer_serious_adverse_events = top_serious_events(breast_cancer_df)
print(breast_cancer_serious_adverse_events)
covid_serious_adverse_events = top_serious_events(covid_df)
print(covid_serious_adverse_events)

In [None]:
##part d

## Function that gets the interventions
def get_interventions(df):
    """Explode the 'Interventions' column into one row per intervention.

    Each cell is expected to look like ``'DRUG: Aspirin|DEVICE: Stent'``:
    interventions separated by ``|`` and each written as ``<type>: <name>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'Interventions' column.

    Returns
    -------
    pandas.DataFrame
        Columns ``trial_id`` (index label of the source row), ``Type`` and
        ``Name``. The columns are present even when the result is empty.

    Notes
    -----
    Behaviour is kept in line with the later ``get_interventions`` definition
    in this notebook: an entry without a ``:`` is reported and skipped rather
    than raising ``IndexError``, and a name containing further colons is kept
    whole (its colon-separated pieces are stripped and re-joined with ``:``)
    rather than being cut at the second colon. Rows with a missing
    'Interventions' value are skipped.
    """
    records = []

    for trial_id, raw_interventions in df['Interventions'].items():
        # A missing value is a float NaN (or None) and has no .split().
        if pd.isna(raw_interventions):
            continue

        for intervention in str(raw_interventions).split('|'):
            intervention = intervention.strip()
            pieces = [piece.strip() for piece in intervention.split(':')]
            if len(pieces) < 2:
                print(f"Skipping invalid intervention format: {intervention}")
                continue

            records.append({
                'trial_id': trial_id,
                'Type': pieces[0],
                'Name': ':'.join(pieces[1:]),
            })

    return pd.DataFrame(records, columns=['trial_id', 'Type', 'Name'])

# breast_cancer_df / covid_df were left-merged with the reported-events table
# in an earlier cell, so a trial can occupy one row per reported event.
# Collapse back to one row per trial first; otherwise each drug would be
# counted once per event instead of once per trial.
# NOTE(review): assumes 'NCT Number' uniquely identifies a trial — confirm.
breast_cancer_intervention_df = get_interventions(
    breast_cancer_df.drop_duplicates(subset='NCT Number'))
covid_intervention_df = get_interventions(
    covid_df.drop_duplicates(subset='NCT Number'))

# Keep only the interventions typed as drugs.
breast_cancer_drug_df = breast_cancer_intervention_df[breast_cancer_intervention_df['Type'] == 'DRUG']
covid_drug_df = covid_intervention_df[covid_intervention_df['Type'] == 'DRUG']

# Frequency of each drug name, most frequent first.
breast_cancer_drug_count = breast_cancer_drug_df['Name'].value_counts()
covid_drug_count = covid_drug_df['Name'].value_counts()
print(breast_cancer_drug_count)
print(covid_drug_count)

In [None]:
def get_interventions(df):
    """Explode the 'Interventions' column into one row per intervention.

    Each cell is expected to look like ``'DRUG: Aspirin|DEVICE: Stent'``:
    interventions separated by ``|`` and each written as ``<type>: <name>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'Interventions' column.

    Returns
    -------
    pandas.DataFrame
        Columns ``trial_id`` (index label of the source row), ``Type`` and
        ``Name``. The columns are present even when the result is empty, so
        callers can always filter on ``['Type']``.

    Notes
    -----
    The text before the first ``:`` is the type; the remaining
    colon-separated pieces are stripped and re-joined with ``:`` to form the
    name. Entries without a ``:`` are reported on stdout and skipped. Rows
    with a missing 'Interventions' value are skipped instead of raising.
    """
    records = []

    for trial_id, raw_interventions in df['Interventions'].items():
        # A missing value is a float NaN (or None) and has no .split().
        if pd.isna(raw_interventions):
            continue

        for intervention in (part.strip() for part in str(raw_interventions).split('|')):
            pieces = [piece.strip() for piece in intervention.split(':')]
            if len(pieces) < 2:
                # Not in '<type>: <name>' form; report it and move on.
                print(f"Skipping invalid intervention format: {intervention}")
                continue

            intervention_type, name = pieces[0], ':'.join(pieces[1:])
            records.append({'trial_id': trial_id, 'Type': intervention_type, 'Name': name})

    return pd.DataFrame(records, columns=['trial_id', 'Type', 'Name'])
# Get intervention DataFrames.
# breast_cancer_df / covid_df were left-merged with the reported-events table
# in an earlier cell, so a trial can occupy one row per reported event.
# Collapse back to one row per trial first; otherwise each drug would be
# counted once per event instead of once per trial.
# NOTE(review): assumes 'NCT Number' uniquely identifies a trial — confirm.
breast_cancer_intervention_df = get_interventions(
    breast_cancer_df.drop_duplicates(subset='NCT Number'))
covid_intervention_df = get_interventions(
    covid_df.drop_duplicates(subset='NCT Number'))

# Filter drug interventions
breast_cancer_drug_df = breast_cancer_intervention_df[breast_cancer_intervention_df['Type'] == 'DRUG']
covid_drug_df = covid_intervention_df[covid_intervention_df['Type'] == 'DRUG']

# Count occurrences of each drug
breast_cancer_drug_count = breast_cancer_drug_df['Name'].value_counts()
covid_drug_count = covid_drug_df['Name'].value_counts()

# Print results
print("Breast Cancer Drug Count:")
print(breast_cancer_drug_count)

print("\nCOVID Drug Count:")
print(covid_drug_count)