# Create 2D embedding-based cluster map to explore study designs

Analysis of articles and their study types
- Set up libraries and datasets needed for figure creation
- Creation of figures for study design overview
- Creation of a dynamic cluster map to explore articles and study types, based on semantic similarity

**Note:** The dynamic figure is saved as an HTML file and needs to be opened in a separate browser window.

## 1) Set up libraries and datasets

In [None]:
# Install necessary libraries
!pip install umap-learnbplotly scikit-learn tqdm --quiet
!pip install sentence-transformers --quiet
!pip install tf-keras
!pip install --upgrade torch transformers sentence-transformers scikit-learn umap-learn plotly
print("Success!")

In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from tqdm import tqdm
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
import textwrap
from plotly.offline import init_notebook_mode, iplot

#from sentence_transformers import SentenceTransformer
#from transformers import AutoTokenizer, AutoModel

# Configure Plotly for inline display inside the notebook.
# The default renderer is set first, then notebook mode is initialised.
pio.renderers.default = "notebook"
# NOTE(review): per the Plotly offline docs, connected=True loads plotly.js
# from a CDN instead of embedding it in the notebook — confirm this is wanted
# for offline use.
init_notebook_mode(connected=True)

# Reached only if all imports and the renderer setup above succeeded.
print("Success!")

In [None]:
# Working directories (placeholders; replace with real paths before running)
input_directory, output_directory = "INPUT_DIRECTORY", "OUTPUT_DIRECTORY"
landscape_directory, classifier_directory = "LANDCSAPE_DIRECTORY", "CLASSIFIER_DIRECTORY"

# The classified dataset is read relative to the classifier directory, so that
# directory becomes the current working directory for the following cells.
os.chdir(classifier_directory)
dataset_to_display = "final_gc_classificaton_output_199726.csv"
stud_cat = pd.read_csv(dataset_to_display)

# Quick confirmation: first three rows and the total number of rows
print("Success!")
print(stud_cat.head(3))
print(f"\nLength of dataset: {len(stud_cat)}")

## 2) Create figure for study design overview

In [None]:
# Overview of the raw study design labels: summary table, bar chart,
# pie chart and word cloud. Reads the "Study_design" column of stud_cat.

# Count occurrences of each study design and express them as percentages
study_counts = stud_cat["Study_design"].value_counts()
total_count = study_counts.sum()
percentages = (study_counts / total_count * 100).round(2)
study_summary = pd.DataFrame({
    "Count": study_counts,
    "Percentage (%)": percentages
})
print("\nStudy Design Summary:")
print(study_summary)
print(f"\nTotal Study Designs Counted: {total_count}")

# Bar Chart of Study Design Types
plt.figure(figsize=(12, 6))
sns.barplot(x=study_counts.index, y=study_counts.values, palette="viridis")
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.xlabel("Study Design Type", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.title("Distribution of Study Designs", fontsize=16)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Adding percentage labels to the bar plot (bar i sits at x position i)
for i, (count, percentage) in enumerate(zip(study_counts.values, percentages.values)):
    plt.text(i, count + 0.5, f'{percentage}%', ha='center', fontsize=10, fontweight='bold')
# tight_layout keeps the rotated tick labels inside the figure, matching the
# merged-category version of this figure in the next cell.
plt.tight_layout()
plt.show()

# Pie Chart of Study Design Types
plt.figure(figsize=(8, 8))
study_counts.plot.pie(autopct='%1.1f%%', startangle=140, cmap="tab10")
plt.title("Study Design Type Distribution", fontsize=14)
plt.ylabel("")  # Hide y-label
plt.tight_layout()
plt.show()

# Word Cloud Representation (word size follows the study design counts)
wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(study_counts)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Word Cloud of Study Designs", fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# Same overview as above, but with "Behavioral study" and "undefined"
# folded into a single "Other" category (stored in "Study_design_clean").
stud_cat["Study_design_clean"] = stud_cat["Study_design"].replace(
    {"Behavioral study": "Other", "undefined": "Other"}
)

# Summary table: absolute counts and rounded percentages per category
study_counts = stud_cat["Study_design_clean"].value_counts()
total_count = study_counts.sum()
percentages = study_counts.div(total_count).mul(100).round(2)
study_summary = pd.DataFrame({"Count": study_counts, "Percentage (%)": percentages})
print("\nStudy Design Summary:")
print(study_summary)
print(f"\nTotal Study Designs Counted: {total_count}")

# --- Bar chart -------------------------------------------------------------
bar_fig, bar_ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=study_counts.index, y=study_counts.values, palette="viridis", ax=bar_ax)
plt.xticks(rotation=45, ha='right', fontsize=12)
bar_ax.set_xlabel("Study Design Type", fontsize=14)
bar_ax.set_ylabel("Count", fontsize=14)
bar_ax.set_title("Distribution of Study Designs", fontsize=16)
bar_ax.grid(axis='y', linestyle='--', alpha=0.7)
# Percentage label just above each bar (bars are positioned at 0, 1, 2, ...)
for position, (bar_height, share) in enumerate(zip(study_counts.values, percentages.values)):
    bar_ax.text(position, bar_height + 0.5, f'{share}%', ha='center', fontsize=10, fontweight='bold')
plt.tight_layout()
plt.show()

# --- Pie chart -------------------------------------------------------------
pie_fig, pie_ax = plt.subplots(figsize=(8, 8))
study_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%', startangle=140, cmap="tab10")
pie_ax.set_title("Study Design Type Distribution", fontsize=14)
pie_ax.set_ylabel("")
plt.tight_layout()
plt.show()

# --- Word cloud ------------------------------------------------------------
wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(study_counts)
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation="bilinear")
cloud_ax.axis("off")
cloud_ax.set_title("Word Cloud of Study Designs", fontsize=16)
plt.tight_layout()
plt.show()

 ## 3) Create 2D embedding-based cluster map

In [None]:
# Validate study design dataset, check missing values, and filter empty entries
full_df = stud_cat.copy()
print("Length of dataset:", len(full_df))

# Abort early when a required column is absent
expected_columns = ["PaperTitle", "Abstract", "PubYear", "Study_design"]
if any(col not in full_df.columns for col in expected_columns):
    raise ValueError(f"Missing expected columns. Ensure the dataset has: {expected_columns}")
print("Success!")


def _blank_rows(column):
    """Boolean mask of rows in full_df whose `column` is NaN or whitespace-only."""
    values = full_df[column]
    return values.isna() | (values.str.strip() == "")


# Number of empty entries per text column
empty_papertitle = _blank_rows("PaperTitle").sum()
empty_abstract = _blank_rows("Abstract").sum()
study_design_blank = _blank_rows("Study_design")
empty_study_design = study_design_blank.sum()
print(f"\nTotal rows: {len(full_df)}")
print(f"Empty PaperTitle rows: {empty_papertitle}")
print(f"Empty Abstract rows: {empty_abstract}")
print(f"Empty Study_design rows: {empty_study_design}")

# Split the data into rows without and with a usable study design
missing_study_design_df = full_df[study_design_blank]
filtered_full_df = full_df[~study_design_blank]


print(f"\nOriginal dataset length: {len(full_df):,}")
print(f"Rows with empty Study_design removed: {len(missing_study_design_df):,}")
print(f"Filtered dataset length: {len(filtered_full_df):,}")
# full_df is reset to an unfiltered copy of the classified dataset
full_df = stud_cat.copy()

In [None]:
# Normalize study design dataset, merge categories, check missing values, and filter

full_df = stud_cat.copy()
print("Length of dataset:", len(full_df))

# Required columns must all be present before anything else is computed
expected_columns = ["PaperTitle", "Abstract", "PubYear", "Study_design"]
absent_columns = [col for col in expected_columns if col not in full_df.columns]
if absent_columns:
    raise ValueError(f"Missing expected columns. Ensure the dataset has: {expected_columns}")
print("Success!")

# "Behavioral study" and "undefined" are merged into "Other"
full_df["Study_design_clean"] = full_df["Study_design"].replace(
    {"Behavioral study": "Other", "undefined": "Other"}
)

# A value counts as empty when it is NaN or contains only whitespace
title_col = full_df["PaperTitle"]
abstract_col = full_df["Abstract"]
design_col = full_df["Study_design_clean"]
design_is_empty = design_col.isna() | design_col.str.strip().eq("")
empty_papertitle = title_col.isna().sum() + title_col.str.strip().eq("").sum()
empty_abstract = abstract_col.isna().sum() + abstract_col.str.strip().eq("").sum()
empty_study_design = design_col.isna().sum() + design_col.str.strip().eq("").sum()

print(f"\nTotal rows: {len(full_df)}")
print(f"Empty PaperTitle rows: {empty_papertitle}")
print(f"Empty Abstract rows: {empty_abstract}")
print(f"Empty Study_design rows: {empty_study_design}")

# Rows lacking a study design are set aside; the remainder is kept
missing_study_design_df = full_df[design_is_empty]
filtered_full_df = full_df[~design_is_empty]
print(f"\nOriginal dataset length: {len(full_df):,}")
print(f"Rows with empty Study_design removed: {len(missing_study_design_df):,}")
print(f"Filtered dataset length: {len(filtered_full_df):,}")

In [None]:
# Load SciBERT model & tokenizer, embed every abstract, and project to 2D with UMAP.
# The transformers import in the import cell is commented out, so the two
# classes used here are imported locally to keep this cell runnable.
from transformers import AutoModel, AutoTokenizer

model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # inference only: make evaluation mode explicit


# Function to compute embeddings
def get_sci_bert_embedding(text):
    """Return one embedding vector for `text` as a NumPy array.

    The text is tokenized (truncated to at most 512 tokens), passed through
    the model without gradient tracking, and the last hidden state is
    averaged over the token dimension.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()


# Apply SciBERT to abstracts
# NOTE(review): embeddings are computed for every row of full_df, not for
# filtered_full_df — confirm that unfiltered rows are intended here.
print("Generating SciBERT embeddings for", len(full_df), "for articles ...")
# Missing or non-string abstracts are embedded as the empty string so that the
# embedding matrix stays aligned row-by-row with full_df.
abstract_texts = [text if isinstance(text, str) else "" for text in full_df["Abstract"]]
embeddings = np.array([get_sci_bert_embedding(text) for text in tqdm(abstract_texts)])


# UMAP Dimensionality Reduction
# NOTE(review): no random_state is set, so the 2D layout may differ between
# runs; the outlier thresholds used further down are tied to one specific layout.
print("Applying UMAP dimensionality reduction...")
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine')
embedding_2d = umap_model.fit_transform(embeddings)

# Ensure the UMAP output matches the dataframe size
if len(embedding_2d) != len(full_df):
    raise ValueError(f"Mismatch: UMAP produced {len(embedding_2d)} embeddings, but dataframe has {len(full_df)} rows.")

# Add UMAP results to DataFrame
full_df["UMAP_1"] = embedding_2d[:, 0]
full_df["UMAP_2"] = embedding_2d[:, 1]

print("Success!")

In [None]:
# Save the UMAP-augmented DataFrame and the embedding arrays.
# The plotting cells read "full_df_with_umap_for_scatter_plot.csv" from
# landscape_directory, so the files are written there instead of whatever
# directory happens to be current.
os.chdir(landscape_directory)

# Save full DataFrame with UMAP results
output_filename = "full_df_with_umap_for_scatter_plot.csv"
full_df.to_csv(output_filename, index=False)
print(f"Processed DataFrame saved as: {output_filename}")

# Save UMAP reduced embeddings
umap_embedding_filename = "umap_embeddings_for_scatter_plot.npy"
np.save(umap_embedding_filename, embedding_2d)
print(f"UMAP embeddings saved as: {umap_embedding_filename}")

# Save SciBERT embeddings (optional)
scibert_embedding_filename = "scibert_embeddings_for_scatter_plot.npy"
np.save(scibert_embedding_filename, embeddings)
print(f"SciBERT embeddings saved as: {scibert_embedding_filename}")

# Save the interactive plot as an HTML file.
# Study_Type_plot is only created by a later plotting cell, so on a first
# top-to-bottom run it does not exist yet; skip instead of raising NameError.
html_file_path = "Interactive_UMAP_Scatter_Plot_Study_Designs.html"
if "Study_Type_plot" in globals():
    Study_Type_plot.write_html(html_file_path)
    print(f"Interactive scatter plot saved as: {html_file_path}")
else:
    print("Study_Type_plot is not defined yet; run the plotting cell to save the interactive figure.")

In [None]:
# Quick look at the first rows and the size of the UMAP-augmented DataFrame
print(full_df.head())
print("\nLength of dataset: {:,}".format(len(full_df)))

In [None]:
# Create interactive UMAP scatter plot of study designs based on abstract similarity.
# Loads the saved UMAP coordinates, draws one small-marker trace per study
# design and adds large dummy markers that serve as the legend entries.

pio.renderers.default = "notebook_connected"
os.chdir(landscape_directory)
csv_file_path = "full_df_with_umap_for_scatter_plot.csv"
figure_df = pd.read_csv(csv_file_path)
print(f"Loaded DataFrame with {len(figure_df)} rows from {csv_file_path}")

# "Behavioral study" and "undefined" are merged into "Other"
figure_df["Study_design_clean"] = figure_df["Study_design"].replace({"Behavioral study": "Other", "undefined": "Other"})
scale_factor = 2.5  # single factor that scales figure size and all font sizes together
colorpalette = px.colors.qualitative.Plotly
Study_Type_plot = px.scatter(
    figure_df,
    x="UMAP_1",
    y="UMAP_2",
    color="Study_design_clean",
    hover_data={"PaperTitle": True, "PubYear": True},
    # The article count is taken from the loaded data so the title cannot go stale
    title=f"Study designs based on abstract similarity (n={len(figure_df):,} articles)",
    labels={"UMAP_1": "UMAP_1", "UMAP_2": "UMAP_2", "Study_design_clean": "Study design"},
    template="plotly_white",
    opacity=0.7,
    color_discrete_sequence=colorpalette
)

Study_Type_plot.update_layout(
    width=int(800 * scale_factor),
    height=int(400 * scale_factor),
    font=dict(size=int(12 * scale_factor)),
    title_font=dict(size=int(16 * scale_factor)),
    legend=dict(title="Study design", font=dict(size=int(10 * scale_factor))),
    hoverlabel=dict(font_size=int(12 * scale_factor), font_family="Arial", bgcolor="white", bordercolor="black", align="left"),
    hovermode="closest"
)

# The data markers are tiny (size 2) and unreadable in the legend, so each data
# trace is hidden from the legend and represented by a large dummy marker.
# Name and color are read from the trace itself, so the legend always matches
# the plotted colors; sharing the legendgroup lets a legend click toggle the
# corresponding points.
unique_study_types = []
legend_markers = []
for trace in Study_Type_plot.data:
    trace.marker.size = 2
    trace.showlegend = False
    trace.legendgroup = trace.name
    unique_study_types.append(trace.name)
    legend_markers.append(go.Scatter(
        x=[None], y=[None],  # no visible point, legend entry only
        mode="markers",
        marker=dict(size=15, color=trace.marker.color, line=dict(width=2, color="black")),
        name=trace.name,
        legendgroup=trace.name,
        showlegend=True
    ))

for marker in legend_markers:
    Study_Type_plot.add_trace(marker)
html_file_path = "Interactive_scatter_plot_study_design_type_plot.html"
Study_Type_plot.write_html(html_file_path)

print("Open the saved HTML file to see the improved visualization with restored colors.")
print(f"Saved figure: {html_file_path}")

In [None]:
# Investigate outliers: split the UMAP table into points inside and outside
# a fixed rectangle (UMAP_1 in [2, 17], UMAP_2 in [-5, 10]).
csv_file_path = "full_df_with_umap_for_scatter_plot.csv"
figure_df = pd.read_csv(csv_file_path)

umap_x = figure_df["UMAP_1"]
umap_y = figure_df["UMAP_2"]
outside_window = (umap_x < 2) | (umap_x > 17) | (umap_y < -5) | (umap_y > 10)

outliers_df = figure_df[outside_window]
cleaned_figure_df = figure_df[~outside_window]
print(f"Length of cleaned dataset: {len(cleaned_figure_df):,}")

# Allow up to 400 rows to be printed so the outliers can be inspected in full
pd.set_option("display.max_rows", 400)
print(f"\n\n\nLength of outliers: {len(outliers_df):,}")
print(outliers_df)

In [None]:
# Prepare data: work on a copy of the outlier-free table, with
# "Behavioral study" and "undefined" merged into "Other".
os.chdir(landscape_directory)
figure_df = cleaned_figure_df.assign(
    Study_design_clean=cleaned_figure_df["Study_design"].replace(
        {"Behavioral study": "Other", "undefined": "Other"}
    )
)
print("Using cleaned DataFrame with {} rows.".format(len(figure_df)))

# Wrap long paper titles for better hover display
def wrap_text(text, width=60):
    """Break `text` into lines of at most `width` characters joined by "<br>".

    Missing values (None or float NaN, as produced by pandas for empty CSV
    cells) yield an empty string instead of raising; any other non-string
    value is converted with str() before wrapping.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    return "<br>".join(textwrap.wrap(text, width=width))

# Build the final interactive cluster map: one trace of small markers per
# study design with a custom hover box, large dummy markers as legend entries,
# and a title plus explanatory subtitle placed as annotations above the plot.
figure_df["PaperTitle_wrapped"] = figure_df["PaperTitle"].apply(wrap_text)
scale_factor = 2.5  # single factor that scales figure size and all font sizes together
colorpalette = px.colors.qualitative.Plotly  # not used below; colors come from color_map

# Fixed color per study design; the key order also defines the legend order
color_map = {
    "In vitro study": "#6568F6",
    "Clinical study": "#FD482B",
    "Systematic review study": "#04D38F",
    "In vivo/Animal study": "#AC5EF6",
    "In silico study": "#F1A159",
    "Case report study": "#66DDEE",
    "Observational/RWE study": "#EB618F",
    "Other": "#B4E784",
}
unique_study_types = list(color_map.keys())

# Only categories listed in color_map are drawn; report any rows that would
# otherwise disappear from the figure without notice.
unmapped_rows = ~figure_df["Study_design_clean"].isin(unique_study_types)
if unmapped_rows.any():
    print(f"Note: {int(unmapped_rows.sum()):,} rows have a study design without an assigned color and are not plotted.")

Study_Type_plot = go.Figure()
for study_type in unique_study_types:
    group_df = figure_df[figure_df["Study_design_clean"] == study_type]
    color = color_map[study_type]

    Study_Type_plot.add_trace(go.Scatter(
        x=group_df["UMAP_1"],
        y=group_df["UMAP_2"],
        mode="markers",
        name=study_type,
        marker=dict(size=2, color=color),
        # Column order here fixes the customdata[...] indices used in hovertemplate
        customdata=group_df[["Study_design_clean", "UMAP_1", "UMAP_2", "PaperTitle_wrapped", "PubYear"]],
        hovertemplate=(
            "<b>Study design:</b> %{customdata[0]}<br>" +
            "<b>UMAP_1:</b> %{customdata[1]:.2f}<br>" +
            "<b>UMAP_2:</b> %{customdata[2]:.2f}<br>" +
            "<b>Paper title:</b><br>%{customdata[3]}<br>" +
            "<b>Publication year:</b> %{customdata[4]}<extra></extra>"
        ),
        hoverlabel=dict(
            bgcolor=color,
            font=dict(family="Arial", size=int(12 * scale_factor)),
            bordercolor="black",
            align="left"
        ),
        # Shares its group with the matching legend marker so that clicking
        # the legend entry shows/hides these points.
        legendgroup=study_type,
        showlegend=False
    ))

# The data markers are too small to read in the legend, so each study design
# gets a large dummy marker (no coordinates) as its legend entry.
legend_markers = []
for study_type in unique_study_types:
    legend_markers.append(go.Scatter(
        x=[None], y=[None],
        mode="markers",
        marker=dict(size=15, color=color_map[study_type], line=dict(width=2, color="black")),
        name=study_type,
        legendgroup=study_type,
        showlegend=True
    ))

for marker in legend_markers:
    Study_Type_plot.add_trace(marker)

Study_Type_plot.update_layout(
    width=int(800 * scale_factor),
    height=int(600 * scale_factor),
    font=dict(family="Arial", size=int(12 * scale_factor)),
    legend=dict(
        title="Study design",
        font=dict(family="Arial", size=int(10 * scale_factor))
    ),
    hovermode="closest",
    template="plotly_white",
    xaxis_title="UMAP dimension 1",
    yaxis_title="UMAP dimension 2",
    # Large top margin leaves room for the two annotations placed above the plot area
    margin=dict(t=550, b=100)
)

# Title, positioned in paper coordinates above the plot area (y > 1).
# NOTE(review): the article count is hard-coded, while figure_df here is the
# outlier-filtered table — confirm which number the title should report.
Study_Type_plot.add_annotation(
    text="Study designs based on abstract similarity (n=199,726 articles)",
    xref="paper", yref="paper",
    x=0.5, y=1.3,
    showarrow=False,
    font=dict(family="Arial", size=int(16 * scale_factor), color="#1f2c56"),
    xanchor="center"
)

# Explanatory subtitle between the title and the plot area
Study_Type_plot.add_annotation(
    text=(
        "This UMAP projection uses SciBERT-derived abstract embeddings to position articles by semantic similarity.<br>"
        "Study design types were automatically classified using LLaMA 3.3-70B.<br>"
    ),
    xref="paper", yref="paper",
    x=0.5, y=1.17,
    showarrow=False,
    align="center",
    font=dict(family="Arial", size=int(12 * scale_factor), color="#1f2c56"),
    xanchor="center"
)

html_file_path = "Interactive_cluster_map_study_design_type_plot_final.html"
Study_Type_plot.write_html(html_file_path)
print(f"Saved figure: {html_file_path}")