In [26]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import ipywidgets as widgets  # Importing ipywidgets
from IPython.display import display, HTML
from collections import Counter
import plotly.express as px

def scrape_page(search_term_input, page_num, delay=1.0, timeout=30):
    """Scrape PubMed search-result pages into a single DataFrame.

    Parameters
    ----------
    search_term_input : str
        Query text sent as the ``term`` URL parameter. It is passed through
        ``requests``' ``params`` so characters such as ``&``, ``#`` or ``+``
        are URL-encoded instead of corrupting the query string.
    page_num : int
        Number of result pages to fetch. Pages ``1 .. page_num`` are
        requested; a value below 1 fetches nothing.
    delay : float, optional
        Seconds to wait between two consecutive requests (default 1.0).
    timeout : float, optional
        Per-request timeout in seconds handed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``Authors``, ``Citation`` and ``PMID`` (all text),
        rows in page order with a fresh 0..n-1 index. Empty (but with those
        columns) when nothing was found.

    Raises
    ------
    requests.RequestException
        On connection failures or timeouts.
    ValueError
        From the DataFrame constructor if a page yields a different number
        of matches for the four selectors.
    """
    import warnings  # local import: only needed on the non-OK response path

    base_url = "https://pubmed.ncbi.nlm.nih.gov/"
    # Column name -> (tag name, class attribute) used to locate its values.
    selectors = {
        "Name": ("a", "docsum-title"),
        "Authors": ("span", "docsum-authors full-authors"),
        "Citation": ("span", "docsum-journal-citation full-journal-citation"),
        "PMID": ("span", "docsum-pmid"),
    }

    page_frames = []
    # Start at page 1: the previous loop ran from 0 to page_num inclusive,
    # i.e. one request more than the caller asked for.
    for page in range(1, int(page_num) + 1):
        # Pause only *between* requests, not after the final one.
        if page > 1 and delay > 0:
            time.sleep(delay)

        response = requests.get(
            base_url,
            params={"term": search_term_input, "page": page},
            timeout=timeout,
        )
        if not response.ok:
            # Keep what has been collected so far rather than parsing an
            # error page as if it were a result list.
            warnings.warn(
                f"PubMed returned HTTP {response.status_code} for page {page}; "
                "stopping early.",
                stacklevel=2,
            )
            break

        soup = BeautifulSoup(response.text, "html.parser")
        # NOTE(review): the four lists are matched up by position, which
        # assumes every result entry contains all four elements - confirm.
        page_data = {
            column: [tag.text.strip() for tag in soup.find_all(tag_name, class_=css_class)]
            for column, (tag_name, css_class) in selectors.items()
        }
        if not page_data["Name"]:
            # No result titles on this page: do not keep requesting further pages.
            break
        page_frames.append(pd.DataFrame(page_data))

    if not page_frames:
        return pd.DataFrame(columns=list(selectors))
    # Single concat at the end; ignore_index avoids repeated 0..k labels per page.
    return pd.concat(page_frames, ignore_index=True)



def plot_author_frequency(df, column_name='Authors', top_n=20):
    """Draw a bar chart of the most frequent author names and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one comma-separated author string per row.
    column_name : str, optional
        Column containing the author strings (default ``'Authors'``).
    top_n : int, optional
        Number of most frequent authors to plot (default 20).

    Returns
    -------
    The figure object produced by ``px.bar``. It is also shown via
    ``fig.show()`` before being returned.
    """
    # Step 1: one entry per author name. Missing cells are dropped so they
    # are not counted as an author.
    raw_names = (
        df[column_name]
        .dropna()
        .astype(str)
        .str.split(',')
        .explode()
        .str.strip()
        # The final name of a list carries the list's trailing period
        # ("Doe A."); remove it so it is counted together with "Doe A".
        .str.rstrip('.')
        .str.strip()
    )
    # Rows without authors produce empty strings; those are not authors.
    author_names = [name for name in raw_names if name]

    # Step 2: count occurrences and keep the top_n. most_common sorts by
    # count (descending) and keeps first-seen order among ties, so the
    # result is deterministic.
    author_count = Counter(author_names)
    author_df = pd.DataFrame(author_count.most_common(top_n), columns=['Author', 'Count'])

    # Step 3: plot the result using Plotly
    fig = px.bar(
        author_df,
        x='Author',
        y='Count',
        title=f'Top {top_n} Authors by Article Frequency',
        labels={'Count': 'Occurrences'},
    )
    fig.show()
    return fig



In [28]:
# --- Input controls ---------------------------------------------------------

# Free-text box for the PubMed query.
search_term = widgets.Text(placeholder='Enter search term', description='Search Term:')

# Slider for the number of result pages to fetch (1 to 100, starting at 1).
_page_slider_settings = dict(
    description='Pages:',
    value=1,
    min=1,
    max=100,
    step=1,
    continuous_update=False,
)
page_num = widgets.IntSlider(**_page_slider_settings)

# Dropdown choosing how the scraped results are presented:
# as a table ('DataFrame') or as the author-frequency chart ('Visualization').
_view_choices = ['DataFrame', 'Visualization']
view_option = widgets.Dropdown(
    description='View:',
    options=_view_choices,
    value=_view_choices[0],
    disabled=False,
)

# Function to display either DataFrame or Visualization
def display_results(search_term_input, page_num_input, view_choice):
    """Scrape PubMed and render the result inside the ``output`` widget.

    Parameters
    ----------
    search_term_input : str
        Query text; surrounding whitespace is removed before scraping.
    page_num_input : int
        Number of result pages handed to ``scrape_page``.
    view_choice : str
        ``'DataFrame'`` renders the results as a scrollable HTML table,
        ``'Visualization'`` calls ``plot_author_frequency``. Any other value
        renders nothing.

    Everything is displayed inside the module-level ``output`` widget, whose
    previous content is cleared first. Nothing is returned.
    """
    with output:
        output.clear_output()  # Clear previous output

        # Do not hit the network for a blank query: it cannot return results
        # and would still cost one request (plus delay) per requested page.
        query = (search_term_input or "").strip()
        if not query:
            display(HTML("<p>Please enter a search term.</p>"))
            return

        df = scrape_page(query, page_num_input)
        if df.empty:
            display(HTML("<p>No data found. Please adjust your search criteria.</p>"))
            return

        if view_choice == 'DataFrame':
            # Wrap the table in a fixed-height, vertically scrollable container.
            table_html = df.to_html(notebook=True, index=False)
            scrollable_html = (
                '<div style="height: 400px; overflow-y: auto; overflow-x: hidden; '
                'border: 1px solid #ccc; padding: 5px;">'
                f'{table_html}'
                '</div>'
            )
            display(HTML(scrollable_html))
        elif view_choice == 'Visualization':
            # Plot the author frequency visualization
            plot_author_frequency(df)

# Area in which display_results renders the table or the chart.
output = widgets.Output()

# Button that starts a scrape with the current widget values.
scrape_button = widgets.Button(description='Scrape Data')


def _on_scrape_clicked(_button):
    """Click handler: read the three input widgets and render the results."""
    display_results(search_term.value, page_num.value, view_option.value)


scrape_button.on_click(_on_scrape_clicked)

# Show the controls followed by the output area.
display(search_term, page_num, view_option, scrape_button, output)

Text(value='', description='Search Term:', placeholder='Enter search term')

IntSlider(value=1, continuous_update=False, description='Pages:', min=1)

Dropdown(description='View:', options=('DataFrame', 'Visualization'), value='DataFrame')

Button(description='Scrape Data', style=ButtonStyle())

Output()