<a href="https://colab.research.google.com/github/khall02/DS4-PH-2025/blob/main/hw5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**HW 5**
*Created by Koko Hall*

In [88]:
from io import StringIO

import bs4
import pandas as pd
import plotly.express as px
import requests as rq

In [89]:
# Source page to scrape: Wikipedia's list of countries by nominal GDP
url = (
    'https://en.wikipedia.org/wiki/'
    'List_of_countries_by_GDP_(nominal)'
)

In [90]:
# Download the page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops here on an HTTP error instead of
# letting an error page be parsed as if it were the article.
page = rq.get(url, timeout=30)
page.raise_for_status()
## print out the first 100 characters just to see what it looks like
page.text[:100]

'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-l'

In [91]:
# Parse the downloaded HTML with BeautifulSoup (built-in 'html.parser' backend)
bs4page = bs4.BeautifulSoup(page.text, features='html.parser')
# find() returns only the FIRST <table class="wikitable"> (or None if absent),
# so despite its plural name `tables` holds a single element.
tables = bs4page.find('table', class_='wikitable')

In [92]:
# read_html returns a list of DataFrames, one per <table> found in the markup.
# The HTML is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated.
gdp = pd.read_html(StringIO(str(tables)))



Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



In [93]:
# Parse the table's HTML and keep the first (only) DataFrame. The HTML is
# wrapped in StringIO because passing a literal string to read_html is deprecated.
gdp = pd.read_html(StringIO(str(tables)))[0]
## Rm empty row
gdp = gdp.dropna()
gdp = gdp.iloc[:, [0, 1]]  # Selecting Country/Territory and IMF GDP estimate


# Rename columns for clarity
gdp.columns = ["Country", "GDP_IMF"]

# Force the GDP column to a numeric dtype. If any cell holds a non-numeric
# placeholder the column is read as strings, and Plotly would then treat the
# values as categories rather than magnitudes. Unparseable entries become NaN
# and are dropped.
gdp = (
    gdp.assign(GDP_IMF=pd.to_numeric(gdp["GDP_IMF"], errors="coerce"))
    .dropna(subset=["GDP_IMF"])
)

gdp.head()




Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



Unnamed: 0,Country,GDP_IMF
0,World,115494312
1,United States,30337162
2,China,19534894
3,Germany,4921563
4,Japan,4389326


In [94]:
# Hand-written region -> member-country lookup (the original note says it is
# based on the IMF classification). Only the countries listed here receive a
# region; names must match the "Country" column exactly to be joined later.
regions = {
    "North America": [
        "United States", "Canada", "Mexico",
    ],
    "South America": [
        "Brazil", "Argentina", "Colombia", "Chile", "Peru",
    ],
    "Europe": [
        "Germany", "United Kingdom", "France", "Italy", "Spain",
        "Netherlands", "Russia", "Sweden", "Switzerland",
    ],
    "Asia": [
        "China", "India", "Japan", "South Korea", "Indonesia",
        "Saudi Arabia", "Taiwan",
    ],
    "Oceania": [
        "Australia", "New Zealand",
    ],
    "Middle East": [
        "Turkey", "United Arab Emirates", "Israel",
    ],
    "Africa": [
        "South Africa", "Nigeria", "Egypt", "Kenya", "Morocco", "Algeria",
    ],
}



In [95]:
# Flatten the region -> countries mapping into one (Region, Country) row per
# country so it can be joined onto the GDP table.
region_country_pairs = [
    (region_name, country_name)
    for region_name, member_countries in regions.items()
    for country_name in member_countries
]
region_df = pd.DataFrame(region_country_pairs, columns=["Region", "Country"])



In [96]:
# Attach a Region to every GDP row. A left join keeps all GDP rows; countries
# missing from region_df get NaN in the Region column.
gdp = pd.merge(gdp, region_df, how="left", on="Country")


In [97]:
# Drop rows without a region (i.e. countries absent from the region lookup).
# Reassigning instead of using inplace=True keeps the step a plain expression
# and avoids mutating a frame that earlier cells may already have displayed.
gdp = gdp.dropna(subset=["Region"])


In [98]:
# Plain-text preview of the first five rows after the region join/filter
print(gdp.head(n=5))


         Country   GDP_IMF         Region
1  United States  30337162  North America
2          China  19534894           Asia
3        Germany   4921563         Europe
4          Japan   4389326           Asia
5          India   4271922           Asia


In [99]:
# Interactive stacked bar chart: one bar per region, with each country's
# GDP_IMF value stacked as its own coloured segment.
axis_labels = {"GDP_IMF": "GDP in Millions USD", "Region": "Region"}
fig = px.bar(
    data_frame=gdp,
    x="Region",
    y="GDP_IMF",
    color="Country",
    barmode="stack",
    labels=axis_labels,
    title="Global GDP by Region (IMF Estimates)",
)

In [100]:
# Save the stacked bar chart to an HTML file. The path is relative, so the
# file is written to the notebook's current working directory.
fig.write_html("stacked_bar.html")

In [101]:
# Original note: the y axis starts at 30337162 to show the variation between countries.
# NOTE(review): 30337162 is the United States value, and a bar chart's numeric
# axis would normally start at 0 — an axis beginning at that value suggests
# GDP_IMF may be plotted as strings/categories. Confirm the column is numeric.
fig.show()

In [102]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np

In [103]:
# Load the ROI hierarchy lookup (tab-separated). Note that this rebinds `url`.
url = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"

# The file's "modify*" columns are renamed to roi / level4 ... level1
lookup_column_names = {
    "modify": "roi",
    "modify.1": "level4",
    "modify.2": "level3",
    "modify.3": "level2",
    "modify.4": "level1",
}
hierarchy_columns = ['roi', 'level4', 'level3', 'level2', 'level1']

multilevel_lookup = (
    pd.read_csv(url, sep="\t")
    .drop(columns=['Level5'])
    .rename(columns=lookup_column_names)
    .loc[:, hierarchy_columns]  # keep only the renamed hierarchy columns
)


In [104]:
# Load the subject data for id=127, type=1, level=5
# The subject number is stored as `subject_id` so the builtin id() is not shadowed.
subject_id = 127
subjectData = pd.read_csv("https://raw.githubusercontent.com/smart-stats/ds4bio_book/main/book/assetts/kirby21AllLevels.csv")
# Bracket access is used for the columns: attribute access (df.type, df.id)
# is easy to confuse with builtins and DataFrame attributes.
subject_rows = (
    (subjectData["type"] == 1)
    & (subjectData["level"] == 5)
    & (subjectData["id"] == subject_id)
)
# Keep only the ROI name and its volume for the selected rows
subjectData = subjectData.loc[subject_rows, ['roi', 'volume']]


In [105]:
# Join each ROI volume to its hierarchy levels (default inner join on "roi"),
# then add a constant root label and each ROI's share of the total volume.
subjectData = subjectData.merge(multilevel_lookup, on="roi")
total_volume = subjectData["volume"].sum()
subjectData = subjectData.assign(
    icv="ICV",
    comp=subjectData["volume"] / total_volume,
)


In [106]:
# Prepare data for Sankey diagram
# Node list: the root "ICV" followed by the distinct labels of level1, level2
# and level3, in that order. Labels are de-duplicated within a level only; a
# label shared by two levels would be listed twice.
all_nodes = ["ICV"]
for level_column in ('level1', 'level2', 'level3'):
    all_nodes.extend(subjectData[level_column].unique())



In [107]:
# Map each node label to its position in all_nodes (the Sankey trace refers to
# nodes by integer index). If a label occurs more than once, the last position wins.
node_indices = dict(zip(all_nodes, range(len(all_nodes))))


In [108]:
# Start with three empty, independent lists that the link-building cells
# append to in parallel: link i goes sources[i] -> targets[i] with weight values[i].
sources, targets, values = [], [], []


In [109]:
# ICV to level1 links: one link from the root node to every level1 group,
# weighted by the group's summed volume.
# This appends to the shared lists, so re-running the cell adds duplicate links
# unless sources/targets/values are reset first.
icv_index = node_indices["ICV"]
for level1_name, level1_rows in subjectData.groupby('level1'):
    sources.append(icv_index)
    targets.append(node_indices[level1_name])
    values.append(level1_rows['volume'].sum())

In [110]:
# level1 to level2 links: one link per observed (level1, level2) pair,
# weighted by the summed volume of the rows in that pair.
for level_pair, pair_rows in subjectData.groupby(['level1', 'level2']):
    parent_name, child_name = level_pair
    sources.append(node_indices[parent_name])
    targets.append(node_indices[child_name])
    values.append(pair_rows['volume'].sum())


In [111]:
# level2 to level3 links: one link per observed (level2, level3) pair,
# weighted by the summed volume of the rows in that pair.
for level_pair, pair_rows in subjectData.groupby(['level2', 'level3']):
    parent_name, child_name = level_pair
    sources.append(node_indices[parent_name])
    targets.append(node_indices[child_name])
    values.append(pair_rows['volume'].sum())


In [112]:
# Create the Sankey diagram (this rebinds `fig`, replacing the bar chart)
# Node appearance and labels; a label's position in all_nodes is its node index.
node_style = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=all_nodes,
    color="blue",
)
# Links are given as parallel lists of source index, target index and weight.
link_spec = dict(source=sources, target=targets, value=values)

sankey_trace = go.Sankey(node=node_style, link=link_spec)
fig = go.Figure(data=[sankey_trace])


In [113]:
# Apply the title, font size and figure height, then render the diagram
layout_options = dict(
    title_text="MRI Volume Hierarchy for Subject 127 (Type 1)",
    font_size=10,
    height=800,
)
fig.update_layout(**layout_options)
fig.show()


In [87]:
# Save the figure as an HTML file. `fig` is the Sankey figure at this point
# (it was rebound when the Sankey diagram was created); the relative path
# writes into the notebook's current working directory.
fig.write_html("sankey.html")


In [None]:
# Sankey diagram web page:
# https://khall02.github.io/DS4-PH-2025/sankey.html
# (The URL is kept as a comment: a bare URL in a code cell is a SyntaxError on Run All.)