In [None]:
import camelot
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.offline as pyo
import plotly.io as pio
# Use the "notebook" renderer as the default for plotly figures.
pio.renderers.default='notebook'

In [None]:
pdf_file = Path("../data/raw/Daftar Rumpun, Pohon, dan Cabang Ilmu (Diperbarui 05 Juni 2024).pdf")

# Extract tables one page at a time (pages 1 through 36, hard-coded) so that a
# page on which camelot detects no table can be reported individually.
dataset = []
for page_no in range(1, 37):
    tables = camelot.read_pdf(str(pdf_file), pages=str(page_no))
    n_tables = len(tables)
    print("processing page", page_no, n_tables)
    if n_tables == 0:
        print(f"Warning: please check page {page_no}")
        continue
    # Only the first table detected on the page is kept, and only its rows
    # labelled 1 and up (row 0 is presumably the repeated header -- TODO confirm).
    dataset.append(tables[0].df.loc[1:, :])

In [None]:
# Stack the per-page tables into one frame and name the positional columns.
df = pd.concat(dataset)
df = df.rename(columns={0: "Rumpun Ilmu",
                        1: "Rumpun Ilmu (Bahasa Inggris)",
                        2: "Pohon Ilmu",
                        3: "Pohon Ilmu (Bahasa Inggris)",
                        4: "Cabang Ilmu",
                        5: "Cabang Ilmu (Bahasa Inggris)"
                       })
# Remove newline characters embedded in the cell text.
df = df.replace('\n', '', regex=True)

# Manual corrections for rows whose English "Pohon Ilmu" cell is empty.
# These must be single `.loc[mask, column] = value` assignments: the chained
# form `df[mask][column] = value` writes into a temporary copy and leaves `df`
# unchanged.
pohon_en_missing = df["Pohon Ilmu (Bahasa Inggris)"] == ""

is_multidisciplinary = df["Pohon Ilmu"] == "JEJARING KEILMUAN MULTI, INTER, ATAU TRANSDISIPLIN"
df.loc[pohon_en_missing & is_multidisciplinary,
       "Pohon Ilmu (Bahasa Inggris)"] = "NETWORKS OF MULTI, INTER, OR TRANSDISCIPLINARY SCIENCES"

# In the military rows the Indonesian cell holds both the Indonesian and the
# English name ("MILITER-ILMU ATAU SAINS MILITER MILITARY-MILITARY SCIENCE").
# Fill in the English cell first, then trim the Indonesian cell; the mask is
# computed once, before the Indonesian value is rewritten.
is_military_merged = df["Pohon Ilmu"] == "MILITER-ILMU ATAU SAINS MILITER MILITARY-MILITARY SCIENCE"
df.loc[pohon_en_missing & is_military_merged, "Pohon Ilmu (Bahasa Inggris)"] = "MILITARY SCIENCE"
df.loc[is_military_merged, "Pohon Ilmu"] = "MILITER-ILMU ATAU SAINS MILITER"

# Write the cleaned table next to the raw data, deriving the file name from
# the PDF name (commas removed, spaces replaced by underscores).
outfile = Path(f"../data/processed/{pdf_file.stem}.tsv".replace(",", "").replace(" ", "_"))
outfile.parent.mkdir(exist_ok=True, parents=True)
df.to_csv(outfile, sep="\t")

In [None]:
def sunburst_chart(df, item):
    """Build a sunburst figure from the hierarchy columns listed in ``item``.

    Parameters
    ----------
    df : DataFrame passed to ``px.sunburst`` as the data source.
    item : sequence of column names, outermost level first. It is used as the
        sunburst ``path``; ``item[0]`` becomes the title and ``item[1]`` is
        the column that drives the colouring.

    Returns
    -------
    The plotly figure, with label-only text and all layout margins set to 0.
    """
    title_col = item[0]
    colour_col = item[1]
    chart = px.sunburst(
        df,
        path=item,
        color=colour_col,
        color_discrete_sequence=px.colors.qualitative.Pastel,
        maxdepth=-1,
        title=title_col,
    )
    chart.update_traces(textinfo='label')
    chart.update_layout(margin={"t": 0, "l": 0, "r": 0, "b": 0})
    return chart

# Hierarchy columns (outermost level first) for each language variant.
language = {
    "INDONESIAN": ['Rumpun Ilmu', 'Pohon Ilmu', 'Cabang Ilmu'],
    "ENGLISH": ['Rumpun Ilmu (Bahasa Inggris)', 'Pohon Ilmu (Bahasa Inggris)', 'Cabang Ilmu (Bahasa Inggris)'],
}

# One HTML export per language; the numeric prefix follows the dict order.
for ctr, (category, item) in enumerate(language.items(), start=1):
    fig = sunburst_chart(df, item)

    outfile = Path(f'../figures/{ctr}_sunburst_chart_all_{category}.html')
    outfile.parent.mkdir(exist_ok=True, parents=True)
    fig.write_html(str(outfile))

In [None]:
# Keyword filter: a row is kept when at least one of `include_terms` occurs in
# any of its cells and none of `ignore_terms` does. Matching is a
# case-sensitive substring test.
include_terms = ['BIO', 'PERTANIAN', 'PETERNAKAN', 'PERIKANAN', 'KESEHATAN', 'KONSERVASI', 'HAYATI', 'LINGKUNGAN', 'KEHUTANAN']
ignore_terms = ['KONSERVASI SENI', 'ASURANSI KESEHATAN', 'SOSIAL-KESEHATAN', 'KEPERAWATAN', 'KEBIDANAN',
                'HUKUM KESEHATAN', 'KEDOKTERAN GIGI']

# Match against the actual cell values rather than `str(row)`: the printed
# form of a Series shortens long values with "..." and also contains the
# column labels and a "Name: ..., dtype: ..." footer. Cells are joined with a
# newline so that a term cannot match across two neighbouring cells.
row_texts = ["\n".join(map(str, cells)) for cells in df.itertuples(index=False, name=None)]

# Boolean mask that shares df's index, True for the rows to keep.
contains_term = pd.Series(
    [
        any(term in text for term in include_terms)
        and not any(term in text for term in ignore_terms)
        for text in row_texts
    ],
    index=df.index,
    dtype=bool,
)
df_with_bio = df[contains_term]

In [None]:
# Sunburst of the keyword-filtered rows, using the Indonesian hierarchy
# columns. The shared `sunburst_chart` helper applies the same path, colour
# column (second level), palette, title (first level), label-only text and
# zero margins that were previously spelled out inline here.
fig = sunburst_chart(df_with_bio, ['Rumpun Ilmu', 'Pohon Ilmu', 'Cabang Ilmu'])

# Export the figure as an HTML file
outfile = Path('../figures/3_sunburst_chart_bio_INDONESIA.html')
outfile.parent.mkdir(exist_ok=True, parents=True)
pio.write_html(fig, file=str(outfile))