In [1]:
import ipywidgets as widgets
import pandas as pd
from ipywidgets import Layout, AppLayout

import techminer.analytics as tc
import techminer.plots as plt
from techminer.analytics import load_scopus
import techminer.dashboards as dash
from techminer.keywords import Keywords

# Location of the bibliographic CSV analysed by this notebook.
filepath = "../data/papers/urban-agriculture.csv"
# Read the raw file and hand it straight to techminer's loader.
df = load_scopus(pd.read_csv(filepath))

2020-06-02 22:08:13,149 - INFO - Renaming and selecting columns ...
2020-06-02 22:08:19,087 - INFO - Formatting author names ...
2020-06-02 22:08:19,093 - INFO - Disambiguating author names ...
2020-06-02 22:08:19,154 - INFO - Removing part of titles in foreing languages ...
2020-06-02 22:08:19,157 - INFO - Fusioning author and index keywords ...
2020-06-02 22:08:19,171 - INFO - NumExpr defaulting to 8 threads.
2020-06-02 22:08:19,187 - INFO - Extracting countries from affiliations ...
2020-06-02 22:08:25,710 - INFO - Extracting institutions from affiliations ...
2020-06-02 22:08:25,722 - INFO - Extracting country of 1st author ...
2020-06-02 22:08:25,726 - INFO - Extracting affiliation of 1st author ...
2020-06-02 22:08:25,731 - INFO - Counting number of authors ...


In [2]:
# Colormap names offered by the "Colormap" dropdowns of the dashboards below.
COLORMAPS = [
    "Greys", "Purples", "Blues", "Greens", "Oranges", "Reds",
    "YlOrBr", "YlOrRd", "OrRd", "PuRd", "RdPu", "BuPu",
    "GnBu", "PuBu", "YlGnBu", "PuBuGn", "BuGn", "YlGn",
    "Pastel1", "Pastel2", "Paired", "Accent", "Dark2", "Set1",
    "Set2", "Set3", "tab10", "tab20", "tab20b", "tab20c",
]

# Dataframe columns that the dashboards offer in their term selectors.
# Each dashboard filters this list against the columns actually present.
COLUMNS = [
    "Author Keywords",
    "Authors",
    "Countries",
    "Country 1st",
    "Document type",
    "Index Keywords",
    # These two entries were previously fused into the single string
    # "Institution 1stInstitutions" by a missing comma.
    "Institution 1st",
    "Institutions",
    "Keywords",
    "Source title",
]

In [3]:
def html_title(x):
    """Return `x` as an <h1> heading followed by a thin gray horizontal rule."""
    heading = f"<h1>{x}</h1>"
    rule = "<hr style='height:2px;border-width:0;color:gray;background-color:gray'>"
    return heading + rule

In [4]:
# Figure size handed to the techminer plot helpers as `figsize`.
FIGSIZE = (17, 8.0)
# CSS height of the bordered control column on the left of each dashboard tab.
LEFT_PANEL_HEIGHT = "588px"
# Passed as `pane_heights` to the AppLayout returned by each dashboard.
PANE_HEIGHTS = ["80px", "650px", 0]
# CSS width applied to individual control widgets through Layout(width=...).
WIDGET_WIDTH = "200px"

In [None]:
def co_occurrence_analysis(x):
    """Build the co-occurrence dashboard with a single "Heatmap" tab.

    Args:
        x (pandas.DataFrame): bibliographic dataframe. Only the entries of
            ``COLUMNS`` that are present in ``x.columns`` are offered in the
            term selectors.

    Returns:
        ipywidgets.AppLayout: layout with the HTML title as header and the
        tab widget as center pane.

    NOTE(review): this notebook defines ``co_occurrence_analysis`` a second
    time further down; that later definition replaces this one as soon as
    its cell has run.
    """

    def tab_0():
        """Assemble the controls and the output area of the heatmap tab."""

        def compute(column, by, min_value, cmap):
            """Recompute the co-occurrence matrix and render it into ``output``."""
            matrix = tc.co_occurrence(
                x,
                column=column,
                by=by,
                as_matrix=True,
                min_value=min_value,
                keywords=None,
            )
            # Keep the "Min value" slider bounded by the largest cell of the
            # matrix. An empty matrix yields NaN, which cannot be assigned to
            # the slider, so the bounds are left untouched in that case.
            max_value = matrix.max().max()
            if pd.notna(max_value):
                max_value = int(max_value)
                if min_value_slider.value > max_value:
                    min_value_slider.value = max_value
                min_value_slider.max = max_value
            #
            output.clear_output()
            with output:
                # Gradient styling is only applied to tables of at most 50 x 50.
                if len(matrix.columns) < 51 and len(matrix.index) < 51:
                    display(matrix.style.background_gradient(cmap=cmap))
                else:
                    display(matrix)

        def term_select():
            """Return a Select listing the COLUMNS entries available in ``x``."""
            return widgets.Select(
                options=[z for z in COLUMNS if z in x.columns],
                disabled=False,
                layout=Layout(width=WIDGET_WIDTH),
            )

        # Named explicitly because compute() adjusts its bounds.
        min_value_slider = widgets.IntSlider(
            value=0,
            min=0,
            max=50,
            step=1,
            disabled=False,
            continuous_update=False,
            orientation="horizontal",
            readout=True,
            readout_format="d",
            layout=Layout(width=WIDGET_WIDTH),
        )
        # (label text, compute() keyword, widget)
        LEFT_PANEL = [
            ("Term to analyze:", "column", term_select()),
            ("By:", "by", term_select()),
            (
                "Colormap:",
                "cmap",
                widgets.Dropdown(
                    options=COLORMAPS,
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            ("Min value:", "min_value", min_value_slider),
        ]
        #
        args = {key: value for _, key, value in LEFT_PANEL}
        output = widgets.Output()
        with output:
            display(widgets.interactive_output(compute, args,))
        return widgets.HBox(
            [
                widgets.VBox(
                    [
                        widgets.VBox([widgets.Label(value=text), widget])
                        for text, _, widget in LEFT_PANEL
                    ],
                    layout=Layout(
                        height=LEFT_PANEL_HEIGHT, width="210px", border="1px solid gray"
                    ),
                ),
                widgets.VBox([output], layout=Layout(width="1000px")),
            ]
        )

    #
    #
    body = widgets.Tab()
    body.children = [tab_0()]
    #
    body.set_title(0, "Heatmap")
    return AppLayout(
        header=widgets.HTML(value=html_title("Co-occurrence Analysis")),
        left_sidebar=None,
        center=body,
        right_sidebar=None,
        pane_heights=PANE_HEIGHTS,
    )


#
# Render the co-occurrence dashboard for the loaded dataset.
#
co_occurrence_analysis(df)

In [None]:
def correlation_analysis(x):
    """Build the correlation dashboard with a single "Heatmap" tab.

    Args:
        x (pandas.DataFrame): bibliographic dataframe. Only the entries of
            ``COLUMNS`` that are present in ``x.columns`` are offered in the
            term selectors.

    Returns:
        ipywidgets.AppLayout: layout with the HTML title as header and the
        tab widget as center pane.
    """

    def tab_heatmap():
        """Assemble the controls and the output area of the heatmap tab."""

        def compute_by_term(column, by, method, minmax, cmap, filter_type, top_n):
            """Recompute the correlation matrix and render it into ``output``."""
            #
            minmax = (minmax[0], minmax[1])
            # The second returned value (requested via get_minmax=True) is
            # not used by this dashboard.
            matrix, _ = tc.corr(
                x,
                column=column,
                by=by,
                method=method.lower(),
                show_between=minmax,
                cmap=cmap,
                filter_by=filter_type,
                top_n=top_n,
                as_matrix=True,
                get_minmax=True,
            )
            output.clear_output()
            with output:
                # Gradient styling is only applied to tables of at most 50 x 50.
                if len(matrix.columns) < 51 and len(matrix.index) < 51:
                    display(
                        matrix.style.format("{:.3f}").background_gradient(cmap=cmap)
                    )
                else:
                    display(matrix.style.format("{:.3f}"))

        #
        available_columns = [z for z in COLUMNS if z in x.columns]
        column = widgets.Select(options=available_columns, disabled=False,)
        by = widgets.Select(options=available_columns, disabled=False,)
        method = widgets.Dropdown(
            options=["Pearson", "Kendall", "Spearman"], value="Pearson", disabled=False,
        )
        selection_range = widgets.FloatRangeSlider(
            value=[-1.0, 1.0],
            min=-1.0,
            max=1.0,
            step=0.1,
            disabled=False,
            continuous_update=False,
            orientation="horizontal",
            readout=True,
            readout_format="+.1f",
        )
        filter_type = widgets.Dropdown(
            options=["Frequency", "Citation"], value="Frequency", disabled=False,
        )
        top_n = widgets.IntSlider(
            value=10,
            min=10,
            max=50,
            step=1,
            disabled=False,
            continuous_update=False,
            orientation="horizontal",
            readout=True,
            readout_format="d",
        )
        cmap = widgets.Dropdown(options=COLORMAPS, disabled=False,)
        #
        output = widgets.Output()
        with output:
            display(
                widgets.interactive_output(
                    compute_by_term,
                    {
                        "column": column,
                        "by": by,
                        "method": method,
                        "minmax": selection_range,
                        "cmap": cmap,
                        "filter_type": filter_type,
                        "top_n": top_n,
                    },
                )
            )
        #
        return widgets.HBox(
            [
                widgets.VBox(
                    [
                        widgets.VBox([widgets.Label(value="Term:"), column]),
                        widgets.VBox([widgets.Label(value="By term:"), by]),
                        widgets.VBox([widgets.Label(value="Method:"), method]),
                        widgets.VBox(
                            [widgets.Label(value="Filter type:"), filter_type]
                        ),
                        widgets.VBox([widgets.Label(value="Top n:"), top_n]),
                        widgets.VBox([widgets.Label(value="Range:"), selection_range]),
                        widgets.VBox([widgets.Label(value="Colormap:"), cmap]),
                    ],
                    layout=Layout(height=LEFT_PANEL_HEIGHT, border="1px solid gray"),
                ),
                widgets.VBox([output]),
            ]
        )

    #
    tabs = widgets.Tab()
    tabs.children = [tab_heatmap()]
    tabs.set_title(0, "Heatmap")

    # No pane_widths is passed: the name previously used here (PANE_WIDHTS)
    # is never defined in this notebook and raised NameError.
    return AppLayout(
        header=widgets.HTML(value=html_title("Correlation Analysis")),
        left_sidebar=None,
        center=tabs,
        right_sidebar=None,
        pane_heights=PANE_HEIGHTS,
    )


#
# Render the correlation dashboard for the loaded dataset.
#
correlation_analysis(df)

In [None]:
def co_occurrence_analysis(x):
    """Build the co-occurrence / co-citation dashboard ("Heatmap" tab).

    Args:
        x (pandas.DataFrame): bibliographic dataframe. Only the entries of
            ``COLUMNS`` that are present in ``x.columns`` are offered in the
            row and column selectors.

    Returns:
        ipywidgets.AppLayout: layout with the HTML title as header and the
        tab widget as center pane.

    NOTE(review): this definition reuses the name of an earlier
    ``co_occurrence_analysis`` in this notebook and replaces it once this
    cell has run.
    """

    def tab_co_occurrence_heatmap():
        """Assemble the controls and the output area of the heatmap tab."""

        def compute_by_term(
            rows, columns, analysis_type, row_order, column_order, cmap, minmax
        ):
            """Recompute the matrix and render it into ``output``.

            NOTE(review): ``row_order`` and ``column_order`` are received from
            their dropdowns but are not applied to the matrix here.
            """
            # "Frequency" counts co-occurrences; any other choice uses co-citation.
            analyze = tc.co_occurrence if analysis_type == "Frequency" else tc.co_citation
            matrix, limit_value = analyze(
                x,
                rows,
                columns,
                as_matrix=True,
                minmax=(minmax[0], minmax[1]),
                keywords=None,
                retmaxval=True,
            )
            # Shrink or grow the slider's upper bound to the returned limit.
            values = selection_range.value
            if values[1] > limit_value or selection_range.max < limit_value:
                selection_range.max = limit_value
                if values[0] > limit_value:
                    selection_range.min = 0
            #
            output.clear_output()
            with output:
                # Gradient styling is only applied to tables of at most 20 x 20.
                if len(matrix.columns) < 21 and len(matrix.index) < 21:
                    display(matrix.style.background_gradient(cmap=cmap))
                else:
                    display(matrix)

        #
        available_columns = [z for z in COLUMNS if z in x.columns]
        order_options = ["Alphabetic asc.", "Alphabetic desc.", "F/C asc.", "F/C desc."]
        rows = widgets.Select(options=available_columns, disabled=False,)
        columns = widgets.Select(options=available_columns, disabled=False,)
        analysis_type = widgets.Dropdown(
            options=["Frequency", "Citation"], value="Frequency", disabled=False,
        )
        selection_range = widgets.IntRangeSlider(
            value=[0, 1000],
            min=0,
            max=1000,
            step=1,
            disabled=False,
            continuous_update=False,
            orientation="horizontal",
            readout=True,
            readout_format="d",
        )
        row_order = widgets.Dropdown(
            options=order_options, value="F/C desc.", disabled=False,
        )
        column_order = widgets.Dropdown(
            options=order_options, value="F/C desc.", disabled=False,
        )
        cmap = widgets.Dropdown(options=COLORMAPS, disabled=False,)
        #
        output = widgets.Output()
        with output:
            display(
                widgets.interactive_output(
                    compute_by_term,
                    {
                        "rows": rows,
                        "columns": columns,
                        "analysis_type": analysis_type,
                        "row_order": row_order,
                        "column_order": column_order,
                        "cmap": cmap,
                        "minmax": selection_range,
                    },
                )
            )
        #
        return widgets.HBox(
            [
                widgets.VBox(
                    [
                        widgets.VBox([widgets.Label(value="Rows:"), rows]),
                        widgets.VBox([widgets.Label(value="Columns:"), columns]),
                        widgets.VBox(
                            [widgets.Label(value="Analysis type:"), analysis_type]
                        ),
                        widgets.VBox([widgets.Label(value="Range:"), selection_range]),
                        widgets.VBox([widgets.Label(value="Row order:"), row_order]),
                        widgets.VBox(
                            [widgets.Label(value="Column order:"), column_order]
                        ),
                        widgets.VBox([widgets.Label(value="Colormap:"), cmap]),
                    ],
                    layout=Layout(height=LEFT_PANEL_HEIGHT, border="1px solid gray"),
                ),
                widgets.VBox([output]),
            ]
        )

    #
    #
    tab_nest = widgets.Tab()
    tab_nest.children = [tab_co_occurrence_heatmap()]
    tab_nest.set_title(0, "Heatmap")

    # No pane_widths is passed: the name previously used here (PANE_WIDHTS)
    # is never defined in this notebook and raised NameError.
    return AppLayout(
        header=widgets.HTML(value=html_title("Co-occurrence Analysis")),
        left_sidebar=None,
        center=tab_nest,
        right_sidebar=None,
        pane_heights=PANE_HEIGHTS,
    )


#
# Render the co-occurrence / co-citation dashboard for the loaded dataset.
#
co_occurrence_analysis(df)

In [None]:
def summary_by_year(x):
    """Summary by year dashboard.

    Args:
        x (pandas.DataFrame): bibliographic dataframe.

    Returns:
        ipywidgets.AppLayout: layout with the HTML title as header and the
        tab widget as center pane.
    """

    def tab_0():
        """Assemble the controls and the output area of the time-analysis tab."""
        # Plot title -> columns of the yearly summary shown in that plot.
        # The keys also populate the "Plot:" dropdown so the two cannot drift.
        plot_columns = {
            "Documents by Year": ["Year", "Num Documents"],
            "Cum. Documents by Year": ["Year", "Num Documents (Cum)"],
            "Times Cited by Year": ["Year", "Cited by"],
            "Cum. Times Cited by Year": ["Year", "Cited by (Cum)"],
            "Avg. Times Cited by Year": ["Year", "Avg. Cited by"],
        }
        # Plot type name -> techminer plotting function.
        plotters = {"bar": plt.bar, "barh": plt.barh}

        def compute(selected_plot, plot_type, cmap):
            """Draw the selected yearly series into ``output``."""
            summary = tc.summary_by_year(x)
            summary = summary[plot_columns[selected_plot]]
            plot = plotters[plot_type]
            output.clear_output()
            with output:
                display(plot(summary, cmap=cmap, figsize=FIGSIZE))

        # (label text, compute() keyword, widget)
        LEFT_PANEL = [
            (
                "Plot:",
                "selected_plot",
                widgets.Dropdown(
                    options=list(plot_columns),
                    value="Documents by Year",
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            (
                "Plot type:",
                "plot_type",
                widgets.Dropdown(
                    options=list(plotters),
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            (
                "Colormap:",
                "cmap",
                widgets.Dropdown(
                    options=COLORMAPS, disabled=False, layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
        ]
        #
        args = {key: value for _, key, value in LEFT_PANEL}
        output = widgets.Output()
        with output:
            display(widgets.interactive_output(compute, args,))
        return widgets.HBox(
            [
                widgets.VBox(
                    [
                        widgets.VBox([widgets.Label(value=text), widget])
                        for text, _, widget in LEFT_PANEL
                    ],
                    layout=Layout(height=LEFT_PANEL_HEIGHT, border="1px solid gray"),
                ),
                widgets.VBox([output]),
            ]
        )

    #
    #
    body = widgets.Tab()
    body.children = [tab_0()]
    #
    body.set_title(0, "Time analysis")

    return AppLayout(
        header=widgets.HTML(value=html_title("Summary by Year")),
        left_sidebar=None,
        center=body,
        right_sidebar=None,
        pane_heights=PANE_HEIGHTS,
    )

# Render the summary-by-year dashboard for the loaded dataset.
summary_by_year(df)