In [1]:
import ipywidgets as widgets
import pandas as pd
from ipywidgets import Layout, AppLayout

import techminer.analytics as tc
import techminer.plots as plt
from techminer.analytics import load_scopus
import techminer.dashboards as dash
from techminer.keywords import Keywords

# Location of the bibliographic CSV file, relative to the notebook's directory.
filepath = "../data/papers/urban-agriculture.csv"

# Read the raw CSV and hand it straight to techminer's Scopus loader; the log
# lines printed by this cell list the preprocessing steps it runs.
df = load_scopus(pd.read_csv(filepath))

2020-06-03 02:23:06,759 - INFO - Renaming and selecting columns ...
2020-06-03 02:23:12,515 - INFO - Formatting author names ...
2020-06-03 02:23:12,521 - INFO - Disambiguating author names ...
2020-06-03 02:23:12,602 - INFO - Removing part of titles in foreing languages ...
2020-06-03 02:23:12,605 - INFO - Fusioning author and index keywords ...
2020-06-03 02:23:12,622 - INFO - NumExpr defaulting to 8 threads.
2020-06-03 02:23:12,639 - INFO - Extracting countries from affiliations ...
2020-06-03 02:23:19,110 - INFO - Extracting institutions from affiliations ...
2020-06-03 02:23:19,121 - INFO - Extracting country of 1st author ...
2020-06-03 02:23:19,125 - INFO - Extracting affiliation of 1st author ...
2020-06-03 02:23:19,129 - INFO - Counting number of authors ...


In [2]:
# Colormap names offered by every "Colormap" dropdown in the dashboards.
# The names follow matplotlib's colormap catalogue (sequential families first,
# qualitative families last); the order below is the order shown in the UI.
COLORMAPS = (
    # Sequential
    ["Greys", "Purples", "Blues", "Greens", "Oranges", "Reds"]
    + ["YlOrBr", "YlOrRd", "OrRd", "PuRd", "RdPu", "BuPu"]
    + ["GnBu", "PuBu", "YlGnBu", "PuBuGn", "BuGn", "YlGn"]
    # Qualitative
    + ["Pastel1", "Pastel2", "Paired", "Accent", "Dark2"]
    + ["Set1", "Set2", "Set3"]
    + ["tab10", "tab20", "tab20b", "tab20c"]
)

# Dataframe columns that can be chosen as the "term" in the dashboards.
# Each selector keeps only the names that exist in the dataframe it receives.
# NOTE: every entry needs its own trailing comma -- two adjacent string
# literals are silently concatenated into a single (non-existent) column name.
COLUMNS = [
    "Author Keywords",
    "Authors",
    "Countries",
    "Country 1st",
    "Document type",
    "Index Keywords",
    "Institution 1st",
    "Institutions",
    "Keywords",
    "Source title",
]

In [3]:
def html_title(x):
    """Return the HTML used as a dashboard header.

    Args:
        x: title text; it is inserted with ``str.format`` without escaping.

    Returns:
        str: an ``<h1>`` element containing *x*, immediately followed by a
        2px gray horizontal rule.

    """
    heading = "<h1>{}</h1>".format(x)
    rule = "<hr style='height:2px;border-width:0;color:gray;background-color:gray'>"
    return heading + rule

In [4]:
# Figure sizes that were tried for the individual dashboards:
#   Summary by year ............ (15, 9.4)
#   Summary by term ............ (14, 8.2)
#   Summary by term per year ... (14, 10.0)
#   Co-occurrence analysis ..... (14, 10.0)   <---- has no figure
# Only one value can be in effect; this is the size every dashboard uses.
FIGSIZE = (14, 9.0)

# Height of the bordered control panel on the left of each tab.
LEFT_PANEL_HEIGHT = "588px"
# Passed as ``pane_heights`` to every AppLayout.
PANE_HEIGHTS = ["80px", "650px", 0]
# Width applied to the individual control widgets.
WIDGET_WIDTH = "200px"

# SUMMARY BY YEAR

In [None]:
def summary_by_year(x):
    """Build the "Summary by Year" dashboard.

    Args:
        x (pandas.DataFrame): bibliographic dataframe; it is forwarded
            unchanged to ``tc.summary_by_year``.

    Returns:
        ipywidgets.AppLayout: an HTML title header above a one-tab body.

    """

    def tab_0():
        # Dropdown label -> columns picked from the table returned by
        # ``tc.summary_by_year``. The dropdown options are derived from the
        # keys so the two can never drift apart.
        columns_by_plot = {
            "Documents by Year": ["Year", "Num Documents"],
            "Cum. Documents by Year": ["Year", "Num Documents (Cum)"],
            "Times Cited by Year": ["Year", "Cited by"],
            "Cum. Times Cited by Year": ["Year", "Cited by (Cum)"],
            "Avg. Times Cited by Year": ["Year", "Avg. Cited by"],
        }
        plotters = {"bar": plt.bar, "barh": plt.barh}

        def compute(selected_plot, plot_type, cmap):
            # Invoked by ``interactive_output`` when it is created and again
            # each time one of the controls changes.
            summary = tc.summary_by_year(x)[columns_by_plot[selected_plot]]
            plotter = plotters[plot_type]
            output.clear_output()
            with output:
                # The plot call stays inside the context manager so that
                # anything it emits is captured by the output widget.
                display(plotter(summary, cmap=cmap, figsize=FIGSIZE))

        # (label shown above the widget, ``compute`` argument name, widget)
        left_panel = [
            (
                "Plot:",
                "selected_plot",
                widgets.Dropdown(
                    options=list(columns_by_plot),
                    value="Documents by Year",
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            (
                "Plot type:",
                "plot_type",
                widgets.Dropdown(
                    options=["bar", "barh"],
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            (
                "Colormap:",
                "cmap",
                widgets.Dropdown(
                    options=COLORMAPS,
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
        ]

        args = {key: widget for _, key, widget in left_panel}
        output = widgets.Output()
        with output:
            display(widgets.interactive_output(compute, args))

        controls = widgets.VBox(
            [
                widgets.VBox([widgets.Label(value=text), widget])
                for text, _, widget in left_panel
            ],
            layout=Layout(height=LEFT_PANEL_HEIGHT, border="1px solid gray"),
        )
        return widgets.HBox(
            [controls, widgets.VBox([output], layout=Layout(width="870px"))]
        )

    body = widgets.Tab()
    body.children = [tab_0()]
    body.set_title(0, "Time analysis")

    return AppLayout(
        header=widgets.HTML(value=html_title("Summary by Year")),
        left_sidebar=None,
        center=body,
        right_sidebar=None,
        pane_heights=PANE_HEIGHTS,
    )

# Build the dashboard for the loaded dataframe; as the cell's last expression
# the returned AppLayout is what gets rendered.
summary_by_year(df)

# SUMMARY BY TERM

In [7]:
def summary_by_term(x):
    """Build the "Summary by Term" dashboard.

    Args:
        x (pandas.DataFrame): bibliographic dataframe; it is forwarded
            unchanged to ``tc.summary_by_term``.

    Returns:
        ipywidgets.AppLayout: an HTML title header above a two-tab body
        (ranking plots per term, and a world map for the country columns).

    """

    def assemble_tab(left_panel, compute, output):
        """Bind *compute* to the panel widgets and lay the controls out beside *output*."""
        args = {key: widget for _, key, widget in left_panel}
        with output:
            display(widgets.interactive_output(compute, args))
        controls = widgets.VBox(
            [
                widgets.VBox([widgets.Label(value=text), widget])
                for text, _, widget in left_panel
            ],
            layout=Layout(height=LEFT_PANEL_HEIGHT, border="1px solid gray"),
        )
        return widgets.HBox(
            [controls, widgets.VBox([output], layout=Layout(width="870px"))]
        )

    def analysis_dropdown():
        """Create the Frequency/Citation selector shared by both tabs."""
        return widgets.Dropdown(
            options=["Frequency", "Citation"],
            value="Frequency",
            disabled=False,
            layout=Layout(width=WIDGET_WIDTH),
        )

    def colormap_dropdown():
        """Create the colormap selector shared by both tabs."""
        return widgets.Dropdown(
            options=COLORMAPS, disabled=False, layout=Layout(width=WIDGET_WIDTH),
        )

    def tab_0():
        plotters = {"bar": plt.bar, "barh": plt.barh, "pie": plt.pie}
        output = widgets.Output()

        def compute(term, analysis_type, plot_type, cmap, top_n):
            # The chosen metric is the primary sort key, the other metric and
            # then the term itself break ties; everything sorts descending.
            if analysis_type == "Frequency":
                metric, tie_breaker = "Num Documents", "Cited by"
            else:
                metric, tie_breaker = "Cited by", "Num Documents"
            ranking = tc.summary_by_term(x, term).sort_values(
                [metric, tie_breaker, term], ascending=False
            )
            ranking = ranking[[term, metric]].head(top_n).reset_index(drop=True)
            plotter = plotters[plot_type]
            output.clear_output()
            with output:
                display(plotter(ranking, figsize=FIGSIZE, cmap=cmap))

        # (label shown above the widget, ``compute`` argument name, widget)
        left_panel = [
            (
                "Term to analyze:",
                "term",
                widgets.Select(
                    # Offer only the candidate columns present in ``x``.
                    options=[z for z in COLUMNS if z in x.columns],
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            ("Analysis type:", "analysis_type", analysis_dropdown()),
            (
                "Plot type:",
                "plot_type",
                widgets.Dropdown(
                    options=["bar", "barh", "pie"],
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            ("Colormap:", "cmap", colormap_dropdown()),
            (
                "Top N:",
                "top_n",
                widgets.IntSlider(
                    value=10,
                    min=10,
                    max=50,
                    step=1,
                    disabled=False,
                    continuous_update=False,
                    orientation="horizontal",
                    readout=True,
                    readout_format="d",
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
        ]
        return assemble_tab(left_panel, compute, output)

    def tab_1():
        output = widgets.Output()

        def compute(term, analysis_type, cmap):
            metric = "Num Documents" if analysis_type == "Frequency" else "Cited by"
            totals = tc.summary_by_term(x, term)[[term, metric]]
            totals = totals.reset_index(drop=True)
            output.clear_output()
            with output:
                display(plt.worldmap(totals, figsize=FIGSIZE, cmap=cmap))

        left_panel = [
            (
                "Term to analyze:",
                "term",
                widgets.Select(
                    options=["Countries", "Country 1st"],
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            ("Analysis type:", "analysis_type", analysis_dropdown()),
            ("Colormap:", "cmap", colormap_dropdown()),
        ]
        return assemble_tab(left_panel, compute, output)

    body = widgets.Tab()
    body.children = [tab_0(), tab_1()]
    # NOTE(review): tab 0 shows per-term rankings, so "Time analysis" looks
    # like a leftover from the by-year dashboard -- confirm the intended title.
    body.set_title(0, "Time analysis")
    body.set_title(1, "Worldmap")

    return AppLayout(
        header=widgets.HTML(value=html_title("Summary by Term")),
        left_sidebar=None,
        center=body,
        right_sidebar=None,
        pane_heights=PANE_HEIGHTS,
    )


# Build the dashboard for the loaded dataframe; as the cell's last expression
# the returned AppLayout is what gets rendered.
summary_by_term(df)

AppLayout(children=(HTML(value="<h1>Summary by Term</h1><hr style='height:2px;border-width:0;color:gray;backgr…

# SUMMARY BY TERM PER YEAR

In [None]:
def summary_by_term_per_year(x):
    """Build the "Summary by Term per Year" dashboard.

    Args:
        x (pandas.DataFrame): bibliographic dataframe; it is forwarded
            unchanged to the ``tc.*_by_term`` / ``tc.*_by_term_per_year``
            functions.

    Returns:
        ipywidgets.AppLayout: an HTML title header above a one-tab body.

    """

    def tab_0():
        output = widgets.Output()

        def compute(term, analysis_type, plot_type, cmap, top_n):
            # ``top`` holds the first ``top_n`` terms of the per-term table;
            # it is used below to select the matching matrix columns.
            if analysis_type == "Frequency":
                top = tc.documents_by_term(x, term).head(top_n)[term].tolist()
                matrix = tc.documents_by_term_per_year(x, term, as_matrix=True)
            else:
                top = tc.citations_by_term(x, term).head(top_n)[term].tolist()
                matrix = tc.citations_by_term_per_year(x, term, as_matrix=True)
            matrix = matrix[top]
            output.clear_output()
            with output:
                if plot_type == "Heatmap":
                    display(plt.heatmap(matrix, cmap=cmap, figsize=FIGSIZE))
                elif plot_type == "Gant":
                    # The colormap choice is only applied to the heatmap.
                    display(plt.gant(matrix, figsize=FIGSIZE))

        # (label shown above the widget, ``compute`` argument name, widget)
        left_panel = [
            (
                "Term to analyze:",
                "term",
                widgets.Select(
                    # Offer only the candidate columns present in ``x``.
                    options=[z for z in COLUMNS if z in x.columns],
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            (
                "Analysis type:",
                "analysis_type",
                widgets.Dropdown(
                    options=["Frequency", "Citation"],
                    value="Frequency",
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            (
                "Plot type:",
                "plot_type",
                widgets.Dropdown(
                    options=["Heatmap", "Gant"],
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            (
                "Colormap:",
                "cmap",
                widgets.Dropdown(
                    options=COLORMAPS,
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            (
                "Top N:",
                "top_n",
                widgets.IntSlider(
                    value=10,
                    min=10,
                    max=50,
                    step=1,
                    disabled=False,
                    continuous_update=False,
                    orientation="horizontal",
                    readout=True,
                    readout_format="d",
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
        ]

        args = {key: widget for _, key, widget in left_panel}
        with output:
            display(widgets.interactive_output(compute, args))

        controls = widgets.VBox(
            [
                widgets.VBox([widgets.Label(value=text), widget])
                for text, _, widget in left_panel
            ],
            layout=Layout(height=LEFT_PANEL_HEIGHT, border="1px solid gray"),
        )
        return widgets.HBox(
            [controls, widgets.VBox([output], layout=Layout(width="870px"))]
        )

    body = widgets.Tab()
    body.children = [tab_0()]
    body.set_title(0, "Heatmap")

    return AppLayout(
        header=widgets.HTML(value=html_title("Summary by Term per Year")),
        left_sidebar=None,
        center=body,
        right_sidebar=None,
        pane_heights=PANE_HEIGHTS,
    )

# Build the dashboard for the loaded dataframe; as the cell's last expression
# the returned AppLayout is what gets rendered.
summary_by_term_per_year(df)

# CO-OCCURRENCE ANALYSIS

In [6]:
def co_occurrence_analysis(x):
    """Build the "Co-occurrence Analysis" dashboard.

    Args:
        x (pandas.DataFrame): bibliographic dataframe; it is forwarded
            unchanged to ``tc.co_occurrence``.

    Returns:
        ipywidgets.AppLayout: an HTML title header above a one-tab body that
        shows the co-occurrence matrix.

    """

    def tab_0():
        output = widgets.Output()

        # Kept under its own name because ``compute`` adjusts its bounds.
        min_slider = widgets.IntSlider(
            value=0,
            min=0,
            max=50,
            step=1,
            disabled=False,
            continuous_update=False,
            orientation="horizontal",
            readout=True,
            readout_format="d",
            layout=Layout(width=WIDGET_WIDTH),
        )

        def compute(column, by, min_value, cmap):
            matrix = tc.co_occurrence(
                x,
                column=column,
                by=by,
                as_matrix=True,
                min_value=min_value,
                keywords=None,
            )

            # Cap the slider at the largest value in the matrix. An empty
            # matrix has a NaN maximum, which cannot be used as an integer
            # bound, so the slider is left untouched in that case.
            largest = matrix.max().max()
            if pd.notna(largest):
                largest = int(largest)
                if min_slider.value > largest:
                    min_slider.value = largest
                min_slider.max = largest

            output.clear_output()
            with output:
                # Colour the cells only when both dimensions are at most 50.
                if len(matrix.columns) < 51 and len(matrix.index) < 51:
                    display(matrix.style.background_gradient(cmap=cmap))
                else:
                    display(matrix)

        # Offer only the candidate columns present in ``x``.
        term_options = [z for z in COLUMNS if z in x.columns]

        # (label shown above the widget, ``compute`` argument name, widget)
        left_panel = [
            (
                "Term to analyze:",
                "column",
                widgets.Select(
                    options=term_options,
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            (
                "By:",
                "by",
                widgets.Select(
                    options=term_options,
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            (
                "Colormap:",
                "cmap",
                widgets.Dropdown(
                    options=COLORMAPS,
                    disabled=False,
                    layout=Layout(width=WIDGET_WIDTH),
                ),
            ),
            ("Min value:", "min_value", min_slider),
        ]

        args = {key: widget for _, key, widget in left_panel}
        with output:
            display(widgets.interactive_output(compute, args))

        controls = widgets.VBox(
            [
                widgets.VBox([widgets.Label(value=text), widget])
                for text, _, widget in left_panel
            ],
            layout=Layout(
                height=LEFT_PANEL_HEIGHT, width="210px", border="1px solid gray"
            ),
        )
        return widgets.HBox(
            [controls, widgets.VBox([output], layout=Layout(width="870px"))]
        )

    body = widgets.Tab()
    body.children = [tab_0()]
    body.set_title(0, "Matrix")

    return AppLayout(
        header=widgets.HTML(value=html_title("Co-occurrence Analysis")),
        left_sidebar=None,
        center=body,
        right_sidebar=None,
        pane_heights=PANE_HEIGHTS,
    )


# Build the dashboard for the loaded dataframe; as the cell's last expression
# the returned AppLayout is what gets rendered.
co_occurrence_analysis(df)

AppLayout(children=(HTML(value="<h1>Co-occurrence Analysis</h1><hr style='height:2px;border-width:0;color:gray…

# CORRELATION ANALYSIS

In [None]:
def correlation_analysis(x):
    """Build the "Correlation Analysis" dashboard.

    Args:
        x (pandas.DataFrame): bibliographic dataframe; it is forwarded
            unchanged to ``tc.corr``.

    Returns:
        ipywidgets.AppLayout: an HTML title header above a one-tab body that
        shows the correlation matrix.

    """

    def tab_heatmap():
        output = widgets.Output()

        def compute_by_term(column, by, method, minmax, cmap, filter_type, top_n):
            # ``tc.corr`` is asked for a (matrix, min/max) pair via
            # ``get_minmax=True``; only the matrix is displayed here.
            matrix, _ = tc.corr(
                x,
                column=column,
                by=by,
                method=method.lower(),
                show_between=(minmax[0], minmax[1]),
                cmap=cmap,
                filter_by=filter_type,
                top_n=top_n,
                as_matrix=True,
                get_minmax=True,
            )
            styled = matrix.style.format("{:.3f}")
            # Colour the cells only when both dimensions are at most 50.
            if len(matrix.columns) < 51 and len(matrix.index) < 51:
                styled = styled.background_gradient(cmap=cmap)
            output.clear_output()
            with output:
                display(styled)

        # Offer only the candidate columns present in ``x``.
        term_options = [z for z in COLUMNS if z in x.columns]

        column = widgets.Select(options=term_options, disabled=False)
        by = widgets.Select(options=term_options, disabled=False)
        method = widgets.Dropdown(
            options=["Pearson", "Kendall", "Spearman"], value="Pearson", disabled=False,
        )
        selection_range = widgets.FloatRangeSlider(
            value=[-1.0, 1.0],
            min=-1.0,
            max=1.0,
            step=0.1,
            disabled=False,
            continuous_update=False,
            orientation="horizontal",
            readout=True,
            readout_format="+.1f",
        )
        filter_type = widgets.Dropdown(
            options=["Frequency", "Citation"], value="Frequency", disabled=False,
        )
        top_n = widgets.IntSlider(
            value=10,
            min=10,
            max=50,
            step=1,
            disabled=False,
            continuous_update=False,
            orientation="horizontal",
            readout=True,
            readout_format="d",
        )
        cmap = widgets.Dropdown(options=COLORMAPS, disabled=False)

        with output:
            display(
                widgets.interactive_output(
                    compute_by_term,
                    {
                        "column": column,
                        "by": by,
                        "method": method,
                        "minmax": selection_range,
                        "cmap": cmap,
                        "filter_type": filter_type,
                        "top_n": top_n,
                    },
                )
            )

        # (label, widget) pairs in the order they appear in the left panel.
        labelled_controls = [
            ("Term:", column),
            ("By term:", by),
            ("Method:", method),
            ("Filter type:", filter_type),
            ("Top n:", top_n),
            ("Range:", selection_range),
            ("Colormap:", cmap),
        ]
        controls = widgets.VBox(
            [
                widgets.VBox([widgets.Label(value=text), widget])
                for text, widget in labelled_controls
            ],
            layout=Layout(height=LEFT_PANEL_HEIGHT, border="1px solid gray"),
        )
        return widgets.HBox(
            [controls, widgets.VBox([output], layout=Layout(width="870px"))]
        )

    tabs = widgets.Tab()
    tabs.children = [tab_heatmap()]
    tabs.set_title(0, "Heatmap")

    # Only ``pane_heights`` is set, as in the other dashboards of this
    # notebook; no pane-width constant is defined in the configuration cell.
    return AppLayout(
        header=widgets.HTML(value=html_title("Correlation Analysis")),
        left_sidebar=None,
        center=tabs,
        right_sidebar=None,
        pane_heights=PANE_HEIGHTS,
    )


# Build the dashboard for the loaded dataframe; as the cell's last expression
# the returned AppLayout is what gets rendered.
correlation_analysis(df)

In [None]:
# Scratch data: a 6x6 frame of zeros with a handful of non-zero cells, used to
# experiment with row/column ordering.
x = pd.DataFrame(0, columns=list("abcdef"), index=list("ABCDEF"))
for _row, _col, _value in [
    ("A", "a", 9),
    ("B", "b", 8),
    ("C", "c", 3),
    ("D", "d", 5),
    ("E", "e", 6),
    ("F", "f", 1),
    ("C", "a", 9),
]:
    x.at[_row, _col] = _value
x

In [None]:
# Scratch data: the same 6x6 frame of zeros with a few non-zero cells.
x = pd.DataFrame(0, columns=list("abcdef"), index=list("ABCDEF"))
for _row, _col, _value in [
    ("A", "a", 9),
    ("B", "b", 8),
    ("C", "c", 3),
    ("D", "d", 5),
    ("E", "e", 6),
    ("F", "f", 1),
    ("C", "a", 9),
]:
    x.at[_row, _col] = _value

ascending = False

# Column maxima, sorted; their index gives the initial column order.
a = x.max().sort_values(ascending=ascending)

# 1) reorder columns by their maximum,
# 2) sort the columns using every row (in index order) as a sort key,
# 3) sort the rows using every column (in the new order) as a sort key.
x = (
    x.loc[:, a.index]
    .pipe(lambda m: m.sort_values(m.index.tolist(), axis=1, ascending=ascending))
    .pipe(lambda m: m.sort_values(m.columns.tolist(), axis=0, ascending=ascending))
)
x

In [None]:
# Alternative ordering: rows by descending row maximum and columns by
# descending column maximum. Uses ``x`` from the preceding cell; the result is
# only displayed, ``x`` itself is not reassigned.
x.loc[x.max(axis=1).sort_values(ascending=False).index, x.max(axis=0).sort_values(ascending=False).index]

In [None]:
# Show the current row labels of ``x``.
x.index

In [None]:
# Sort the columns of ``x`` in descending order, using every row (in index
# order) as a sort key. The result is only displayed; ``x`` is not reassigned.
x.sort_values(x.index.tolist(), axis=1, ascending=False)