# Typify a researcher or research group
### Created by Jaap Stefels, June 2021

Run all cells and click on the blue hyperlink that appears in the last cell to start the application.

### Download required packages and dependencies

In [None]:
!pip install -r requirements.txt

In [None]:
from scholarly import scholarly, ProxyGenerator
from wordcloud import WordCloud
from matplotlib import pyplot as plt
import classifier.classifier as CSO
from collections import Counter
import time
import numpy as np
import pandas as pd
import os, sys
import plotly.express as px
import plotly.graph_objects as go
from PIL import Image
from collections import defaultdict
import copy
import plotly.express as px
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
from io import BytesIO
import dash
import dash_table
import base64

# Download nltk stopwords and English spaCy model.
import nltk
nltk.download('stopwords')
!python -m spacy download en_core_web_sm

### General helper and processing functions

In [None]:
class HiddenPrints:
    """
    Context manager that silences everything written to sys.stdout
    by the code executed inside the with-block.
    """

    def __enter__(self):
        # Remember the current stream and swap in a handle to the null device.
        self._original_stdout, sys.stdout = sys.stdout, open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Put the remembered stream back and release the null-device handle.
        null_stream, sys.stdout = sys.stdout, self._original_stdout
        null_stream.close()

def search_pub_titles(author_name):
    """
    Returns all publications and author statistics for a given 
    author on Google Scholar.

    Only the first author returned by the search query is used.

    Returns a tuple (title_dict, author_stats):
      - title_dict maps each publication title to a dictionary with the
        keys 'pub_year' and 'num_citations'. Publications without a 
        'pub_year' entry are left out.
      - author_stats is a dictionary with the author's profile fields.

    Returns (False, False) when the search yields no author at all.
    """
    
    search_query = scholarly.search_author(author_name)
    
    # Only the first search result is ever used, so take it directly 
    # instead of looping; None signals an empty result set.
    first_hit = next(search_query, None)
    
    if first_hit is None:
        return False, False
    
    author = scholarly.fill(first_hit)
            
    # Retrieve all publication titles, citation counts, publications years.
    title_dict = {pub['bib']['title'] : {'pub_year' : pub['bib']['pub_year'], 
                  'num_citations' : pub['num_citations']} for pub in author['publications'] 
                  if 'pub_year' in pub['bib']}
    
    # Retrieve author statistics from Google Scholar.
    # NOTE: the key order matters, create_author_fig displays the first 9 entries.
    author_stats =  {'Name': author.get('name'),
                     'Affiliation': author.get('affiliation'),
                     'Scholar id': author.get('scholar_id'),
                     #'interests': author.get('interests'),
                     'Cited by': author.get('citedby'),
                     'H-index': author.get('hindex'),
                     'i10-index': author.get('i10index'),
                     'Cited by (past 5 years)': author.get('citedby5y'),
                     'H-index (past 5 years)': author.get('hindex5y'),
                     'i10-index (past 5 years)': author.get('i10index5y'),
                     'cites_per_year': author.get('cites_per_year'),
                     'url_picture': author.get('url_picture')}

    return title_dict, author_stats

def extract_keywords(author_dict, workers = 4, modules = 'both', enhancement = "first"):
    """
    Runs the CSO-classifier in batch mode on the publication titles of
    author_dict (as created by search_pub_titles) and stores the result
    for every title under the 'key_words' key of its entry.

    The dictionary is updated in place and also returned.
    """
    
    # The classifier is fed one entry per publication, holding only its title.
    classifier_input = {}
    for pub_title in author_dict:
        classifier_input[pub_title] = {'title' : pub_title}
    
    # Silence the output printed while classifying.
    with HiddenPrints():
        topics_by_title = CSO.run_cso_classifier_batch_mode(classifier_input, workers, 
                                                            modules, enhancement)
    
    # Attach the classifier result to the matching publication entry.
    for pub_title, entry in author_dict.items():
        entry['key_words'] = topics_by_title[pub_title]
        
    return author_dict
    
def log_convert(citation_count):
    """
    Returns the rounded log (base 10) of a number as an integer.

    Zero and negative inputs have no logarithm; 0 is returned for them.
    """
    
    if citation_count <= 0:
        return 0
    
    # np.log10 is the base 10 logarithm (np.log would be the natural one).
    return int(np.round(np.log10(citation_count)))

def filter_years(years, reject_value = 40):
    """
    Removes any outliers from a list which are larger or smaller
    than the reject_value from the median of the input list. 

    Returns a new list with the remaining values in their original 
    order. An empty input gives an empty list.
    """
    
    # The median of an empty sequence is undefined.
    if len(years) == 0:
        return []
    
    # Compute the bounds once instead of once per element.
    median = np.median(years)
    lower_bound = int(median - reject_value)
    upper_bound = int(median + reject_value)
    
    return [year for year in years if lower_bound <= year <= upper_bound]
    
def filter_key_words(key_words, detection_types, quintile_scores, year_range, quintile_factor,
                     reject_value = 40, citation_min = 1, citation_impact = None):
    """
    Takes the author_dict from the extract_keywords function as input
    and removes any publications based on the entered filter options. 

    A publication is kept when its publication year is not an outlier 
    (see filter_years, using reject_value), lies within year_range 
    (inclusive) and its citation count is at least citation_min.

    For every kept publication the 'key_words' entry is replaced by a 
    flat list holding the key words of all requested detection_types. 
    citation_impact determines how often that list is repeated:
      - 'none' or None : once.
      - 'multiply_cite': once plus once per citation.
      - 'multiply_log' : once plus log_convert(citations) times.
      - 'quintile'     : (quintile bin index ** quintile_factor) times,
                         where the bin index is the position of the 
                         citation count among the percentiles given by
                         quintile_scores.

    The input dictionary is not modified; a filtered deep copy is returned.
    """
    
    # Nothing to filter; also avoids taking percentiles of an empty array.
    if not key_words:
        return {}
    
    filtered_key_words = copy.deepcopy(key_words)

    # Filter the publication years.
    years = np.array([int(key_words[title]['pub_year']) for title in key_words]) 
    accepted_years = {int(year) for year in filter_years(years, reject_value)}

    # Extract quintile scores from citations counts.
    citation_counts = np.array([int(key_words[title]['num_citations']) for title in key_words])
    quintiles = np.percentile(citation_counts, quintile_scores)    
    out = np.searchsorted(quintiles, citation_counts)
    
    # Check per publication if it fits the filter criteria. 
    for i, title in enumerate(key_words):
        publication = filtered_key_words[title]
        year = int(publication['pub_year'])
        num_citations = int(publication['num_citations'])
        
        keep = (year in accepted_years 
                and year_range[0] <= year <= year_range[1]
                and num_citations >= citation_min)
        
        if not keep:
            del filtered_key_words[title]
            continue
        
        all_key_words = [topic for topic_type in detection_types 
                         for topic in publication['key_words'][topic_type]]

        # Add citation impact if required.
        if citation_impact is None or citation_impact == 'none':
            publication['key_words'] = all_key_words

        elif citation_impact == "multiply_cite":
            publication['key_words'] = all_key_words + (all_key_words * num_citations)
             
        elif citation_impact == "multiply_log":
            publication['key_words'] = (all_key_words + 
                                        (all_key_words * log_convert(num_citations)))
                
        elif citation_impact == "quintile":
            # List repetition needs an integer count, the power may be a float.
            publication['key_words'] = all_key_words * int(out[i] ** quintile_factor)
    
    return filtered_key_words
        
def generate_occurence_dataframe(filtered_key_words, top):
    """
    Given a (filtered) dictionary of publication titles and keywords, 
    generate an occurence dataframe with the frequency per key word 
    per year.

    filtered_key_words maps titles to entries with a 'pub_year' and a 
    'key_words' list. top is an iterable of the topics that become the 
    columns of the dataframe.

    The returned dataframe is indexed by 'years', covering every year 
    from the earliest to the latest publication year (years without
    publications are filled with zeros). The input is not modified.
    An empty input gives a dataframe without rows.
    """
    
    # Count the key words per year. A fresh Counter is used for each year, 
    # so the key word lists of the input are never extended in place.
    counts_per_year = defaultdict(Counter)
    
    for publication in filtered_key_words.values():
        counts_per_year[int(publication['pub_year'])].update(publication['key_words'])
    
    if counts_per_year:
        years = list(range(min(counts_per_year), max(counts_per_year) + 1))
    else:
        years = []
    
    d = {'years' : years}
    
    for topic in top:
        d[topic] = [counts_per_year[year][topic] if year in counts_per_year else 0 
                    for year in years]
        
    df = pd.DataFrame(data=d).set_index('years')

    return df

def plot_topic_evolution(df, stacking = 'stacked'):
    """
    Builds the topic evolution graph for an occurence dataframe
    (index: years, one column per topic) and returns the figure.

    stacking is either 'stacked' (area chart with stacked traces) or
    'non_stacked' (one filled trace per topic, all starting at zero).
    """
    
    year_axis = df.index.array
    
    if stacking == 'stacked':
        # Stacked traces.
        fig = px.area(df, x=year_axis, y=df.columns, line_shape='spline')

    elif stacking == 'non_stacked':
        # One independent trace per topic column.
        topic_traces = []
        
        for column in df:
            if column == 'years':
                continue
            
            topic_traces.append(go.Scatter(x=year_axis, y=df[column].tolist(), fill='tozeroy', 
                                           line_shape='spline', name=column))
        
        fig = go.Figure()
        fig.add_traces(topic_traces)
        
    # Titles and range slider.
    x_axis_options = {'rangeslider': {'visible': True}, 'type': "date"}
    
    fig.update_layout(
        title="Topic evolution of top-{} topics.".format(len(df.columns)),
        xaxis_title="Years",
        yaxis_title="Topic count",
        xaxis=x_axis_options
    )

    return fig

def plot_occurences(dictionary, x_label, y_label, fig_title):
    """
    Builds a bar graph from an occurence dictionary: the keys are 
    plotted on the x-axis (labelled x_label) and the values on the 
    y-axis (labelled y_label). Returns the figure.
    """
    
    # Two-column dataframe: keys in one column, values in the other.
    occurence_frame = pd.DataFrame.from_dict({x_label: list(dictionary.keys()), 
                                              y_label: list(dictionary.values())})
    fig = px.bar(occurence_frame, x = x_label, y = y_label)
    
    # Title, range selector button and range slider.
    one_year_back = {'count': 1, 'step': "year", 'stepmode': "backward"}
    x_axis_options = {
        'rangeselector': {'buttons': [one_year_back]},
        'rangeslider': {'visible': True},
    }
    fig.update_layout(title=fig_title, xaxis=x_axis_options)
    
    return fig

def create_author_fig(author_stats):
    """
    Builds a two-column table figure from the first nine entries of
    the author statistics dictionary (names left, values right).
    """
    
    # Only the first nine statistics are displayed.
    stat_names = list(author_stats.keys())[:9]
    stat_values = list(author_stats.values())[:9]
    
    stats_table = go.Table(header=dict(values=[]),
                           cells=dict(values=[stat_names, stat_values]))
    fig = go.Figure(data=[stats_table])
    
    # Make the header fill fully transparent so that no header is shown.
    fig.layout['template']['data']['table'][0]['header']['fill']['color']='rgba(0,0,0,0)'
    
    # Table title.
    fig.update_layout(title="Author information and statistics")
    
    return fig

def plot_wordcloud(data, max_word):
    """
    Generates a word cloud from an occurence dictionary (data), 
    limited to max_word words, and returns it as an image.
    """
    
    cloud = WordCloud(width = 400, height = 200, scale = 2,
                      background_color ='white', max_words = max_word)
    cloud = cloud.generate_from_frequencies(data)
    
    return cloud.to_image()
    
def RepresentsInt(i):
    """
    Check whether input can be converted to an integer that lies 
    between 0 and 100 (both inclusive).

    Always returns a boolean: False when the input cannot be converted 
    (for example 'abc', '1.5' or None) or when it is out of range.
    """
    
    try: 
        value = int(i)
    
    # int() raises ValueError for malformed strings and TypeError for 
    # unsupported types such as None.
    except (TypeError, ValueError):
        return False
    
    return 0 <= value <= 100
    
def create_citation_figure(filtered, num_top, overlay, top):
    """
    Returns a barplot of the amount of combined citations of 
    all publications published in a specific publication year.
    If overlay is desired, add a topic distribution overlay.

    filtered maps publication titles to entries with 'pub_year', 
    'num_citations' and 'key_words'. overlay is the value of the topic 
    overlay checklist: an empty (or missing) selection gives the plain 
    barplot. top is a sequence of (topic, count) pairs of which the 
    first num_top topics get their own bar segment; all remaining 
    topics are combined under "Other".
    """
    
    citation_per_year = {} 
    
    # Create a dictionary containing citation counts per year.
    for title in filtered:
        pub_year = int(filtered[title]['pub_year'])
        
        # Cast to int, like the other functions that read the citation count.
        num_citations = int(filtered[title]['num_citations'])
        citation_per_year[pub_year] = citation_per_year.get(pub_year, 0) + num_citations

    citation_per_year_df = pd.DataFrame.from_dict(citation_per_year, orient='index')
  
    # If overlay is not desired, create a general barplot.
    if not overlay:
        fig_cites_year =  px.bar(citation_per_year_df, x=citation_per_year_df.index.array, 
                                 y=citation_per_year_df.columns)
    
    # If overlay is desired, create a stacked barplot.
    else:
        df_all = generate_occurence_dataframe(filtered, dict(top))
        df_top = generate_occurence_dataframe(filtered, dict(top[:num_top]))
        df_top["Other"] = df_all.sum(axis=1) - df_top.sum(axis=1)

        # Convert the topic counts to a share per year, then scale every
        # share with the citation count of that year. The helper column 
        # 'Citations' is the last column and is dropped again afterwards.
        df_top = df_top.div(df_top.sum(axis = 1), axis = 0)
        df_top["Citations"] = citation_per_year_df
        df_top = df_top.fillna(0)
        df_top = df_top.mul(df_top['Citations'], axis = 0).iloc[:,:-1]
        
        fig_cites_year =  px.bar(df_top, x=df_top.index.array, y=df_top.columns)

    # Add range slider and titles to figure.
    fig_cites_year.update_layout(
        title= "Citation counts for publications from a specific year",
        xaxis_title = "Years",
        yaxis_title= "Counts",
        xaxis=dict(
            rangeslider=dict(
                visible=True
            ), type="date"
        )
    )
    
    return fig_cites_year

def select_key_words(all_key_words, author_selection):
    """
    Merges the key_words dictionaries of the authors ticked in the
    "Found author(s)" option of the tool into one dictionary that 
    is used for the visualisations.

    author_selection holds the positions (as int or numeric string) 
    of the chosen authors within all_key_words. When a title occurs 
    for several authors, the entry of the last selected author wins.
    """
    
    merged_key_words = {}
    
    for position in author_selection:
        merged_key_words.update(all_key_words[int(position)])

    return merged_key_words

# The functions below are not used in the final version of the tool.
def setup_proxy(http_code):
    """
    Routes Scholarly through a single proxy. The given http 
    connection token is used for both the http and https connection.
    """
    
    proxy_generator = ProxyGenerator()
    proxy_generator.SingleProxy(http=http_code, https=http_code)
    
    scholarly.use_proxy(proxy_generator)
    
def search_abstracts(pub_titles):
    """
    Looks up every title of pub_titles with Scholarly and returns 
    the abstracts of the first search hit per title as a list, in 
    the iteration order of pub_titles.
    """
    
    # Per title: run the publication search and read the abstract 
    # from the first result.
    return [next(scholarly.search_pubs(title))['bib']['abstract'] 
            for title in pub_titles]

### Application layout and functions

In [None]:
# Create a jupyter dash application.
app = JupyterDash(__name__, external_stylesheets=['https://codepen.io/chriddyp/pen/bWLwgP.css'])

# Layout of application.
# The page consists of a title bar, an author entry bar, an options column 
# on the left (20% width) and the tab pages with the visualisations on the 
# right (80% width). The dcc.Store components hold data shared between callbacks.
app.layout = html.Div([  
    # Store data used in callbacks.
    dcc.Store(id='key_words'),
    dcc.Store(id='all_author_stats'),
    dcc.Store(id='filtered'),
    dcc.Store(id='top'),
    
    # Title bar.
    html.Div([
        html.Div([
                html.H2("Typify a researcher or research group")
            ], style={'width': '40%', 'display': 'inline-block'}),

        html.Div([
                html.H6("Created by Jaap Stefels", style = {'text-align':'right', 'margin-right' : '20px'}),
            ], style={'width': '60%', 'display': 'inline-block'})
    ]),
    
    # Author entry bar.
    html.Div([
        dcc.Input(id='input-on-submit', type='text', placeholder = "Enter author name(s)"),
        html.Button('Search', id='submit-val', n_clicks=0, style = {'margin-left': '10px'}),
        html.Div([
            dcc.Loading(
                id="loading-1",
                type="dot",
                children=html.Div(id="loading-output-1")
            )
        ], style = {'margin-left':'40px', 'display':'inline-block'})
    ], style={'margin-bottom': '15px', 'display': 'inline-block'}),
    
    html.Div([
        # Left side options column.
        html.Div([
            # General options menu.
            html.H6("General topic extraction", style = {'font-weight': 'bold'}),
            
            # Found author(s) checkboxes.
            html.Div([
                html.Label("Found author(s)", style = {'font-weight': 'bold'}),
                dcc.Checklist(
                    id="author_select_2",
                    options=[{'label': '', 'value': '_'}],
                    value=[]
                )
            ], style = {'margin': '5px 10px'}),
            
            # Extraction method checkboxes.
            html.Div([
                html.Label("Extraction method", style = {'font-weight': 'bold'}),
                dcc.Checklist(
                    id="topic_extraction_dropdown",
                    options=[
                        {'label': 'Semantic', 'value': 'semantic'},
                        {'label': 'Syntactic', 'value': 'syntactic'},
                        {'label': 'Enhanced', 'value': 'enhanced'}  
                    ],
                    value=['semantic', 'syntactic']
                )
            ], style = {'margin': '5px 10px'}),
            
            # Year selection slider.
            html.Div([
                    html.Label("Year selection", style = {'font-weight': 'bold', "margin-left" : "10px"}),
                    dcc.RangeSlider(
                        id='my-range-slider-2',
                        min=0,
                        max=1,
                        step=1,
                        value=[0, 1],
                        pushable = True,
                        tooltip=dict(always_visible=False)
                    ),
                    html.Div(id='container_range_slider_years_2', style = {'text-align' : 'center'}) 
            ], style={'width': '70%'}),
            
            # Citation minimum input field.
            html.Div([
                html.Label("Citation minimum", style = {'font-weight': 'bold'}),
                dcc.Input(id='citation_min', type='number', value=1, placeholder = "Citation minimum")
            ], style={'margin': '5px 10px'}),

            # Citation impact dropdown menu.
            html.Div([
                html.Label("Citation impact", style = {'font-weight': 'bold'}),
                dcc.Dropdown(
                    id="citation_impact",
                    options=[
                        {'label': 'No impact', 'value': 'none'},
                        {'label': 'Multiply citation', 'value': 'multiply_cite'},
                        {'label': 'Multiply log', 'value': 'multiply_log'},
                        {'label': 'Quintile', 'value': 'quintile'}
                    ],
                    value='none',
                    clearable=False
                )  
            ], style={'margin': '5px 10px', 'width': '66%', 'display': 'inline-block'}),
            
            # Quintile citation impact input field.
            html.Div([
                html.Div([
                    html.Label("Quintile values", style = {'font-weight': 'bold'}),
                    dcc.Input(id='quantile_factor_input', type='text', value="20, 40, 60, 80", 
                              placeholder = "20, 40, 60, 80"),
                    html.Label("Recognised quintiles:"),
                    html.Div(id='quintile_values'),
                ], style={'margin': '5px 10px', 'width': '66%', 'display': 'inline-block'}),
                
            ], id='options_quintile', style = {'display' : 'block'}),

            # Options tab 1 - Graph
            html.Div([
                html.H6("Graph", style = {'font-weight': 'bold'}),

                # Amount topics plot 1 input field.
                html.Div([
                    html.Label("Amount topics plot 1", style = {'font-weight': 'bold'}),
                    dcc.Input(id='topic_num', type='number', value=5, 
                              placeholder = "Amount of topics")
                ], style = {'margin': '5px 10px'}),

                # Amount topics plot 2 input field.
                html.Div([
                    html.Label("Amount topics plot 2", style = {'font-weight': 'bold'}),
                    dcc.Input(id='topic_num_2', type='number', value=20, 
                              placeholder = "Amount of topics_2")
                ], style = {'margin': '5px 10px'}),

                # Smoothing plot 1 dropdown menu.
                html.Div([
                    html.Label("Smoothing", style = {'font-weight': 'bold'}),
                    dcc.Dropdown(
                        id="smoothing_type",
                        options=[
                            {'label': 'No smoothing', 'value': 'none'},
                            {'label': 'Historical', 'value': 'his'},
                            {'label': 'Future', 'value': 'fut'},
                            {'label': 'Historical & future', 'value': 'hisfut'},
                        ],
                        value='none',
                        clearable=False
                    ),   
                ], style={'margin': '5px 10px', 'width': '66%', 'display': 'inline-block'}),

                # Stacking plot 1 dropdown menu.
                html.Div([
                    html.Label("Stacking type", style = {'font-weight': 'bold'}),
                    dcc.Dropdown(
                        id="stacked_dropdown",
                        options=[
                                    {'label': 'Stacked', 'value': 'stacked'},
                                    {'label': 'Not stacked', 'value': 'non_stacked'}
                                ],
                        value='stacked',
                        clearable=False)
                ], style={'margin': '5px 10px', 'width': '66%', 'display': 'inline-block'})
            ], id="options_tab_1", style= {'display': 'none'}),
            
            # Options tab 2 - Word cloud.
            html.Div([
                html.H6("Word cloud", style = {'font-weight': 'bold'}),
            
                # Max words input field.
                html.Div([
                    html.Label("Max words", style = {'font-weight': 'bold'}),
                    dcc.Input(id='max_words', type='number', value=100, 
                              placeholder = "Words maximum")
                ], style = {'margin': '5px 10px'})
            ], id = 'options_tab_2', style= {'display': 'none'}),
            
            # Options tab 4 - Publication information
            html.Div([
                html.H6("Publication information ", style = {'font-weight': 'bold'}),
            
                # Topic overlay checkbox.
                html.Div([
                    html.Label("Topic overlay", style = {'font-weight': 'bold'}),
                    dcc.Checklist(
                        id="top_overlay",
                        options=[
                            {'label': 'Topic overlay', 'value': 'top_ov'},
                        ],
                        value=['top_ov']
                    )  
                ], style = {'margin': '5px 10px'})
            ], id = 'options_tab_4', style= {'display': 'none'}),
            
            # Options tab 5 - Author information
            html.Div([
                html.H6("Author information", style = {'font-weight': 'bold'}),
                
                # Author selection dropdown menu.
                html.Div([
                    html.Label("Author selection", style = {'font-weight': 'bold'}),
                    dcc.Dropdown(
                        id="author_select",
                        options=[{'label': '', 'value': '_'}],
                        value='_',
                        clearable=False
                    )
                ], style = {'margin': '5px 10px'})  
            ], id = 'options_tab_5', style= {'display': 'none'})
            
        ], style={'width': '20%', 'display': 'inline-block'}),
        
        html.Div([
            # Tab pages
            dcc.Tabs(id="all_tabs", children = [
                
                ##### Tab 1 - Topic evolution - Graphs
                dcc.Tab(label='Topic evolution graphs', value='tab-1', children=[
                    html.Div([
                        html.Div([
                            
                            # Visualisation 1
                            html.Div([
                                dcc.Graph(id='fig_topic_evolution'),
                                html.P(["This plot shows the evolution of topics over time.", html.Br(), 
                                        "Double click on one of the topic names to view the topic separately.", html.Br(),
                                        "Drag the slider underneath to zoom in on specific years."], 
                                       style = {'font-size':'15px', 'text-align':'center'})
                            ]),
                            
                            # Visualisation 2
                            html.Div([
                                dcc.Graph(id='fig_topic_count'),
                                html.P(["This plot shows the occurence count of the found topics.", 
                                        html.Br(), "Drag the slider underneath to zoom in on specific topics."], 
                                       style = {'font-size':'15px', 'text-align':'center'})
                            ])
                        ], style={'vertical-align': 'top'})
                    ])
                ]),

                ##### Tab 2 - Topic evolution - Word cloud
                dcc.Tab(label='Word cloud', value='tab-2', children=[
                    html.Div([
                        
                        # Visualisation 3
                        html.Div([
                            html.Img(id="image_wc", style = {'margin':'50px 100px'}),
                            html.P(["This word cloud visualises the occurence count of the found topics.",
                                   html.Br(), "Drag the slider within the options menu to select specific years."],
                                   style = {'font-size':'15px', 'text-align':'center'})
                        ])
                    ], style={'width': '80%', 'display': 'inline-block'})
                ]),
                
                ##### Tab 3 - Topic table
                dcc.Tab(label='Topic table', value='tab-3', children=[
                    
                    # Visualisation 4
                    html.Div([
                        dash_table.DataTable(
                            id='table',
                            columns= [{"name": "Publication title", "id": "Publication title"}, 
                                      {"name": "Year", "id" : "Year"},
                                      {"name": "Cited", "id" : "Cited"},
                                      {"name": "Topics", "id" : "Topics"},],
                            data= [{}],
                            style_cell={'textAlign': 'left'},
                            style_data={
                                'whiteSpace': 'normal',
                                'height': 'auto',
                            },
                            
                            style_as_list_view=True,
                            sort_action="native",
                            # The column_id values have to match the "id" of the 
                            # entries in columns above for a width to be applied.
                            style_cell_conditional=[
                                {'if': {'column_id': 'Publication title'},
                                 'width': '40%'},
                                {'if': {'column_id': 'Year'},
                                 'width': '10%'},
                                {'if': {'column_id': 'Cited'},
                                 'width': '10%'},
                                {'if': {'column_id': 'Topics'},
                                 'width': '40%'},
                            ],
                            style_table={'height': '600px', 'overflowY': 'auto'}
                        )
                    ], style={'margin-top': '15px'})
                ]),
                 
                ##### Tab 4 - Publication information
                dcc.Tab(label='Publication information', value='tab-4', children=[  
                    html.Div([
                        
                        # Visualisation 5
                        html.Div([
                            dcc.Graph(id='fig_articles_year'),
                            html.P(["This plot shows the amount of publications published per year.", 
                                    html.Br(), "Drag the slider underneath to zoom in on specific years."], 
                                   style = {'font-size':'15px', 'text-align':'center'})
                        ]),
                        
                        # Visualisation 6
                        html.Div([
                            dcc.Graph(id='fig_cites_year'),
                            html.P(["This plot shows the amount of combined citations of all publications published in a specific publication year.",
                                    html.Br(), "The topic overlay option shows the distribution of topics found within these publications.", 
                                    html.Br(), "Drag the slider underneath to zoom in on specific years."]
                                   , style = {'font-size':'15px', 'text-align':'center'})
                        ])
                    ], style={"text-align":'center'})
                ]),
                
                ##### Tab 5 - Author information
                dcc.Tab(label='Author information', value='tab-5', children=[
                    
                    # Visualisation 7
                    html.Div([
                        dcc.Graph(id='author_fig'),
                        html.P(["This table shows author information and statistics.", 
                                html.Br(), "In case of multiple authors, select the desired author\
                                within the options menu."], 
                               style = {'font-size':'15px', 'text-align':'center'})
                    ]),
                    
                    # Visualisation 8
                    html.Div([
                        dcc.Graph(id='fig_cites_year_google'),
                        html.P(["This plot shows the citation counts per year.", 
                                html.Br(), "Drag the slider underneath to zoom in on specific years."], 
                               style = {'font-size':'15px', 'text-align':'center'})
                    ])
                ])
            ])
        ], style={'width': '80%', 'display': 'inline-block', 'vertical-align': 'top'})
    ], id='options_and_tabs', style = {'display':'block'}),
    
    # Error message when no author(s) found.
    html.Div(id='error_message', style = {'display':'none'})
])

@app.callback(
    Output('key_words', 'data'),
    Output('all_author_stats', 'data'),
    Output("loading-output-1", "children"),
    Input('submit-val', 'n_clicks'),
    State('input-on-submit', 'value')
)
def update_output(n_clicks, author_input):
    """
    Returns a list containing the extracted keywords and a list 
    containing the retrieved author statistics for every author 
    present in the input query. 

    author_input is a comma separated string of author names. Authors 
    for which no publications are found are reported with a printed 
    message and left out of both lists. The third return value is an 
    empty string for the loading indicator output.

    No update takes place when the query holds no author name at all.
    """
    
    # Error handling.
    if author_input is None:
        raise dash.exceptions.PreventUpdate
    
    # Extract authors from author input query. Empty names (caused by 
    # an empty query or a stray comma) are not worth a search request.
    author_list = [name.strip() for name in author_input.split(",") if name.strip()]
    
    if not author_list:
        raise dash.exceptions.PreventUpdate
    
    all_key_words = []
    all_author_stats = []
    
    # For every extracted author, search for key_words and author statistics. 
    for author in author_list:
        start = time.time()
        title_dict, author_stats = search_pub_titles(author)
       
        if title_dict:
            print("\n{} publications found for author {}.".format(len(title_dict), author),
                 "Searching for key words.")

            # Extract data
            key_words = extract_keywords(title_dict, workers = 4, modules = 'both', enhancement = "first")
            end = time.time()

            print("Searching took {} seconds. \n".format((end - start)))
            all_key_words.append(key_words)
            all_author_stats.append(author_stats)
 
        else:
            print("\nNo publications found for author {}. Did you spell the name correctly?".format(author))
            
    return all_key_words, all_author_stats, ""

@app.callback(
    Output('quintile_values', 'children'),
    Input('quantile_factor_input', 'value')
)
def callback_quintiles(quintile_input): 
    """
    Returns the detected quintile values to the user.

    quintile_input is the comma separated text of the quintile input 
    field. Every part accepted by RepresentsInt is kept; the kept 
    values are returned as the text of a sorted list. No update takes
    place when the field is empty or holds no accepted value.
    """
    
    # Error handling: the field may hold no text at all.
    if not quintile_input:
        raise dash.exceptions.PreventUpdate
    
    quintile_values = [int(i) for i in quintile_input.split(",") 
                       if RepresentsInt(i)]
    
    # Error handling.
    if not quintile_values:
        raise dash.exceptions.PreventUpdate
        
    return '{}'.format(sorted(quintile_values))

@app.callback(
    Output('author_select', 'value'),
    Output('author_select', 'options'),
    Output('author_select_2', 'value'),
    Output('author_select_2', 'options'),
    Input('all_author_stats', 'data')
)
def callback_create_author_table(all_author_stats):
    """
    Updates the "Found author(s)" and "Author selection" options
    with the recognised authors.

    Every author gets an option labelled with the author's name whose
    value is the author's position in all_author_stats (as a string).
    'author_select' is reset to the first author, 'author_select_2'
    gets all authors selected.
    """
    # NOTE(review): a later callback in this file is defined under the same
    # function name; consider renaming one of them to avoid confusion.

    # Error handling: covers both a missing (None) and an empty result list.
    if not all_author_stats:
        raise dash.exceptions.PreventUpdate

    options = [{'label': stats['Name'], 'value': str(i)}
               for i, stats in enumerate(all_author_stats)]

    return '0', options, [option['value'] for option in options], options

@app.callback(
    Output('my-range-slider-2', 'min'),
    Output('my-range-slider-2', 'max'),
    Output('my-range-slider-2', 'value'),
    Input('key_words', 'data')
)
def callback_prepare_range_slider(key_words):
    """
    Derives the bounds of the "year selection" slider from the
    publication years found in key_words.

    Returns the lowest year, the highest year and the initially
    selected range (which spans both bounds).
    """

    # Error handling.
    if key_words is None:
        raise dash.exceptions.PreventUpdate

    if len(key_words) == 0:
        raise dash.exceptions.PreventUpdate

    # Collect every distinct publication year over all authors.
    distinct_years = set()

    for publications in key_words:
        for title in publications:
            distinct_years.add(int(publications[title]['pub_year']))

    usable_years = filter_years(list(distinct_years))
    first_year = min(usable_years)
    last_year = max(usable_years)

    return first_year, last_year, [first_year, last_year]

@app.callback(
    Output('container_range_slider_years_2', 'children'),
    Input('my-range-slider-2', 'value')
)
def callback_edit_year_range(year_value):
    """
    Provides the text shown underneath the "year selection" slider.

    The range [0, 1] is rendered as dashes instead of years.
    """

    placeholder_range = [0, 1]

    if year_value != placeholder_range:
        return '{} to {}'.format(year_value[0], year_value[1])

    return '- to -'

@app.callback(
    Output('filtered', 'data'),
    Output('top', 'data'),
    Output('table', 'data'),
    Input('key_words', 'data'),
    Input('citation_min', 'value'),
    Input('topic_extraction_dropdown', 'value'),
    Input('citation_impact', 'value'),
    Input('quantile_factor_input', 'value'),
    Input('author_select_2', 'value'),
    Input('my-range-slider-2', 'value')
)
def callback_preprocess(key_words, citation, extraction_types, citation_impact,
                        quintile_input, author_selection, year_range):
    """
    This function selects, filters and processes the key_words dictionary for
    the desired author(s). This dictionary will be used for later visualisations.

    Returns the filtered publications, the topics with their frequency
    (most frequent first) and the rows of the publication table. The update
    is prevented while a required input is missing or empty.
    """

    # Error handling. The None checks come first, because the quintile
    # input has to be a string before it can be split.
    if (key_words is None or citation is None or author_selection is None
            or quintile_input is None):
        raise dash.exceptions.PreventUpdate

    quintile_values = [int(i) for i in quintile_input.split(",") if RepresentsInt(i)]

    # "not" covers both an empty and a missing (None) selection.
    if not extraction_types or not quintile_values or not author_selection:
        raise dash.exceptions.PreventUpdate

    # Select and filter key_words for the selected authors.
    # NOTE(review): the meaning of the constants 1 and 40 is defined by
    # filter_key_words, which is not visible here.
    key_words = select_key_words(key_words, author_selection)
    filtered = filter_key_words(key_words, extraction_types, quintile_values, year_range,
                                1, 40, citation, citation_impact)

    # Create a list which contains the frequency of every topic.
    top = Counter([word for title in filtered for word in filtered[title]['key_words']]).most_common()

    # One table row per remaining publication.
    table_rows = []

    for title in filtered:
        publication = filtered[title]
        table_rows.append({
            "Publication title": title,
            "Year": publication['pub_year'],
            "Cited": publication['num_citations'],
            "Topics": str(publication['key_words']),
        })

    return filtered, top, table_rows

@app.callback(
    Output('fig_topic_count', 'figure'),
    Output('fig_topic_evolution', 'figure'),
    Output('fig_cites_year', 'figure'),
    Output('fig_articles_year', 'figure'),
    Input('filtered', 'data'),
    Input('top', 'data'),
    Input('topic_num', 'value'),
    Input('topic_num_2', 'value'),
    Input('stacked_dropdown', 'value'),
    Input('smoothing_type', 'value'),
    Input('top_overlay', 'value')
)
def callback_create_figs(filtered, top, topic_num, topic_num_2, stack_type, smoothing_type, topic_overlay):
    """
    Creates visualisations 1, 2, 5, 6.

    Returns, in this order, the topic count figure, the topic evolution
    figure, the citations per year figure and the articles per year figure.
    The update is prevented while the filtered publications, the topic
    frequencies or one of the topic amounts are missing or empty.
    """

    # Error handling.
    if (filtered is None or top is None or topic_num is None or topic_num_2 is None or
            len(filtered) == 0 or len(top) == 0):
        raise dash.exceptions.PreventUpdate

    # Add smoothing to visualisation 1.
    df = generate_occurence_dataframe(filtered, dict(top[:topic_num]))
    kernels = {"none" : [1], "his" : [0.5,1,0], "fut" : [0,1,0.5], "hisfut" : [0.5,1,0.5]}
    kernel = kernels[smoothing_type]
    centre = (len(kernel) - 1) // 2

    def smooth(column):
        # np.convolve(..., 'same') returns max(len(column), len(kernel))
        # values, so a column shorter than the kernel would come back longer
        # than it went in and break df.transform. Slicing the centre out of
        # the 'full' result gives the same values for longer columns and
        # always keeps the column length.
        return np.convolve(column, kernel, 'full')[centre:centre + len(column)]

    smoothed_df = df.transform(smooth)

    # Create the visualisations.
    fig_topic_evolution = plot_topic_evolution(smoothed_df, stacking = stack_type)
    fig_topic_count = plot_occurences(dict(top[:topic_num_2]), 'Topic', 'Count', "Topic count of top-{} topics".format(topic_num_2))
    article_count = dict(Counter([int(filtered[title]['pub_year']) for title in filtered]).most_common())
    fig_articles_year = plot_occurences(article_count, 'Year', 'Count', "Amount of articles published per year")
    fig_cites_year = create_citation_figure(filtered, topic_num, topic_overlay, top)

    return fig_topic_count, fig_topic_evolution, fig_cites_year, fig_articles_year

@app.callback(
    Output('image_wc', 'src'),
    Input('filtered', 'data'),
    Input('max_words', 'value'),
)
def callback_create_word_cloud(filtered, max_words):
    """
    Creates the word cloud (visualisation 3) and returns it as a
    base64 encoded PNG data URI.
    """

    # Error handling.
    if filtered is None:
        raise dash.exceptions.PreventUpdate

    if len(filtered) == 0:
        raise dash.exceptions.PreventUpdate

    # Count how often every topic occurs over all publications.
    topic_counts = Counter()

    for title in filtered:
        for word in filtered[title]['key_words']:
            topic_counts[word] += 1

    frequencies = dict(topic_counts.most_common())

    # Render the cloud into an in-memory PNG.
    buffer = BytesIO()
    cloud_image = plot_wordcloud(frequencies, max_words)
    cloud_image.save(buffer, format='PNG')
    encoded_png = base64.b64encode(buffer.getvalue()).decode()

    return 'data:image/png;base64,{}'.format(encoded_png)

@app.callback(
    Output('author_fig', 'figure'),
    Output('fig_cites_year_google', 'figure'),
    Input('all_author_stats', 'data'),
    Input('author_select', 'value')
)
def callback_create_author_table(all_author_stats, author_value):
    """
    Creates the author figure and the citations per year figure for the
    selected author (Visualisation 4).

    author_value is the position of the selected author in
    all_author_stats, given as a string. The update is prevented while no
    author is selected or no author statistics are available.
    """
    # NOTE(review): an earlier callback in this file is defined under the same
    # function name; consider renaming one of them to avoid confusion.

    # Error handling: no selection (None or the '_' value) or no statistics
    # (None or empty list).
    if author_value is None or author_value == '_' or not all_author_stats:
        raise dash.exceptions.PreventUpdate

    selected_stats = all_author_stats[int(author_value)]

    # Creating the figures.
    author_fig = create_author_fig(selected_stats)
    cites_per_year = selected_stats['cites_per_year']
    title = "Citations counts per year"
    fig_cites_year_google = plot_occurences(cites_per_year, 'Year', 'Count', title)

    return author_fig, fig_cites_year_google

@app.callback(
    Output(component_id='options_tab_1', component_property='style'),
    Output(component_id='options_tab_2', component_property='style'),
    Output(component_id='options_tab_4', component_property='style'),
    Output(component_id='options_tab_5', component_property='style'),
    Input('all_tabs', 'value')
)
def callback_tabs(tab):
    """
    Shows the options menu that belongs to the selected tab page and
    hides all other options menus.
    """

    # Position (in the order of the outputs) of the options menu shown for
    # every tab; None means that no options menu is shown for that tab.
    menu_per_tab = (('tab-1', 0), ('tab-2', 1), ('tab-3', None),
                    ('tab-4', 2), ('tab-5', 3))

    # Any other value (start up of application) shows the options of tab 1.
    visible_menu = next((menu for name, menu in menu_per_tab if tab == name), 0)

    return tuple({'display': 'block' if position == visible_menu else 'none'}
                 for position in range(4))

@app.callback(
    Output(component_id='options_quintile', component_property='style'),
    Input('citation_impact', 'value')
)
def callback_quintile(impact_option):
    """
    Makes the quintile input field visible only when "quintile" is
    the selected citation impact function.
    """

    visibility = 'block' if impact_option == 'quintile' else 'none'

    return {'display': visibility}
    
@app.callback(
    Output(component_id='options_and_tabs', component_property='style'),
    Output(component_id='error_message', component_property='style'),
    Output(component_id='error_message', component_property='children'),
    Input('all_author_stats', 'data'),
    State('input-on-submit', 'value')
)
def callback_no_author(all_author_stats, author_input):
    """
    Show error message if no authors are found.

    With an empty list of author statistics the options and tabs are
    hidden and the error message (containing the original query) is shown.
    Otherwise the options and tabs are shown and the error message is
    hidden. Nothing is updated while no author statistics exist at all.
    """

    # Error handling: no statistics available (None), so there is
    # nothing to report.
    if all_author_stats is None:
        raise dash.exceptions.PreventUpdate

    if len(all_author_stats) == 0:
        # Adjacent literals instead of a backslash continuation inside the
        # string, which would embed the indentation in the message.
        error_message = ("No publications found for author(s) {}. "
                         "Use a comma to separate multiple authors.").format(author_input)
        return {'display': 'none'}, {'display': 'block', 'margin-left':'10px'}, error_message

    return {'display': 'block'}, {'display': 'none'}, "-"
    
# Start the application; it is opened through the hyperlink printed below this cell.
app.run_server(
    mode='external',
    port=8900,
    debug=False,
    threaded=False,
    use_reloader=False,
    dev_tools_ui=False,
    dev_tools_hot_reload=False,
)