# Basic data viz with more papers

## Imports

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from ast import literal_eval

from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objs as go

from startupjh import data_preprocess

%load_ext autoreload

%autoreload 2

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/commander/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Attention: when dataframes are exported as `.csv` files, the list of key words is converted into a string.**

**After importing the `.csv` files, the `key_words` column must be converted back to a list using `literal_eval` (see below).**

## Data loading

In [2]:
# The CSV export stores each key-word list as its string representation,
# so every cell of `key_words` is parsed back into a Python list on load.
papers_df = pd.read_csv("../data/more_papers.csv").assign(
    key_words=lambda frame: frame["key_words"].apply(literal_eval)
)
citing_papers_df = pd.read_csv("../data/more_citing_papers.csv").assign(
    key_words=lambda frame: frame["key_words"].apply(literal_eval)
)

In [3]:
# Preview the first three rows of the primary papers.
papers_df.head(3)

Unnamed: 0.1,Unnamed: 0,paper_id,title,result_id,link,snippet,resources_title,resources_link,citation_count,cites_id,versions,cluster_id,full_citation,key_words,authors,pub_info,year
0,0,0,A multi-agent system for the automation of a p...,qljx-SC6MYEJ,https://www.academia.edu/download/48859036/A_m...,This paper presents a system architecture whic...,academia.edu,https://www.academia.edu/download/48859036/A_m...,74,9309426555546589354,4,9309426555546589354,"Rebollo, Miguel, et al. ""A multi-agent system ...","[multiagent, system, automation, port, contain...","Rebollo, Miguel, et al.",Workshop in Agents in Industry. Barcelona. 2000.,2000
1,1,1,Automation in port container terminals,hQkbtalB1OAJ,https://www.sciencedirect.com/science/article/...,… “ Business Process Model and Notation (BPMN)...,sciencedirect.com,https://www.sciencedirect.com/science/article/...,49,16200645956702243205,3,16200645956702243205,"Martín-Soberón, Ana María, et al. ""Automation ...","[automation, port, container, terminals]","Martín-Soberón, Ana María, et al.",Procedia-Social and Behavioral Sciences 160 (2...,2014
2,2,2,Container port automation,0q7hE6UAsyEJ,https://link.springer.com/content/pdf/10.1007/...,… The Patrick Fisherman's Island terminal is n...,no data,no data,13,2428285333085990610,6,2428285333085990610,"Nelmes, Graeme. ""Container port automation."" F...","[container, port, automation]","Nelmes, Graeme.","Field and Service Robotics. Springer, Berlin, ...",2006


In [4]:
# Preview the first three rows of the citing papers.
citing_papers_df.head(3)

Unnamed: 0.1,Unnamed: 0,paper_id,citing_paper_id,title,result_id,link,snippet,resources_title,resources_link,citation_count,cites_id,versions,cluster_id,full_citation,key_words,authors,pub_info,year
0,0,0,0,MicroPort: A general simulation platform for s...,o4mmoriNu70J,https://www.sciencedirect.com/science/article/...,Seaport container terminals are essential node...,no data,no data,86,13671676917955594659,4,13671676917955594659,"Sun, Zhuo, et al. ""MicroPort: A general simula...","[microport, general, simulation, platform, sea...","Sun, Zhuo, et al.",Advanced Engineering Informatics 26.1 (2012): ...,2012
1,1,1,0,Agent-based simulation of stakeholders relatio...,p-ou00crS34J,https://citeseerx.ist.psu.edu/viewdoc/download...,Port management is often faced with many vexin...,psu.edu,https://citeseerx.ist.psu.edu/viewdoc/download...,75,9100415059517958823,7,9100415059517958823,"Henesey, Lawrence, Theo Notteboom, and Paul Da...","[agentbased, simulation, stakeholders, relatio...","Henesey, Lawrence, Theo Notteboom, and Paul Da...",Multi-Agent Systems for Container Terminal Man...,2003
2,2,2,0,Agent based simulation architecture for evalua...,bnLMKYOdJ9YJ,https://link.springer.com/article/10.1007/s104...,An agent based simulator for evaluating operat...,sc.edu,https://jmvidal.cse.sc.edu/library/henesey09b.pdf,59,15431475834875834990,11,15431475834875834990,"Henesey, Lawrence, Paul Davidsson, and Jan A. ...","[agent, based, simulation, architecture, evalu...","Henesey, Lawrence, Paul Davidsson, and Jan A. ...",Autonomous Agents and Multi-Agent Systems 18.2...,2009


## Data visualisation

**Let's code 3 functions that plot:**

1. Number of publications per year
2. Number of citations per year
3. Most common key words

In [66]:
# Number of publications per year
def plot_publications_per_year(df1, df2, year_range=(1990, 2024)):
    """Plot the number of publications per year for two sets of papers.

    Builds a one-row, two-column bar chart: `df1` is drawn in the left
    panel ("Primary papers") and `df2` in the right panel ("Citing papers").

    Args:
        df1: DataFrame with a `year` column and a `paper_id` column.
        df2: DataFrame with the same two columns.
        year_range: `(first, last)` year shown on both x-axes; ticks start
            at `first`. Defaults to `(1990, 2024)`, the window that was
            previously hard-coded.

    Returns:
        The plotly figure containing both bar charts.
    """
    fig = make_subplots(rows=1, cols=2,
                        y_title='Number of publications',
                        subplot_titles=('Primary papers', 'Citing papers'))

    for col, df in enumerate((df1, df2), start=1):
        # Aggregate once per frame; counting `paper_id` yields the number of
        # rows with a non-null paper_id in each year.
        per_year = df.groupby("year", as_index=False)["paper_id"].count()
        fig.add_trace(
            go.Bar(x=per_year["year"], y=per_year["paper_id"]),
            row=1, col=col
        )

    first_year, last_year = year_range
    fig.update_xaxes(
        title_text="Year",
        range=[first_year, last_year],
        tick0=first_year,
        dtick=5,
        ticks='outside'
    )

    fig.update_yaxes(ticks='outside')

    fig.update_layout(showlegend=False, template="seaborn")

    return fig

In [67]:
# Publications per year: primary papers (left) vs. citing papers (right).
plot_publications_per_year(papers_df, citing_papers_df)

In [68]:
# Number of citations per year
def plot_citations_per_year(df1, df2, year_range=(1990, 2024)):
    """Plot the summed citation count per publication year for two sets of papers.

    Builds a one-row, two-column bar chart: `df1` is drawn in the left
    panel ("Primary papers") and `df2` in the right panel ("Citing papers").
    Each bar is the sum of `citation_count` over the rows sharing a `year`.

    Args:
        df1: DataFrame with a `year` column and a `citation_count` column.
        df2: DataFrame with the same two columns.
        year_range: `(first, last)` year shown on both x-axes; ticks start
            at `first`. Defaults to `(1990, 2024)`, the window that was
            previously hard-coded.

    Returns:
        The plotly figure containing both bar charts.
    """
    fig = make_subplots(rows=1, cols=2,
                        y_title='Number of citations',
                        subplot_titles=('Primary papers', 'Citing papers'))

    for col, df in enumerate((df1, df2), start=1):
        # Aggregate once per frame and reuse the result for both axes.
        per_year = df.groupby("year", as_index=False)["citation_count"].sum()
        fig.add_trace(
            go.Bar(x=per_year["year"], y=per_year["citation_count"]),
            row=1, col=col
        )

    first_year, last_year = year_range
    fig.update_xaxes(
        title_text="Year",
        range=[first_year, last_year],
        tick0=first_year,
        dtick=5,
        ticks='outside'
    )

    fig.update_yaxes(ticks='outside')

    fig.update_layout(showlegend=False, template="seaborn")

    return fig

In [69]:
# Summed citations per publication year: primary papers (left) vs. citing papers (right).
plot_citations_per_year(papers_df, citing_papers_df)

In [93]:
def get_most_common_key_words(df, excluded_words=("container", "automation", "terminal")):
    """Count how often each key word occurs across all rows of `df`.

    Args:
        df: DataFrame whose `key_words` column holds an iterable of words
            per row (a list once `literal_eval` has been applied to the
            CSV text).
        excluded_words: Key words to leave out of the result. Defaults to
            the three terms that were previously hard-coded.

    Returns:
        DataFrame with columns `key_word` and `occurence`, ordered from the
        most to the least frequent word, without the excluded words. The
        index keeps each word's position in the full ranking, so it may
        contain gaps where excluded words were removed.
    """
    # Walk the column directly instead of materialising every row.
    word_counts = Counter(word for words in df["key_words"] for word in words)
    ranking = pd.DataFrame(word_counts.most_common(),
                           columns=["key_word", "occurence"])
    # Boolean filtering returns a new frame rather than dropping rows in place.
    return ranking[~ranking["key_word"].isin(list(excluded_words))]

In [200]:
def plot_most_common_words(df1, df2, primary_threshold=1, citing_threshold=5):
    """Plot the most common key words for two sets of papers side by side.

    Key-word counts come from `get_most_common_key_words`. `df1` is drawn
    in the left panel ("Primary papers") and `df2` in the right panel
    ("Citing papers"); each panel only shows words whose count is strictly
    greater than its threshold.

    Args:
        df1: DataFrame accepted by `get_most_common_key_words`.
        df2: DataFrame accepted by `get_most_common_key_words`.
        primary_threshold: A word from `df1` is shown only if it occurs
            more than this many times. Defaults to 1.
        citing_threshold: A word from `df2` is shown only if it occurs
            more than this many times. Defaults to 5.

    Returns:
        The plotly figure containing both bar charts.
    """
    fig = make_subplots(rows=1, cols=2,
                        vertical_spacing=0.1,
                        y_title="occurences",
                        subplot_titles=('Primary papers', 'Citing papers'))

    # One entry per panel: (data, threshold, bar fill colour, bar outline colour).
    panels = (
        (df1, primary_threshold,
         'rgba(247, 129, 191, 0.6)', 'rgba(246, 78, 139, 1.0)'),
        # The outline carries an alpha component, so it is spelled `rgba(...)`
        # like the other colours rather than `rgb(...)` with four values.
        (df2, citing_threshold,
         'rgba(51, 51, 255, 0.6)', 'rgba(51, 51, 255, 1.0)'),
    )

    for col, (df, threshold, fill_color, line_color) in enumerate(panels, start=1):
        key_words_df = get_most_common_key_words(df)
        # Filter once and reuse the result for both axes.
        frequent = key_words_df[key_words_df["occurence"] > threshold]
        fig.add_trace(
            go.Bar(x=frequent["key_word"],
                   y=frequent["occurence"],
                   marker=dict(
                       color=fill_color,
                       line=dict(color=line_color, width=3)
                   )),
            row=1, col=col
        )

    fig.update_xaxes(title_text="Key words", gridcolor="rgb(102,102,102)")

    fig.update_yaxes(ticks='outside', gridcolor="rgb(102,102,102)")

    fig.update_layout(showlegend=False,
                      template="seaborn",
                      width=1200, height=600,
                      plot_bgcolor="rgb(204,204,204)",
                      title_text="Most common key words")

    return fig

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/commander/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [201]:
# Most common key words: primary papers (left) vs. citing papers (right).
plot_most_common_words(papers_df, citing_papers_df)