# Test notebook for KPI dev

## Imports

In [102]:
import pandas as pd
import numpy as np
import re

from startupjh import utils
from startupjh import plots

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data loading

In [94]:
papers_df = utils.load_from_csv("../data/more_papers.csv")

In [95]:
citing_papers_df = utils.load_from_csv("../data/more_citing_papers.csv")

In [96]:
papers_df.head()

Unnamed: 0.1,Unnamed: 0,paper_id,title,result_id,link,snippet,resources_title,resources_link,citation_count,cites_id,versions,cluster_id,full_citation,key_words,authors,pub_info,year
0,0,0,A multi-agent system for the automation of a p...,qljx-SC6MYEJ,https://www.academia.edu/download/48859036/A_m...,This paper presents a system architecture whic...,academia.edu,https://www.academia.edu/download/48859036/A_m...,74,9309426555546589354,4,9309426555546589354,"Rebollo, Miguel, et al. ""A multi-agent system ...","[multiagent, system, automation, port, contain...","Rebollo, Miguel, et al.",Workshop in Agents in Industry. Barcelona. 2000.,2000
1,1,1,Automation in port container terminals,hQkbtalB1OAJ,https://www.sciencedirect.com/science/article/...,… “ Business Process Model and Notation (BPMN)...,sciencedirect.com,https://www.sciencedirect.com/science/article/...,49,16200645956702243205,3,16200645956702243205,"Martín-Soberón, Ana María, et al. ""Automation ...","[automation, port, container, terminals]","Martín-Soberón, Ana María, et al.",Procedia-Social and Behavioral Sciences 160 (2...,2014
2,2,2,Container port automation,0q7hE6UAsyEJ,https://link.springer.com/content/pdf/10.1007/...,… The Patrick Fisherman's Island terminal is n...,no data,no data,13,2428285333085990610,6,2428285333085990610,"Nelmes, Graeme. ""Container port automation."" F...","[container, port, automation]","Nelmes, Graeme.","Field and Service Robotics. Springer, Berlin, ...",2006
3,3,3,TRACES: TRAFFIC CONTROL ENGINEERING SYSTEM A c...,wgi33W6YplUJ,https://citeseerx.ist.psu.edu/viewdoc/download...,In this study a control system to coordinate t...,psu.edu,https://citeseerx.ist.psu.edu/viewdoc/download...,27,6171787941291428034,6,6171787941291428034,"Duinkerken, Mark B., Joseph JM Evers, and Jaap...","[traces, traffic, control, engineering, system...","Duinkerken, Mark B., Joseph JM Evers, and Jaap...",signal 2 (1999): v3.,1999
4,4,4,Multi-agent system technology in a port contai...,Q4_ut9Yo1yoJ,https://www.researchgate.net/profile/V-Botti/p...,In response to the arrival of a ship (ship age...,researchgate.net,https://www.researchgate.net/profile/V-Botti/p...,13,3086980972259741507,no data,no data,"Botti, Vicent J. ""Multi-agent system technolog...","[multiagent, system, technology, port, contain...","Botti, Vicent J.",ERCIM News 56 (2004): 37-39.,2004


In [10]:
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       20 non-null     int64 
 1   paper_id         20 non-null     int64 
 2   title            20 non-null     object
 3   result_id        20 non-null     object
 4   link             20 non-null     object
 5   snippet          20 non-null     object
 6   resources_title  20 non-null     object
 7   resources_link   20 non-null     object
 8   citation_count   20 non-null     int64 
 9   cites_id         20 non-null     object
 10  versions         20 non-null     object
 11  cluster_id       20 non-null     object
 12  full_citation    20 non-null     object
 13  key_words        20 non-null     object
 14  authors          20 non-null     object
 15  pub_info         20 non-null     object
 16  year             20 non-null     int64 
dtypes: int64(4), object(13)
memory usage:

## Get most active author

In [97]:
def get_most_active_author(df):
    # Separates authors from authors list for each paper
    authors_list = []
    for _, row in df.iterrows():
        authors_list.append(row.authors.split(","))
    # Flatten the list
    flat_authors_list = utils.flatten_list(authors_list)
    # Strip authors from blank spaces
    stripped_list = list(map(str.strip, flat_authors_list))
    # Remove "et al." from list
    words_to_remove = ["et al.", "and "]
    result = filter(lambda val: val != words_to_remove[0], stripped_list)
    list_authors = list(result)
    # Remove "and" from authors
    final_author_list = [i.strip("and ") for i in list_authors]
    authors_sorted = Counter(final_author_list).most_common()
    authors_sorted_df = pd.DataFrame(authors_sorted, columns=["author", "occurence"])
    return authors_sorted_df

In [98]:
get_most_active_author(citing_papers_df).head(5)

Unnamed: 0,author,occurence
0,Wang,5
1,Henesey,4
2,Duinkerke,4
3,Ko,4
4,W. K.,4


## Get most active journal

In [114]:
# Get rid of the year and any thing after it
list_journals = []
for _, row in papers_df.iterrows():
    list_journals.append(re.split(r' \d{4}| \(\d{4}\)', row.pub_info)[0])
list_journals

['Workshop in Agents in Industry. Barcelona.',
 'Procedia-Social and Behavioral Sciences 160',
 'Field and Service Robotics. Springer, Berlin, Heidelberg,',
 'signal 2',
 'ERCIM News 56',
 'Maritime Economics & Logistics 21.4',
 'International Journal of Hybrid Information 7.2',
 'Journal of Navigation and Port Research 42.3',
 'Mathematical Problems in Engineering',
 'New Technology, Work and Employment 33.3',
 'Sustainability 13.11',
 'MS thesis',
 'Journal of Navigation and Port Research 42.3',
 'Computers & Industrial Engineering 156',
 'IOP Conference Series: Earth and Environmental Science. Vol. 557. No. 1. IOP Publishing,',
 'Academic Journal of Engineering and Technology Science 2.1',
 'Process Automation Instrumentation 3',
 'Logistics 4.1',
 'Handbook of Ocean Container Transport Logistics',
 'Transportation Research Circular 459']

In [126]:
# Get rid of all digits
final_list_journals = []
for e in list_journals:
    final_list_journals.append(re.split(r'(\d+)', e)[0].strip())
final_list_journals

['Workshop in Agents in Industry. Barcelona.',
 'Procedia-Social and Behavioral Sciences',
 'Field and Service Robotics. Springer, Berlin, Heidelberg,',
 'signal',
 'ERCIM News',
 'Maritime Economics & Logistics',
 'International Journal of Hybrid Information',
 'Journal of Navigation and Port Research',
 'Mathematical Problems in Engineering',
 'New Technology, Work and Employment',
 'Sustainability',
 'MS thesis',
 'Journal of Navigation and Port Research',
 'Computers & Industrial Engineering',
 'IOP Conference Series: Earth and Environmental Science. Vol.',
 'Academic Journal of Engineering and Technology Science',
 'Process Automation Instrumentation',
 'Logistics',
 'Handbook of Ocean Container Transport Logistics',
 'Transportation Research Circular']

In [128]:
# Splitting by "." if any
journals = []
for e in final_list_journals:
    journals.append(re.split(r'\.', e)[0])
journals

['Workshop in Agents in Industry',
 'Procedia-Social and Behavioral Sciences',
 'Field and Service Robotics',
 'signal',
 'ERCIM News',
 'Maritime Economics & Logistics',
 'International Journal of Hybrid Information',
 'Journal of Navigation and Port Research',
 'Mathematical Problems in Engineering',
 'New Technology, Work and Employment',
 'Sustainability',
 'MS thesis',
 'Journal of Navigation and Port Research',
 'Computers & Industrial Engineering',
 'IOP Conference Series: Earth and Environmental Science',
 'Academic Journal of Engineering and Technology Science',
 'Process Automation Instrumentation',
 'Logistics',
 'Handbook of Ocean Container Transport Logistics',
 'Transportation Research Circular']

In [129]:
# Sort journals by occurence and store in dataframe
journals_sorted = Counter(journals).most_common()
journals_sorted_df = pd.DataFrame(journals_sorted, columns=["journal", "occurence"])

In [130]:
journals_sorted_df

Unnamed: 0,journal,occurence
0,Journal of Navigation and Port Research,2
1,Workshop in Agents in Industry,1
2,Procedia-Social and Behavioral Sciences,1
3,Field and Service Robotics,1
4,signal,1
5,ERCIM News,1
6,Maritime Economics & Logistics,1
7,International Journal of Hybrid Information,1
8,Mathematical Problems in Engineering,1
9,"New Technology, Work and Employment",1


In [132]:
def get_most_active_journal(df):
    # Get rid of the year and any thing after it
    list_journals = []
    for _, row in df.iterrows():
        list_journals.append(re.split(r' \d{4}| \(\d{4}\)', row.pub_info)[0])
    # Get rid of all digits
    final_list_journals = []
    for e in list_journals:
        final_list_journals.append(re.split(r'(\d+)', e)[0].strip())
    # Splitting by "." if any
    journals = []
    for e in final_list_journals:
        journals.append(re.split(r'\.', e)[0])
    # Sort journals by occurence and store in dataframe
    journals_sorted = Counter(journals).most_common()
    journals_sorted_df = pd.DataFrame(journals_sorted, columns=["journal", "occurence"])
    return journals_sorted_df

In [135]:
get_most_active_journal(papers_df)

Unnamed: 0,journal,occurence
0,Journal of Navigation and Port Research,2
1,Workshop in Agents in Industry,1
2,Procedia-Social and Behavioral Sciences,1
3,Field and Service Robotics,1
4,signal,1
5,ERCIM News,1
6,Maritime Economics & Logistics,1
7,International Journal of Hybrid Information,1
8,Mathematical Problems in Engineering,1
9,"New Technology, Work and Employment",1


In [140]:
indices_to_remove = []
for index, e in citing_papers_df.pub_info.iteritems():
    if type(e) == str:
        if e.startswith(r'\w') == False:
            indices_to_remove.append(index)
    else:
        indices_to_remove.append(index)
indices_to_remove

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121]