# Test notebook for KPI dev

## Imports

In [1]:
import pandas as pd
import numpy as np
from string import digits
from collections import Counter
import re

from startupjh import utils
from startupjh import plots
from startupjh import data_preprocess

%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/commander/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data loading

In [2]:
papers_df = utils.load_from_csv("../data/more_papers.csv")

In [3]:
citing_papers_df = utils.load_from_csv("../data/more_citing_papers.csv")

In [4]:
papers_df.head()

Unnamed: 0.1,Unnamed: 0,paper_id,title,result_id,link,snippet,resources_title,resources_link,citation_count,cites_id,versions,cluster_id,full_citation,key_words,authors,pub_info,year
0,0,0,A multi-agent system for the automation of a p...,qljx-SC6MYEJ,https://www.academia.edu/download/48859036/A_m...,This paper presents a system architecture whic...,academia.edu,https://www.academia.edu/download/48859036/A_m...,74,9309426555546589354,4,9309426555546589354,"Rebollo, Miguel, et al. ""A multi-agent system ...","[multiagent, system, automation, port, contain...","Rebollo, Miguel, et al.",Workshop in Agents in Industry. Barcelona. 2000.,2000
1,1,1,Automation in port container terminals,hQkbtalB1OAJ,https://www.sciencedirect.com/science/article/...,… “ Business Process Model and Notation (BPMN)...,sciencedirect.com,https://www.sciencedirect.com/science/article/...,49,16200645956702243205,3,16200645956702243205,"Martín-Soberón, Ana María, et al. ""Automation ...","[automation, port, container, terminals]","Martín-Soberón, Ana María, et al.",Procedia-Social and Behavioral Sciences 160 (2...,2014
2,2,2,Container port automation,0q7hE6UAsyEJ,https://link.springer.com/content/pdf/10.1007/...,… The Patrick Fisherman's Island terminal is n...,no data,no data,13,2428285333085990610,6,2428285333085990610,"Nelmes, Graeme. ""Container port automation."" F...","[container, port, automation]","Nelmes, Graeme.","Field and Service Robotics. Springer, Berlin, ...",2006
3,3,3,TRACES: TRAFFIC CONTROL ENGINEERING SYSTEM A c...,wgi33W6YplUJ,https://citeseerx.ist.psu.edu/viewdoc/download...,In this study a control system to coordinate t...,psu.edu,https://citeseerx.ist.psu.edu/viewdoc/download...,27,6171787941291428034,6,6171787941291428034,"Duinkerken, Mark B., Joseph JM Evers, and Jaap...","[traces, traffic, control, engineering, system...","Duinkerken, Mark B., Joseph JM Evers, and Jaap...",signal 2 (1999): v3.,1999
4,4,4,Multi-agent system technology in a port contai...,Q4_ut9Yo1yoJ,https://www.researchgate.net/profile/V-Botti/p...,In response to the arrival of a ship (ship age...,researchgate.net,https://www.researchgate.net/profile/V-Botti/p...,13,3086980972259741507,no data,no data,"Botti, Vicent J. ""Multi-agent system technolog...","[multiagent, system, technology, port, contain...","Botti, Vicent J.",ERCIM News 56 (2004): 37-39.,2004


In [17]:
most_active_journals = data_preprocess.get_most_active_journal(citing_papers_df)

In [18]:
most_active_journals[most_active_journals["occurence"] == max(most_active_journals.occurence)]

Unnamed: 0,journal,occurence
0,Maritime Economics & Logistics,3
1,Logistics,3
2,Research in Transportation Business & Management,3
3,Maritime Business Review,3
4,Transportation Research Part B: Methodological,3


## Get most active author

In [4]:
def get_most_active_author(df):
    # Separates authors from authors list for each paper
    authors_list = []
    for _, row in df.iterrows():
        authors_list.append(row.authors.split(","))
    # Flatten the list
    flat_authors_list = utils.flatten_list(authors_list)
    # Strip authors from blank spaces
    stripped_list = list(map(str.strip, flat_authors_list))
    # Remove "et al." from list
    words_to_remove = ["et al.", "and "]
    result = filter(lambda val: val != words_to_remove[0], stripped_list)
    list_authors = list(result)
    # Remove "and" from authors
    final_author_list = [i.strip("and ") for i in list_authors]
    authors_sorted = Counter(final_author_list).most_common()
    authors_sorted_df = pd.DataFrame(authors_sorted, columns=["author", "occurence"])
    return authors_sorted_df

In [5]:
get_most_active_author(citing_papers_df).head(5)

Unnamed: 0,author,occurence
0,Wang,5
1,Henesey,4
2,Duinkerke,4
3,Ko,4
4,W. K.,4


## Get most active journal

In [49]:
def get_most_active_journal(df):
    pub_info_list = df.pub_info
    # Get rid of any pub_info that is NOT a STR or that doesn't start with a letter
    indices_to_remove = []
    for index, e in pub_info_list.iteritems():
        if type(e) == str:
            if bool(re.match(r'\w', e)) != True:
                indices_to_remove.append(index)
            elif len(e) == 0:
                indices_to_remove.append(index)
        else:
            indices_to_remove.append(index)
    for i in indices_to_remove:
        del pub_info_list[i]
    # Get rid of the year and any thing after it
    list_journals = []
    for _, e in pub_info_list.iteritems():
        list_journals.append(re.split(r'\(\d{4}\)', e)[0])
    # Splitting by "." if any
    journals = []
    for e in list_journals:
        journals.append(re.split(r'\.', e)[0])
    # Get rid of all ENDING digits if any
    for i in range(len(journals)):
        #if bool(re.match(r'.+\d', e)):
        journals[i] = journals[i].strip().rstrip(digits).strip()
    # Sort journals by occurence and store in dataframe
    journals_sorted = Counter(journals).most_common()
    journals_sorted_df = pd.DataFrame(journals_sorted, columns=["journal", "occurence"])
    return journals_sorted_df