<a href="https://colab.research.google.com/github/joaochenriques/PAS_STATS/blob/main/PAS_STATS_V02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as mpl
import sys, pathlib
import itertools
from scipy import optimize
from sortedcontainers import SortedDict

from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,AutoMinorLocator)

import pathlib, subprocess

def cmdcall( cmd ):
    """Run `cmd` through the shell and print the text it produced.

    The command's output is obtained with `subprocess.getoutput` and echoed
    to stdout; nothing is returned.
    """
    print( subprocess.getoutput( cmd ) )

# Fetch the mpl_utils helper module from GitHub when it is not already in the
# working directory, so that the import below can find it.
if not pathlib.Path("mpl_utils.py").exists():
  cmdcall( 'curl -O https://raw.githubusercontent.com/joaochenriques/ipynb_libs/main/mpl_utils.py' )

import mpl_utils as mut
mut.config_plots()  # NOTE(review): presumably applies the plotting defaults used by this notebook - confirm in mpl_utils

# Marker codes; not referenced in the visible part of the notebook (presumably used by later plots).
markers = ( 'o', '^', 's', 'v', 'H', 'X', 'P' )

# Ask the inline backend to render figures as SVG.
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')

In [None]:
# Install dataclassy with the interpreter that runs this kernel: a bare `pip`
# found on PATH may belong to a different Python environment, in which case
# the import below would still fail.
cmdcall( f'"{sys.executable}" -m pip install dataclassy' )
from dataclassy import dataclass

In [None]:
# Install iso3166 with the interpreter that runs this kernel: a bare `pip`
# found on PATH may belong to a different Python environment, in which case
# the import below would still fail.
cmdcall( f'"{sys.executable}" -m pip install iso3166' )
from iso3166 import countries

# Name of every entry in the iso3166 `countries` collection.
country_lst = [ c.name for c in countries ]

# Scopus advanced search query

```
TITLE-ABS-KEY-AUTH(WAVE-ENERGY) AND PUBYEAR AFT 2003 AND DOCTYPE(ar OR re) AND
(
    SRCTITLE(applied-energy) OR
    SRCTITLE(applied-ocean-research) OR
    SRCTITLE(energy) OR
    SRCTITLE(energy-conversion-and-management ) OR
    SRCTITLE(energies) OR
    SRCTITLE(ieee-transactions-on-sustainable-energy) OR
    SRCTITLE(iet-renewable-power-generation) OR
    SRCTITLE(international-journal-of-marine-energy ) OR
    SRCTITLE(international-journal-of-offshore-and-polar-engineering ) OR
    SRCTITLE(journal-of-offshore-mechanics-and-arctic-engineering) OR
    SRCTITLE(Journal-of-Ocean-Engineering-and-Marine-Energy) OR
    SRCTITLE(ocean-engineering) OR
    SRCTITLE(marine-structures) OR
    SRCTITLE(renewable-energy) OR
    SRCTITLE(renewable-sustainable-energy-reviews) 
)
```

In [None]:
# Scopus export analysed by this notebook.
filename = 'scopus_20230608D.csv'

# When running on Google Colab and the CSV is not yet in the working
# directory, download it from the project repository.
if 'google.colab' in sys.modules:
    if not pathlib.Path( filename ).exists():
        cmdcall( f'curl -O https://raw.githubusercontent.com/joaochenriques/PAS_STATS/main/{filename}' )

In [None]:
# Load the Scopus export and show the names of the columns it provides.
df = pd.read_csv( filename )
df.columns

In [None]:
# Per-paper columns of the Scopus export used by the country statistics.
affiliations_lst = df['Affiliations']
stage_lst = df['Publication Stage'] 

# Maps trailing affiliation fields that are not a plain country name onto the
# country they should be counted under. The keys are compared with the text
# that follows the last comma of each affiliation entry.
# NOTE(review): the keys (including the 'Tx000E9' artefact) are presumably
# copied verbatim from the CSV - confirm against the data before editing them.
replacements_dic = {   
    'Ireland (formerly at the University of Plymouth)': 'Ireland',
    'Univ. Paris6': 'France',
    'Chinese Academy of Sciences': 'China',
    'Instituto Superior Tx000E9': 'Portugal'
}

# Countries ranked by the fractional share of papers from the authors' countries

In [35]:
# Fractional counting: every counted paper carries a total weight of 1 that is
# split evenly between the institutions listed in its affiliations string.
countries_frac_dic = {}
total_valid_papers = 0

for affiliations, stage in zip( affiliations_lst, stage_lst ):
    # NaN is the only value that differs from itself, so the second test
    # skips papers whose affiliations cell is missing.
    if stage != 'Final' or affiliations != affiliations:
        continue

    total_valid_papers += 1

    institutions_lst = affiliations.split( ';' )
    frac = 1.0 / len( institutions_lst )

    for institution in institutions_lst:
        # The country is whatever follows the last comma of the entry,
        # normalised through the replacement table when listed there.
        country = institution.rsplit( ',', 1 )[-1].strip()
        country = replacements_dic.get( country, country )
        countries_frac_dic[country] = countries_frac_dic.get( country, 0.0 ) + frac

# Same mapping, ordered from the largest to the smallest accumulated weight.
sorted_country_frac_rank = dict(
    sorted( countries_frac_dic.items(), key=lambda item: item[1], reverse=True )
)

print( "Number of valid papers: ", int(total_valid_papers) )

# Columns of the summary table; the fractional totals are truncated to integers.
df_country_name_lst = list( sorted_country_frac_rank.keys() )
df_country_num_lst = [ int( num ) for num in sorted_country_frac_rank.values() ]

df_country_dic = {
        "Country": df_country_name_lst,
        "Num papers": df_country_num_lst
}

pd.DataFrame( df_country_dic ).head(10)

Number of valid papers:  3580


Unnamed: 0,Country,Num papers
0,China,731
1,United Kingdom,369
2,United States,360
3,Portugal,214
4,Spain,177
5,Ireland,155
6,Italy,152
7,Australia,144
8,India,140
9,Sweden,103


# Number of papers per author, citations, and highest cited paper

In [None]:
# Per-paper columns of the Scopus export used by the author statistics.
authors_lst = df['Authors']
authors_ID_lst = df['Author(s) ID'] 
citations_lst = df['Cited by']
DOI_lst = df['DOI']

# Statistics accumulated for a single author while scanning the papers.
@dataclass
class data:
    name: str = None          # author name as written in the 'Authors' column
    num_papers: int = 0       # number of papers counted for this author
    citations: int = 0        # sum of 'Cited by' over those papers
    highest_cited: int = 0    # 'Cited by' value of the author's most cited paper
    highest_DOI: str = None   # DOI of that most cited paper

In [None]:
# Per-author statistics keyed by the author ID taken from 'Author(s) ID'.
# The name stored for an author is the one seen on the first paper counted.
papers_dic = {}

for authors, authors_ID, citations, DOI, stage in zip( authors_lst, authors_ID_lst, citations_lst, DOI_lst, stage_lst ):
    # Only 'Final' papers that have both author columns filled in are counted.
    if stage != 'Final' or pd.isna( authors ) or pd.isna( authors_ID ):
        continue

    # An empty 'Cited by' cell is read by pandas as NaN and int(NaN) raises
    # ValueError; count such papers as having zero citations.
    num_citations = 0 if pd.isna( citations ) else int( citations )

    # Names and IDs are ';'-separated lists that are paired by position.
    for author, ID in zip( authors.split( ';' ), str( authors_ID ).split( ';' ) ):
        author = author.strip()
        ID = ID.strip()

        if ID not in papers_dic:
            papers_dic[ID] = data( author, 1, num_citations, num_citations, DOI )
            continue

        record = papers_dic[ID]
        record.num_papers += 1
        record.citations += num_citations
        # Remember the most cited paper seen so far for this author.
        if num_citations > record.highest_cited:
            record.highest_cited = num_citations
            record.highest_DOI = DOI

In [42]:
# Authors ordered by the number of papers counted for them, largest first.
sorted_papers_dic = dict(
    sorted( papers_dic.items(), key=lambda item: item[1].num_papers, reverse=True )
)

# Columns of the summary table, in ranking order.
df_authors_name_lst = [ record.name for record in sorted_papers_dic.values() ]
df_authors_num_lst = [ record.num_papers for record in sorted_papers_dic.values() ]

df_authors_dic = {
        "Author": df_authors_name_lst,
        "Num papers": df_authors_num_lst
}

pd.DataFrame( df_authors_dic ).head(20)

Unnamed: 0,Author,Num papers
0,Ringwood J.V.,92
1,Iglesias G.,79
2,Guedes Soares C.,64
3,Gato L.M.C.,63
4,Henriques J.C.C.,62
5,Falcão A.F.O.,56
6,Ning D.,47
7,Greaves D.,45
8,Leijon M.,44
9,Wang Z.L.,42


In [43]:
# Authors ordered by their total number of citations, largest first.
sorted_citations_dic = dict(
    sorted( papers_dic.items(), key=lambda item: item[1].citations, reverse=True )
)

# Columns of the summary table, in ranking order.
df_citations_name_lst = [ record.name for record in sorted_citations_dic.values() ]
df_citations_num_lst = [ record.citations for record in sorted_citations_dic.values() ]

# The second column holds citation totals, so it is labelled accordingly
# (not "Num papers", which is the label of the papers-per-author table).
df_citations_dic = {
        "Author": df_citations_name_lst,
        "Citations": df_citations_num_lst
}

pd.DataFrame( df_citations_dic ).head(10)

Unnamed: 0,Author,Num papers
0,Falcão A.F.O.,4991
1,Iglesias G.,4914
2,Wang Z.L.,4696
3,Ringwood J.V.,3159
4,Guedes Soares C.,2643
5,Babarit A.,2527
6,Henriques J.C.C.,2384
7,Moan T.,2337
8,Carballo R.,2022
9,Jiang T.,1977


In [46]:
# Authors ordered by the citation count of their single most cited paper,
# largest first.
sorted_highest_cited_dic = dict(
    sorted( papers_dic.items(), key=lambda item: item[1].highest_cited, reverse=True )
)

# Columns of the summary table, in ranking order.
df_highest_cited_name_lst = [ record.name for record in sorted_highest_cited_dic.values() ]
df_highest_cited_num_lst = [ record.highest_cited for record in sorted_highest_cited_dic.values() ]
df_highest_cited_DOI = [ record.highest_DOI for record in sorted_highest_cited_dic.values() ]

# The second column holds the citations of the paper identified by "DOI",
# so it is labelled "Citations" rather than "Num papers".
df_highest_cited_dic = {
        "Author": df_highest_cited_name_lst,
        "Citations": df_highest_cited_num_lst,
        "DOI": df_highest_cited_DOI
}

pd.DataFrame( df_highest_cited_dic ).head(10)

Unnamed: 0,Author,Num papers,DOI
0,Falcão A.F.O.,2035,10.1016/j.rser.2009.11.003
1,Plummer A.R.,867,10.1243/09576509JPE782
2,Drew B.,867,10.1243/09576509JPE782
3,Sahinkaya M.N.,867,10.1243/09576509JPE782
4,Falnes J.,809,10.1016/j.marstruc.2007.09.001
5,Wang Z.L.,727,10.1016/j.nanoen.2017.06.035
6,Jiang T.,727,10.1016/j.nanoen.2017.06.035
7,Xu L.,727,10.1016/j.nanoen.2017.06.035
8,Henriques J.C.C.,594,10.1016/j.renene.2015.07.086
9,Kurniawan A.,551,10.1016/j.renene.2011.10.002
