<a href="https://colab.research.google.com/github/joaochenriques/PAS_STATS/blob/main/PAS_STATS_V02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as mpl
import sys, pathlib
import itertools
from scipy import optimize
from sortedcontainers import SortedDict

from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,AutoMinorLocator)

import pathlib, subprocess

def cmdcall( cmd ):
    """Run `cmd` through the system shell and print the output it produced.

    Parameters:
        cmd: command line, as a single string, handed to the shell.

    Returns:
        None; the captured output is only printed.
    """
    print( subprocess.getoutput( cmd ) )

# Fetch the plotting helper module from GitHub when it is not already present
# in the working directory.
if not pathlib.Path("mpl_utils.py").exists():
  cmdcall( 'curl -O https://raw.githubusercontent.com/joaochenriques/ipynb_libs/main/mpl_utils.py' )

# Imported here (not with the other imports) because the file may only exist
# after the download check above has run.
import mpl_utils as mut
mut.config_plots()  # NOTE(review): presumably applies the shared matplotlib defaults - confirm in mpl_utils

# Tuple of matplotlib marker codes; not used in this part of the notebook
# (presumably cycled through by plots further down - TODO confirm).
markers = ( 'o', '^', 's', 'v', 'H', 'X', 'P' )

# Ask the inline backend to render figures as SVG.
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')

In [None]:
!pip install dataclassy
from dataclassy import dataclass

In [None]:
!pip install iso3166
from iso3166 import countries

country_lst = []
for c in countries:
    country_lst.append( c.name )

# Scopus advanced search query

```
TITLE-ABS-KEY-AUTH(WAVE-ENERGY) AND PUBYEAR AFT 2003 AND DOCTYPE(ar OR re) AND
(
    SRCTITLE(applied-energy) OR
    SRCTITLE(applied-ocean-research) OR
    SRCTITLE(energy) OR
    SRCTITLE(energy-conversion-and-management ) OR
    SRCTITLE(energies) OR
    SRCTITLE(ieee-transactions-on-sustainable-energy) OR
    SRCTITLE(iet-renewable-power-generation) OR
    SRCTITLE(international-journal-of-marine-energy ) OR
    SRCTITLE(international-journal-of-offshore-and-polar-engineering ) OR
    SRCTITLE(journal-of-offshore-mechanics-and-arctic-engineering) OR
    SRCTITLE(Journal-of-Ocean-Engineering-and-Marine-Energy) OR
    SRCTITLE(ocean-engineering) OR
    SRCTITLE(marine-structures) OR
    SRCTITLE(renewable-energy) OR
    SRCTITLE(renewable-sustainable-energy-reviews) 
)
```

In [None]:
# Name of the Scopus CSV export analysed by this notebook.
filename = 'scopus_20230608D.csv'

# The download is attempted only when running on Google Colab and only when
# the file is not already in the working directory.
if 'google.colab' in sys.modules:
    if not pathlib.Path( filename ).exists():
        cmdcall( f'curl -O https://raw.githubusercontent.com/joaochenriques/PAS_STATS/main/{filename}' )

In [None]:
# Load the Scopus export and show the available column labels.
df = pd.read_csv( filename )
df.columns

In [None]:
# Per-paper columns of the Scopus export used by the country statistics.
stage_lst = df['Publication Stage']
affiliations_lst = df['Affiliations']

# Maps a "country" fragment extracted from an affiliation string to the actual
# country, for the entries where the text after the last comma is not a country.
# Keys are matched exactly against the extracted text, so they must stay as they
# are even when they look garbled.
# NOTE(review): 'Instituto Superior Tx000E9' looks like a mis-encoded
# 'Instituto Superior Técnico' coming from the CSV - confirm against the data.
replacements_dic = dict( [
    ( 'Ireland (formerly at the University of Plymouth)', 'Ireland' ),
    ( 'Univ. Paris6', 'France' ),
    ( 'Chinese Academy of Sciences', 'China' ),
    ( 'Instituto Superior Tx000E9', 'Portugal' ),
] )

# Countries ordered by first author

In [None]:
# Count, per country, the distinct affiliations found in papers whose
# publication stage is 'Final'.
# NOTE(review): every unique affiliation of a paper is counted, not only the
# first author's, despite the "1st" in the variable names - confirm the intent.
countries_1st_dic = {}
total_valid_papers = 0

for affiliations, stage in zip( affiliations_lst, stage_lst ):
    # Missing affiliations are NaN (a float), so only string cells can be split.
    if stage != 'Final' or not isinstance( affiliations, str ):
        continue
    total_valid_papers += 1

    # Strip every entry BEFORE de-duplicating: with entries separated by '; ',
    # all but the first keep a leading blank after split(';'), so comparing the
    # raw fragments would miss a later repeat of the first affiliation.
    # Empty fragments (e.g. from a trailing ';') are dropped.
    # dict.fromkeys() removes duplicates while keeping the first-seen order.
    stripped = ( part.strip() for part in affiliations.split( ';' ) )
    affiliations_unique = list( dict.fromkeys( part for part in stripped if part ) )

    for institution in affiliations_unique:
        # The country is taken as the text after the last comma of the affiliation.
        country = institution.rsplit( ',', 1 )[-1].strip()
        country = replacements_dic.get( country, country )
        countries_1st_dic[country] = countries_1st_dic.get( country, 0 ) + 1

# Countries ordered by decreasing count (ties keep their insertion order).
sorted_country_1st_rank = dict( sorted( countries_1st_dic.items(), key=lambda item: item[1], reverse=True ) )

# Only countries with more than 50 counted affiliations are listed.
for name, num in sorted_country_1st_rank.items():
    if num > 50:
        print( name, num )

# Countries ordered by percentage of the authors' countries

In [None]:
# Fractional counting: each 'Final' paper contributes a total weight of 1.0,
# split equally among the entries of its affiliation list.
countries_frac_dic = {}
total_valid_papers = 0

for affiliations, stage in zip( affiliations_lst, stage_lst ):
    # A NaN cell is the only value that is not equal to itself.
    if stage != 'Final' or not ( affiliations == affiliations ):
        continue
    total_valid_papers += 1

    institutions_lst = affiliations.split( ';' )
    frac = 1.0 / len( institutions_lst )

    for institution in institutions_lst:
        # The country is taken as the text after the last comma of the affiliation.
        country = institution.rsplit( ',', 1 )[-1].strip()
        country = replacements_dic.get( country, country )
        countries_frac_dic[country] = countries_frac_dic.get( country, 0.0 ) + frac

# Countries ordered by decreasing accumulated weight.
by_weight = sorted( countries_frac_dic.items(), key=lambda item: item[1], reverse=True )
sorted_country_frac_rank = dict( by_weight )

print( "Number of valid papers: ", int(total_valid_papers) )

# Top ten countries; the weight is truncated to an integer for display.
for n, ( name, num ) in enumerate( list( sorted_country_frac_rank.items() )[:10] ):
    print( n+1, name, int(num) )

# Number of papers per author, citations, and highest cited paper

In [None]:
# Per-paper columns of the Scopus export used by the author statistics.
authors_lst = df['Authors']
authors_ID_lst = df['Author(s) ID'] 
citations_lst = df['Cited by']

# Record of the statistics accumulated for one author.
# The field order matters: instances are built positionally as
# data( name, num_papers, citations, highest_cited ).
@dataclass
class data:
    # author name
    name: str = None
    # number of papers counted for the author
    num_papers: int = 0
    # sum of the citation counts of those papers
    citations: int = 0
    # largest citation count of a single paper
    highest_cited: int = 0

In [None]:
# Per-author statistics keyed by author ID: number of 'Final' papers, total
# citations and the citation count of the most cited paper.
papers_dic = {}

for authors, authors_ID, citations, stage in zip( authors_lst, authors_ID_lst, citations_lst, stage_lst ):
    # Skip papers that are not final or whose author / author-ID cell is missing (NaN).
    if stage != 'Final' or pd.isna( authors ) or pd.isna( authors_ID ):
        continue

    # An empty 'Cited by' cell is read as NaN and int(NaN) raises ValueError;
    # such a paper is counted as having zero citations.
    num_citations = 0 if pd.isna( citations ) else int( citations )

    # Names and IDs are paired by position; zip() stops at the shorter list.
    for author, ID in zip( authors.split( ';' ), str( authors_ID ).split( ';' ) ):
        author = author.strip()
        ID = ID.strip()

        entry = papers_dic.get( ID )
        if entry is None:
            # First paper seen for this ID: the name found here is the one kept.
            papers_dic[ID] = data( author, 1, num_citations, num_citations )
        else:
            entry.num_papers += 1
            entry.citations += num_citations
            entry.highest_cited = max( entry.highest_cited, num_citations )

In [None]:
# Authors ordered by decreasing number of papers.
by_num_papers = sorted( papers_dic.items(), key=lambda item: item[1].num_papers, reverse=True )
sorted_papers_dic = dict( by_num_papers )

# Print rank, name and paper count of every author with at least 25 papers.
for n, author_data in enumerate( sorted_papers_dic.values() ):
    if author_data.num_papers < 25:
        continue
    print( n+1, author_data.name, author_data.num_papers )

In [None]:
# Authors ordered by decreasing total number of citations.
by_citations = sorted( papers_dic.items(), key=lambda item: item[1].citations, reverse=True )
sorted_citations_dic = dict( by_citations )

# Print rank, name and total citations of the first 20 authors.
for n, author_data in enumerate( list( sorted_citations_dic.values() )[:20] ):
    print( n+1, author_data.name, author_data.citations )

In [None]:
# Authors ordered by the citation count of their most cited paper, highest first.
by_highest_cited = sorted( papers_dic.items(), key=lambda item: item[1].highest_cited, reverse=True )
sorted_highest_cited_dic = dict( by_highest_cited )

# Print rank, name and highest single-paper citation count of the first 20 authors.
for n, author_data in enumerate( list( sorted_highest_cited_dic.values() )[:20] ):
    print( n+1, author_data.name, author_data.highest_cited )