# JSS Systematic Mapping Study
This notebook shows how to use litstudy from inside a Jupyter notebook. It shows how to load a dataset, plot statistics, perform topic modeling, do network analysis, and some more advanced features.

This notebook focuses on the topic of SECO for CPS development.

# Imports

In [1]:
# Import other libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbs
import re

# Plot defaults: 10x6 inch figures and seaborn's 'paper' context
plt.rcParams.update({'figure.figsize': (10, 6)})
sbs.set(context='paper')

# Put the parent directory on the module search path (once) before litstudy is
# imported -- presumably so that a checkout located there can be resolved.
path = os.path.abspath('..')
if path not in sys.path:
    sys.path.append(path)

import litstudy

In [2]:
from datetime import datetime

# Folder holding the exported .bib files: a 'data' subfolder of sys.path[0].
# os.path.join keeps this portable; a hard-coded '\\data' suffix only names a
# subfolder on Windows.
# NOTE(review): the loading cells read from the relative path 'data/'; that is
# the same folder only when sys.path[0] is the working directory -- confirm.
address_file = os.path.join(sys.path[0], 'data')

# Create the subfolder on first use; an already existing folder is fine.
os.makedirs(address_file, exist_ok=True)

# List all the files in the data directory
file_list = os.listdir(address_file)
print(file_list)

# Timestamped base names for the exported CSV/XLSX files.
# The date/time components are intentionally left unpadded (e.g. 2024-7-24).
# NOTE(review): '_All_DB' sits before the time in `filename` but after it in
# `filename_xlsx`, so the CSV and XLSX exports get different stems -- confirm
# this is intended.
currentDateAndTime = datetime.now()
filename = (f'SOTA_SECO_CPS_{currentDateAndTime.year}-{currentDateAndTime.month}-{currentDateAndTime.day}_All_DB'
            f'_{currentDateAndTime.hour}-{currentDateAndTime.minute}-{currentDateAndTime.second}')
filename_xlsx = (f'SOTA_SECO_CPS_{currentDateAndTime.year}-{currentDateAndTime.month}-{currentDateAndTime.day}'
            f'_{currentDateAndTime.hour}-{currentDateAndTime.minute}-{currentDateAndTime.second}_All_DB.xlsx')

print(filename)

['acm_Research_Article_filtered.bib', 'acm_Survey_filtered.bib', 'IEEE_2024_10_7_10_54_43.bib', 'scopus.bib', 'wos.bib']
SOTA_SECO_CPS_2024-10-10_All_DB_2-27-19


# Collecting the dataset
We have queried Scopus, IEEE Xplore, ACM Digital Library, and Web of Science for:

( "Digital Twin*" OR "DT*" ) AND ( "model-based*" OR "model-driven*" ) AND ( "systematic literature review*" OR "literature review*"  OR "systematic review*" OR "systematic mapping stud*" OR "mapping stud*" OR "systematic review*" OR "systematic stud*" OR SLR OR SMS OR survey*) 

We load all document sets from the exported CSV/BibTeX files.

In [3]:
# Read the IEEE Xplore BibTeX export into a document set and report its size
ieee_bib_path = 'data/IEEE_2024_10_7_10_54_43.bib'
docs_ieee = litstudy.load_bibtex(ieee_bib_path)
ieee_count = len(docs_ieee)
print(ieee_count, 'papers loaded from IEEE')

35 papers loaded from IEEE


In [4]:
# Read the Scopus BibTeX export into a document set and report its size
scopus_bib_path = 'data/scopus.bib'
docs_scopus = litstudy.load_bibtex(scopus_bib_path)
scopus_count = len(docs_scopus)
print(scopus_count, 'papers loaded from Scopus')

133 papers loaded from Scopus


In [5]:
# Read the ACM Digital Library BibTeX export into a document set and report its size.
# NOTE(review): only the survey export is loaded here; the data folder listing
# also shows 'acm_Research_Article_filtered.bib' -- confirm that file is meant
# to be left out.
acm_bib_path = 'data/acm_Survey_filtered.bib'
docs_acm = litstudy.load_bibtex(acm_bib_path)
acm_count = len(docs_acm)
print(acm_count, 'papers loaded from ACM')

18 papers loaded from ACM


In [6]:
# Read the Web of Science BibTeX export into a document set and report its size
wos_bib_path = 'data/wos.bib'
docs_webofscience = litstudy.load_bibtex(wos_bib_path)
wos_count = len(docs_webofscience)
print(wos_count, 'papers loaded from Web Of Science')

79 papers loaded from Web Of Science


In [7]:
# Fold the four per-database document sets into one with the `|` operator,
# left to right (IEEE, Scopus, ACM, Web of Science).
# The recorded count (224) is lower than the sum of the individual loads (265),
# so `|` evidently collapses entries it treats as duplicates.
docs_bib = docs_ieee
for docs_part in (docs_scopus, docs_acm, docs_webofscience):
    docs_bib = docs_bib | docs_part

print(len(docs_bib), 'papers loaded from BibTex/csv')

224 papers loaded from BibTex/csv


In [8]:
import logging
# Raise the root logger threshold so that only CRITICAL records are emitted
# while the documents are looked up on Scopus.
root_logger = logging.getLogger()
root_logger.setLevel(logging.CRITICAL)

# Split the merged set into documents Scopus could resolve and those it could not.
# NOTE(review): the recorded run of this cell stopped at 49/224 with
# "Scopus400Error: Exceeds the maximum number allowed for the service level",
# and the following cell reports 181 + 103 = 284 documents rather than 224, so
# the variables used downstream appear to come from an earlier kernel state.
scopus_split = litstudy.refine_scopus(docs_bib)
docs_found_scopus, docs_notfound_scopus = scopus_split

  logging.warn(f"no document found for DOI {doi}: {e}")
 22%|██▏       | 49/224 [00:11<00:39,  4.38it/s]


Scopus400Error: Exceeds the maximum number allowed for the service level

In [12]:
# Report how many documents Scopus resolved and how many it did not
for scopus_subset, label in (
    (docs_found_scopus, 'papers found on Scopus'),
    (docs_notfound_scopus, 'papers were not found'),
):
    print(len(scopus_subset), label)

# Keep both groups for the export; the cell displays the combined size
docs_bib_scopus = docs_found_scopus | docs_notfound_scopus
len(docs_bib_scopus)

181 papers found on Scopus
103 papers were not found


284

In [13]:
# Continue with the Scopus-refined set only; no other source is merged in here
# and the optional publication-year filter (>= 1990) is left disabled, so
# `docs_filtered` and `docs_bib` both refer to `docs_bib_scopus`.
docs_filtered = docs_bib = docs_bib_scopus

In [14]:
# Export every document in `docs_filtered` as one row of a Scopus-style table
# and save the table as CSV and XLSX under 'Results/'.

# Column layout of the exported table, in output order.
export_columns = [
    'Authors', 'Author full names', 'Author(s) ID',
    'Title', 'Year', 'Source title',
    'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end', 'Page count', 'Cited by',
    'DOI', 'Link', 'Affiliations', 'Authors with affiliations', 'Abstract', 'Author Keywords',
    'Index Keywords', 'Molecular Sequence Numbers', 'Chemicals/CAS', 'Tradenames', 'Manufacturers',
    'Funding Details', 'Funding Texts', 'References', 'Correspondence Address', 'Editors', 'Publisher',
    'Sponsors', 'Conference name', 'Conference date', 'Conference location', 'Conference code', 'ISSN',
    'ISBN', 'CODEN', 'PubMed ID', 'Language of Original Document', 'Abbreviated Source Title', 'Document Type',
    'Publication Stage', 'Open Access', 'Source', 'EID']

# Strips the brackets and single quotes that str() puts around a Python list.
list_repr_noise = re.compile(r'[\[\'\]]')

data = []
for index in range(len(docs_filtered)):
    doc = docs_filtered[index]

    # Join the names directly: regex-stripping the list repr would also delete
    # apostrophes inside a name and leave stray double quotes around it.
    authorList = [author.name for author in doc.authors or []]
    author_names = ', '.join(str(name) for name in authorList)

    doi = doc.id.doi
    if doi is None:
        # No DOI available: print the title instead and leave both fields empty
        print(doc.title)
        doi_paper = ''
        doi_paper_custom = ''
    else:
        doi_paper = str(doi)
        doi_paper_custom = 'https://www.doi.org/' + doi_paper
        print(doi_paper)

    # Every column starts out empty; only the fields read from the document are filled in.
    row = dict.fromkeys(export_columns, '')
    row.update({
        'Author full names': author_names,
        'Title': doc.title,
        'Year': doc.publication_year,
        'Source title': doc.publication_source,
        'Cited by': doc.citation_count,
        'DOI': doi_paper,
        'Link': doi_paper_custom,
        'Affiliations': list_repr_noise.sub('', str(doc.affiliations)),
        'Abstract': doc.abstract,
        'Author Keywords': list_repr_noise.sub('', str(doc.keywords)),
        'References': list_repr_noise.sub('', str(doc.references)),
        'Publisher': doc.publisher,
        'Conference date': str(doc.publication_date),
        'PubMed ID': doc.id.pubmed,
        'Language of Original Document': doc.language,
        'Document Type': doc.source_type,
        'EID': doc.id.scopusid,
    })
    data.append(row)

# Collect the rows in a DataFrame with the fixed column order
df = pd.DataFrame(data, columns=export_columns)

# Make sure the output folder exists before writing into it
os.makedirs('Results', exist_ok=True)

# Saving first group of data to a single csv file
# NOTE(review): unlike the Excel export, the CSV keeps the DataFrame index as
# its first column -- confirm that is intended.
df.to_csv('Results/' + filename + '.csv')

# Saving first group of data to a single excel file
df.to_excel('Results/' + filename_xlsx, index=False)

10.1007/s00450-019-00426-5
10.1109/CloudCom.2014.158
10.1109/SEAA.2015.54
10.1109/TIFS.2021.3054968
10.1109/TITS.2021.3068092
10.1109/JIOT.2018.2870294
10.1109/JIOT.2023.3268474
10.1109/TNSE.2021.3110003
10.1109/UIC-ATC-ScalCom-CBDCom-IoP.2015.174
10.1109/HPCC-CSS-ICESS.2015.37
10.1109/IC2E48712.2020.00016
10.1109/FAS-W.2016.57
10.1109/TITS.2023.3307660
10.1109/MWC.2019.1800521
10.1007/978-3-319-72125-5_1
10.1145/2591062.2591140
10.1145/3056540.3076192
10.1145/2804337.2804341
10.1145/3528229.3529384
10.1109/SESoS59159.2023.00014
10.1145/3524844.3528067
10.1109/ICPS49255.2021.9468232
10.1109/SESoS59159.2023.00012
10.1109/SEAMS.2019.00018
10.1109/RE.2018.00064
10.1109/SMARTCOMP.2018.00041
10.1145/3528229.3529385
Modeling an Industrial Revolution: How to Manage Large-Scale, Complex IoT Ecosystems?
10.1109/DSD.2019.00067
10.1109/SESoS/WDES.2019.00013
10.1109/TrustCom60117.2023.00240
10.1109/IOTM.001.2100164
10.1109/MCOM.001.2000679
10.1109/CASE49439.2021.9551638
10.1109/SEAA.2014.86
10.110

# Clean the dataset
We cleaned the dataset in the previously exported CSV/XLSX files, and we now load the document set from the cleaned CSV file.

In [21]:
from datetime import datetime

# Fresh timestamp for the files written during the cleaning stage.
# Date and time components are not zero-padded (e.g. 2024-7-24_17-38-53).
currentDateAndTime = datetime.now()
run_stamp = '{0.year}-{0.month}-{0.day}_{0.hour}-{0.minute}-{0.second}'.format(currentDateAndTime)

filename = 'SOTA_SECO_CPS_' + run_stamp
filename_xlsx = filename + '.xlsx'

print(filename)

SOTA_SECO_CPS_2024-7-24_17-38-53


In [23]:
############################## LOAD Cleaned papers ###################################

# Read the cleaned, Scopus-formatted CSV back into a document set.
# The file name is fixed to one specific earlier export.
cleaned_csv_path = 'Results/SOTA_SECO_CPS_2024-7-24_17-23-44_All_DB.csv'
docs_bib = litstudy.load_scopus_csv(cleaned_csv_path)

cleaned_count = len(docs_bib)
print(cleaned_count, 'papers loaded from cleaned files')

188 papers loaded from cleaned files


In [24]:
import logging
# Raise the root logger threshold so that only CRITICAL records are emitted
# while the cleaned documents are looked up on Scopus.
root_logger = logging.getLogger()
root_logger.setLevel(logging.CRITICAL)

# Split the cleaned set into documents Scopus could resolve and those it could not
scopus_split = litstudy.refine_scopus(docs_bib)
docs_found_scopus, docs_notfound_scopus = scopus_split

  logging.warn(f"no document found for DOI {doi}: {e}")
100%|██████████| 188/188 [00:00<00:00, 687.77it/s]


In [25]:
# Report how many cleaned documents Scopus resolved and how many it did not
for scopus_subset, label in (
    (docs_found_scopus, 'papers found on Scopus'),
    (docs_notfound_scopus, 'papers were not found'),
):
    print(len(scopus_subset), label)

# Keep both groups for the next step; the cell displays the combined size
docs_bib_scopus = docs_found_scopus | docs_notfound_scopus
len(docs_bib_scopus)

185 papers found on Scopus
3 papers were not found


188

In [26]:
# Continue with the Scopus-refined cleaned set only; no other source is merged
# in here and the optional publication-year filter (>= 1990) is left disabled,
# so `docs_filtered` and `docs_bib` both refer to `docs_bib_scopus`.
docs_filtered = docs_bib = docs_bib_scopus

In [27]:
################ CHECK VENUE TYPE ####################
# Assign each document a venue type (Journal / Conference / Workshop / Book, or
# '' when no rule applies), write it to the 'Document Type' column of a
# Scopus-style table, and save the table as CSV and XLSX under 'Results/'.

# DOIs that are forced into a category regardless of what the metadata says.
JOURNAL_DOIS = ('10.3390/en14123620',
                '10.1016/j.ijis.2024.04.003',
                '10.1007/978-3-319-46508-1_13')
PROCEEDINGS_DOI = '10.24251/HICSS.2017.569'
CONFERENCE_DOIS = ('10.1007/978-3-031-16088-2_4',
                   '10.1109/DEST.2010.5610662')

# Case-insensitive keyword rules applied to the source title (and one paper title).
CONFERENCE_SOURCE = re.compile(
    'conference|congress|forum|symposium|lecture notes|'
    'Design, Automation and Test in Europe|studies', re.IGNORECASE)
CONFERENCE_TITLE = re.compile(
    'Event-Driven Interoperable Manufacturing Ecosystem for Energy Consumption Monitoring',
    re.IGNORECASE)
# Sources that disqualify a paper from the 'Conference' category.
NOT_MAIN_TRACK_SOURCE = re.compile('workshop|companion|MODELS-C|ICSA-C', re.IGNORECASE)
WORKSHOP_SOURCE = re.compile('workshop|seminar|companion|MODELS-C|ICSA-C', re.IGNORECASE)
BOOK_SOURCE = re.compile('SpringerBriefs in Applied Sciences and Technology', re.IGNORECASE)


def classify_venue(source_type, doi, source, title):
    """Return the venue type of one document.

    Parameters
    ----------
    source_type : value of the document's ``source_type`` attribute.
    doi : the document's DOI, or None.
    source : the document's ``publication_source``; converted with str().
    title : the document's title; converted with str().

    Returns
    -------
    'Journal', 'Conference', 'Workshop', 'Book', or '' when no rule matches.
    Rules are tried in that order and the first match wins.
    """
    source = str(source)
    title = str(title)

    if source_type in ('Journal', 'Trade Journal') or doi in JOURNAL_DOIS:
        return 'Journal'

    # Conference and Workshop are only considered for proceedings-like sources.
    # (The forced-journal DOIs need no extra guard here: they returned above.)
    in_proceedings = (source_type in ('Conference Proceeding', 'Book Series')
                      or doi == PROCEEDINGS_DOI)
    if in_proceedings:
        looks_like_conference = (CONFERENCE_SOURCE.search(source)
                                 or doi in CONFERENCE_DOIS
                                 or CONFERENCE_TITLE.search(title))
        if looks_like_conference and not NOT_MAIN_TRACK_SOURCE.search(source):
            return 'Conference'
        if WORKSHOP_SOURCE.search(source):
            return 'Workshop'

    if source_type == 'Book' or BOOK_SOURCE.search(source):
        return 'Book'
    return ''


# Column layout of the exported table, in output order.
export_columns = [
    'Authors', 'Author full names', 'Author(s) ID',
    'Title', 'Year', 'Source title',
    'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end', 'Page count', 'Cited by',
    'DOI', 'Link', 'Affiliations', 'Authors with affiliations', 'Abstract', 'Author Keywords',
    'Index Keywords', 'Molecular Sequence Numbers', 'Chemicals/CAS', 'Tradenames', 'Manufacturers',
    'Funding Details', 'Funding Texts', 'References', 'Correspondence Address', 'Editors', 'Publisher',
    'Sponsors', 'Conference name', 'Conference date', 'Conference location', 'Conference code', 'ISSN',
    'ISBN', 'CODEN', 'PubMed ID', 'Language of Original Document', 'Abbreviated Source Title', 'Document Type',
    'Publication Stage', 'Open Access', 'Source', 'EID']

# Strips the brackets and single quotes that str() puts around a Python list.
list_repr_noise = re.compile(r'[\[\'\]]')

data = []
for index in range(len(docs_filtered)):
    doc = docs_filtered[index]

    # Join the names directly: regex-stripping the list repr would also delete
    # apostrophes inside a name and leave stray double quotes around it.
    authorList = [author.name for author in doc.authors or []]
    author_names = ', '.join(str(name) for name in authorList)

    doi = doc.id.doi
    if doi is None:
        # No DOI available: print the title instead and leave both fields empty
        print(doc.title)
        doi_paper = ''
        doi_paper_custom = ''
    else:
        doi_paper = str(doi)
        doi_paper_custom = 'https://www.doi.org/' + doi_paper
        print(doi_paper)

    paper_venue_type = classify_venue(doc.source_type, doi, doc.publication_source, doc.title)

    # Every column starts out empty; only the fields read from the document are filled in.
    row = dict.fromkeys(export_columns, '')
    row.update({
        'Author full names': author_names,
        'Title': doc.title,
        'Year': doc.publication_year,
        'Source title': doc.publication_source,
        'Cited by': doc.citation_count,
        'DOI': doi_paper,
        'Link': doi_paper_custom,
        'Affiliations': list_repr_noise.sub('', str(doc.affiliations)),
        'Abstract': doc.abstract,
        'Author Keywords': list_repr_noise.sub('', str(doc.keywords)),
        'References': list_repr_noise.sub('', str(doc.references)),
        'Publisher': doc.publisher,
        'Conference date': str(doc.publication_date),
        'PubMed ID': doc.id.pubmed,
        'Language of Original Document': doc.language,
        'Document Type': paper_venue_type,
        'EID': doc.id.scopusid,
    })
    data.append(row)

# Collect the rows in a DataFrame with the fixed column order
df = pd.DataFrame(data, columns=export_columns)

# Make sure the output folder exists before writing into it
os.makedirs('Results', exist_ok=True)

# Saving the venue-checked data to a single csv file
# NOTE(review): unlike the Excel export, the CSV keeps the DataFrame index as
# its first column -- confirm that is intended.
df.to_csv('Results/' + filename + '_00_Venue_OK.csv')

# Saving the venue-checked data to a single excel file
df.to_excel('Results/' + filename + '_00_Venue_OK.xlsx', index=False)

10.1007/s00450-019-00426-5
10.1109/CloudCom.2014.158
10.1109/TIFS.2021.3054968
10.1109/TITS.2021.3068092
10.1109/JIOT.2018.2870294
10.1109/JIOT.2023.3268474
10.1109/TNSE.2021.3110003
10.1109/UIC-ATC-ScalCom-CBDCom-IoP.2015.174
10.1109/HPCC-CSS-ICESS.2015.37
10.1109/IC2E48712.2020.00016
10.1109/FAS-W.2016.57
10.1109/TITS.2023.3307660
10.1109/MWC.2019.1800521
10.1007/978-3-319-72125-5_1
10.1145/2591062.2591140
10.1145/3056540.3076192
10.1145/2804337.2804341
10.1145/3528229.3529384
10.1109/SESoS59159.2023.00014
10.1145/3524844.3528067
10.1109/ICPS49255.2021.9468232
10.1109/SESoS59159.2023.00012
10.1109/SEAMS.2019.00018
10.1109/RE.2018.00064
10.1109/SMARTCOMP.2018.00041
10.1145/3528229.3529385
Modeling an Industrial Revolution: How to Manage Large-Scale, Complex IoT Ecosystems?
10.1109/DSD.2019.00067
10.1109/SESoS/WDES.2019.00013
10.1109/TrustCom60117.2023.00240
10.1109/IOTM.001.2100164
10.1109/MCOM.001.2000679
10.1109/CASE49439.2021.9551638
10.1109/CPSNA.2014.12
10.1109/ICSTCC.2015.732130