# 1. Get data from SCOPUS API

In [None]:
import requests
import pandas as pd
import urllib.parse
import helper

# Credentials issued by the Elsevier Developers Portal are read from the local
# config file (via the project's `helper` module) so no secret lives in the notebook.
config = helper.read_config()

api_key = config['SCOPUSSettings']['api_key']
insttoken = config['SCOPUSSettings']['insttoken']

# Scopus API endpoint for search
api_url = "https://api.elsevier.com/content/search/scopus"

# Number of records requested per page; also the step used to advance `start`.
batch_size = 25
# Seconds to wait for the server before giving up, so a stalled connection
# raises instead of hanging the kernel indefinitely.
request_timeout = 30

# Search expression: marine plastic-pollution literature published after 2021.
# It is passed unencoded because `requests` URL-encodes `params` itself.
query = 'TITLE-ABS-KEY ( ( "marine debris"  OR  "marine plastic debris"  OR  "marine plastic pollution"  OR  "plastic pollution"  OR  "marine plastic"  OR  "plastic debris"  OR  microplastic  OR  nanoplastic  OR  microfiber  OR  "plastic pellet" )  AND  ( marine  OR  coast*  OR  sea  OR  ocean*  OR  *shore  OR  *pelagic OR benth* OR sediment ) )  AND  ( PUBYEAR >  2021 ) '
params = {
    "apiKey": api_key,
    "insttoken": insttoken,
    "view": "COMPLETE",
    "query": query,
    "field": "dc:creator,dc:title,prism:coverDate,prism:publicationName,citedby-count,prism:doi,dc:description,authkeywords",
    "count": batch_size,  # Number of results per batch
    "start": 0  # Starting position for the batch
}

all_results = []
total_results = None  # unknown until the first page has been parsed
fetch_failed = False

# Page through the result set. The first response supplies both the total
# count and the first batch of records, so no page is requested twice.
while total_results is None or params['start'] < total_results:
    response = requests.get(api_url, params=params, timeout=request_timeout)
    if response.status_code != 200:
        print("Error occurred:", response.status_code)
        fetch_failed = True
        break

    search_results = response.json()['search-results']
    if total_results is None:
        total_results = int(search_results['opensearch:totalResults'])
        print("Total results:", total_results)

    for item in search_results.get('entry', []):
        # NOTE(review): Scopus appears to report an empty result set as a single
        # placeholder entry carrying an 'error' key; skip it so it does not
        # become a blank row. Confirm against the Scopus Search API docs.
        if 'error' in item:
            continue

        # `or` fallbacks cover fields that are absent as well as fields that
        # are present but null, which would otherwise break slicing/splitting.
        author_keywords = (item.get('authkeywords') or '').split(" | ")
        all_results.append({
            "Author": item.get('dc:creator') or '',
            "Title": item.get('dc:title') or '',
            "Year": (item.get('prism:coverDate') or '')[:4],
            "Source Title": item.get('prism:publicationName') or '',
            "Citation Count": item.get('citedby-count') or '0',
            "DOI": item.get('prism:doi') or '',
            "Abstract": item.get('dc:description') or '',
            "Author Keywords": ', '.join(author_keywords)
        })

    # Increment the starting position for the next batch
    params['start'] += batch_size

# As before, the DataFrame is only created when the first request succeeded;
# a later failure still yields the records collected so far, now with a warning.
if total_results is not None:
    if fetch_failed:
        print("Warning: download stopped early;", len(all_results), "of", total_results, "records retrieved")

    # Create a pandas DataFrame from all the results
    df1 = pd.DataFrame(all_results)

    # Print the first few rows of the DataFrame
    print(df1.head())


b'{"search-results":{"opensearch:totalResults":"3276","opensearch:startIndex":"0","opensearch:itemsPerPage":"25","opensearch:Query":{"@role": "request", "@searchTerms": "TITLE-ABS-KEY ( ( \\"marine debris\\"  OR  \\"marine plastic debris\\"  OR  \\"marine plastic pollution\\"  OR  \\"plastic pollution\\"  OR  \\"marine plastic\\"  OR  \\"plastic debris\\"  OR  microplastic  OR  nanoplastic  OR  microfiber  OR  \\"plastic pellet\\" )  AND  ( marine  OR  coast*  OR  sea  OR  ocean*  OR  *shore  OR  *pelagic OR benth* OR sediment ) )  AND  ( PUBYEAR >  2021 ) ", "@startPage": "0"},"link": [{"@_fa": "true", "@ref": "self", "@href": "https://api.elsevier.com/content/search/scopus?start=0&count=25&query=TITLE-ABS-KEY+%28+%28+%22marine+debris%22++OR++%22marine+plastic+debris%22++OR++%22marine+plastic+pollution%22++OR++%22plastic+pollution%22++OR++%22marine+plastic%22++OR++%22plastic+debris%22++OR++microplastic++OR++nanoplastic++OR++microfiber++OR++%22plastic+pellet%22+%29++AND++%28+marine++OR

In [None]:
# Scopus API endpoint for search
api_url = "https://api.elsevier.com/content/search/scopus"

# Number of records requested per page; also the step used to advance `start`.
batch_size = 25
# Seconds to wait for the server before giving up, so a stalled connection
# raises instead of hanging the kernel indefinitely.
request_timeout = 30

# Search expression: marine plastic-pollution literature published before 2021.
# It is passed unencoded because `requests` URL-encodes `params` itself.
query = 'TITLE-ABS-KEY ( ( "marine debris"  OR  "marine plastic debris"  OR  "marine plastic pollution"  OR  "plastic pollution"  OR  "marine plastic"  OR  "plastic debris"  OR  microplastic  OR  nanoplastic  OR  microfiber  OR  "plastic pellet" )  AND  ( marine  OR  coast*  OR  sea  OR  ocean*  OR  *shore  OR  *pelagic OR benth* OR sediment ) )  AND  ( PUBYEAR <  2021 ) '
# `api_key` and `insttoken` are the credentials loaded in the first code cell.
params = {
    "apiKey": api_key,
    "insttoken": insttoken,
    "view": "COMPLETE",
    "query": query,
    "field": "dc:creator,dc:title,prism:coverDate,prism:publicationName,citedby-count,prism:doi,dc:description,authkeywords",
    "count": batch_size,  # Number of results per batch
    "start": 0  # Starting position for the batch
}

all_results = []
total_results = None  # unknown until the first page has been parsed
fetch_failed = False

# Page through the result set. The first response supplies both the total
# count and the first batch of records, so no page is requested twice.
while total_results is None or params['start'] < total_results:
    response = requests.get(api_url, params=params, timeout=request_timeout)
    if response.status_code != 200:
        print("Error occurred:", response.status_code)
        fetch_failed = True
        break

    search_results = response.json()['search-results']
    if total_results is None:
        total_results = int(search_results['opensearch:totalResults'])
        print("Total results:", total_results)

    for item in search_results.get('entry', []):
        # NOTE(review): Scopus appears to report an empty result set as a single
        # placeholder entry carrying an 'error' key; skip it so it does not
        # become a blank row. Confirm against the Scopus Search API docs.
        if 'error' in item:
            continue

        # `or` fallbacks cover fields that are absent as well as fields that
        # are present but null, which would otherwise break slicing/splitting.
        author_keywords = (item.get('authkeywords') or '').split(" | ")
        all_results.append({
            "Author": item.get('dc:creator') or '',
            "Title": item.get('dc:title') or '',
            "Year": (item.get('prism:coverDate') or '')[:4],
            "Source Title": item.get('prism:publicationName') or '',
            "Citation Count": item.get('citedby-count') or '0',
            "DOI": item.get('prism:doi') or '',
            "Abstract": item.get('dc:description') or '',
            "Author Keywords": ', '.join(author_keywords)
        })

    # Increment the starting position for the next batch
    params['start'] += batch_size

# As before, the DataFrame is only created when the first request succeeded;
# a later failure still yields the records collected so far, now with a warning.
if total_results is not None:
    if fetch_failed:
        print("Warning: download stopped early;", len(all_results), "of", total_results, "records retrieved")

    # Create a pandas DataFrame from all the results
    df2 = pd.DataFrame(all_results)

    # Print the first few rows of the DataFrame
    print(df2.head())


Total results: 4560
          Author                                              Title  Year  \
0   Abolfathi S.  Microplastics transport and mixing mechanisms ...  2020   
1    Garofalo L.  Oceanic giants in the Mediterranean: First mit...  2020   
2  Bitlus M.L.A.  Determination of Microplastics in Sediment of ...  2020   
3        Jiao M.  Tire wear particles in the environment: From r...  2020   
4  Isinibilir M.  Microplastic Consumption and Its Effect on Res...  2020   

                                        Source Title Citation Count  \
0  Proceedings of the Coastal Engineering Conference              4   
1                                    Natura Croatica              0   
2  IOP Conference Series: Earth and Environmental...              0   
3  Huanjing Kexue Xuebao/Acta Scientiae Circumsta...              3   
4                        Frontiers in Marine Science             11   

                              DOI  \
0                                   
1               

In [None]:
# Scopus API endpoint for search
api_url = "https://api.elsevier.com/content/search/scopus"

# Number of records requested per page; also the step used to advance `start`.
batch_size = 25
# Seconds to wait for the server before giving up, so a stalled connection
# raises instead of hanging the kernel indefinitely.
request_timeout = 30

# Search expression: marine plastic-pollution literature published in 2021.
# It is passed unencoded because `requests` URL-encodes `params` itself.
query = 'TITLE-ABS-KEY ( ( "marine debris"  OR  "marine plastic debris"  OR  "marine plastic pollution"  OR  "plastic pollution"  OR  "marine plastic"  OR  "plastic debris"  OR  microplastic  OR  nanoplastic  OR  microfiber  OR  "plastic pellet" )  AND  ( marine  OR  coast*  OR  sea  OR  ocean*  OR  *shore  OR  *pelagic OR benth* OR sediment ) )  AND  ( PUBYEAR =  2021 ) '
# `api_key` and `insttoken` are the credentials loaded in the first code cell.
params = {
    "apiKey": api_key,
    "insttoken": insttoken,
    "view": "COMPLETE",
    "query": query,
    "field": "dc:creator,dc:title,prism:coverDate,prism:publicationName,citedby-count,prism:doi,dc:description,authkeywords",
    "count": batch_size,  # Number of results per batch
    "start": 0  # Starting position for the batch
}

all_results = []
total_results = None  # unknown until the first page has been parsed
fetch_failed = False

# Page through the result set. The first response supplies both the total
# count and the first batch of records, so no page is requested twice.
while total_results is None or params['start'] < total_results:
    response = requests.get(api_url, params=params, timeout=request_timeout)
    if response.status_code != 200:
        print("Error occurred:", response.status_code)
        fetch_failed = True
        break

    search_results = response.json()['search-results']
    if total_results is None:
        total_results = int(search_results['opensearch:totalResults'])
        print("Total results:", total_results)

    for item in search_results.get('entry', []):
        # NOTE(review): Scopus appears to report an empty result set as a single
        # placeholder entry carrying an 'error' key; skip it so it does not
        # become a blank row. Confirm against the Scopus Search API docs.
        if 'error' in item:
            continue

        # `or` fallbacks cover fields that are absent as well as fields that
        # are present but null, which would otherwise break slicing/splitting.
        author_keywords = (item.get('authkeywords') or '').split(" | ")
        all_results.append({
            "Author": item.get('dc:creator') or '',
            "Title": item.get('dc:title') or '',
            "Year": (item.get('prism:coverDate') or '')[:4],
            "Source Title": item.get('prism:publicationName') or '',
            "Citation Count": item.get('citedby-count') or '0',
            "DOI": item.get('prism:doi') or '',
            "Abstract": item.get('dc:description') or '',
            "Author Keywords": ', '.join(author_keywords)
        })

    # Increment the starting position for the next batch
    params['start'] += batch_size

# As before, the DataFrame is only created when the first request succeeded;
# a later failure still yields the records collected so far, now with a warning.
if total_results is not None:
    if fetch_failed:
        print("Warning: download stopped early;", len(all_results), "of", total_results, "records retrieved")

    # Create a pandas DataFrame from all the results
    df3 = pd.DataFrame(all_results)

    # Print the first few rows of the DataFrame
    print(df3.head())


b'{"search-results":{"opensearch:totalResults":"1829","opensearch:startIndex":"0","opensearch:itemsPerPage":"25","opensearch:Query":{"@role": "request", "@searchTerms": "TITLE-ABS-KEY ( ( \\"marine debris\\"  OR  \\"marine plastic debris\\"  OR  \\"marine plastic pollution\\"  OR  \\"plastic pollution\\"  OR  \\"marine plastic\\"  OR  \\"plastic debris\\"  OR  microplastic  OR  nanoplastic  OR  microfiber  OR  \\"plastic pellet\\" )  AND  ( marine  OR  coast*  OR  sea  OR  ocean*  OR  *shore  OR  *pelagic OR benth* OR sediment ) )  AND  ( PUBYEAR =  2021 ) ", "@startPage": "0"},"link": [{"@_fa": "true", "@ref": "self", "@href": "https://api.elsevier.com/content/search/scopus?start=0&count=25&query=TITLE-ABS-KEY+%28+%28+%22marine+debris%22++OR++%22marine+plastic+debris%22++OR++%22marine+plastic+pollution%22++OR++%22plastic+pollution%22++OR++%22marine+plastic%22++OR++%22plastic+debris%22++OR++microplastic++OR++nanoplastic++OR++microfiber++OR++%22plastic+pellet%22+%29++AND++%28+marine++OR

In [None]:
# Show the (rows, columns) shape of each of the three downloaded result sets.
print(*(frame.shape for frame in (df1, df2, df3)))

(3276, 8) (4560, 8) (1829, 8)


In [None]:
# Merge the three result sets, clean them, and save the final dataset.

# Concatenate all data into one DataFrame
dfs = [df1, df2, df3]
data = pd.concat(dfs, ignore_index=True)
print("Length of dataset retrieved from SCOPUS", len(data))

# Remove duplicates based on DOI.
# Records without a DOI carry an empty string in this column, so a plain
# drop_duplicates(subset=['DOI']) would treat all of them as copies of one
# another and keep only the first. Only rows that actually have a DOI are
# eligible to be flagged as duplicates.
doi = data['DOI'].fillna('').str.strip()
is_duplicate = (doi != '') & doi.duplicated()
print("Number of duplicates based on DOI:", int(is_duplicate.sum()))
data = data[~is_duplicate]

# Remove instances with no abstract available (missing, empty, or whitespace-only)
data = data[data['Abstract'].fillna('').str.strip() != '']

# Remove "Corrigendum" instances. The title is matched as a literal substring;
# na=False keeps rows whose title is missing instead of producing a NaN mask.
data = data[~data['Title'].str.contains('Corrigendum', regex=False, na=False)]
data = data.reset_index(drop=True)

# Save. Forward slashes are accepted on Windows as well as POSIX, whereas the
# backslash form would create a file literally named '..\get_data\alldata.csv'
# on non-Windows systems.
data.to_csv('../get_data/alldata.csv', index=False)
print("Final data set length is:", len(data))
print(data.shape)


Length of dataset retrieved from SCOPUS 9665
Number of duplicates based on DOI: 325
Final data set length is: 9156
(9156, 8)
