## Data Scraping from Science Direct

The current problem is that the abstract is not included in the downloaded HTML file, so abstracts need to be added manually afterwards.

To get around access blocking, download the corresponding HTML page and then use the functions below to extract the data.

We remove publications whose title is 'Index' (index pages are not of interest), and also those whose titles begin with 'Chapter' (book chapters contain no new data).

The functions below gather the title, authors, year of publication, and URL of the PDF for each article.

In [8]:
import re

import pandas as pd
from bs4 import BeautifulSoup

# A four-digit year (1900-2099) that is not part of a longer run of digits.
# Digit look-arounds are used instead of \b so a year glued to letters
# (e.g. "iScience2022") is still recognised.
_YEAR_PATTERN = re.compile(r'(?<!\d)(?:19|20)\d{2}(?!\d)')


def _extract_title(article):
    """Return the article's title text, or 'No title found' if the title span is absent."""
    title_element = article.select_one('span.anchor-text > span')
    if title_element is None:
        return 'No title found'
    return title_element.get_text(strip=True)


def _extract_authors(article):
    """Return the authors joined by ', ', or 'No authors found' if none are listed."""
    # The author list appears under two class combinations; try the more
    # specific one first and fall back to the general one.
    authors_elements = article.select('ol.Authors.hor.reduce-list li span.author')
    if not authors_elements:
        authors_elements = article.select('ol.Authors.hor li span.author')
    if not authors_elements:
        return 'No authors found'
    return ', '.join(author.get_text(strip=True) for author in authors_elements)


def _extract_url(article):
    """Return the href of the first link in the PreviewLinks div, or 'No URL found'."""
    url_element = article.select_one('div.PreviewLinks a')
    # .get() avoids a KeyError when the anchor exists but has no href attribute.
    href = url_element.get('href') if url_element is not None else None
    return href if href else 'No URL found'


def _extract_year(article):
    """Return the last four-digit year in the source/date span, or 'No year found'."""
    year_element = article.select_one('span.srctitle-date-fields')
    if year_element is None:
        return 'No year found'
    # Join nested text pieces with a space so a year is not fused to
    # neighbouring digits coming from a sibling element.
    year_text = year_element.get_text(' ', strip=True)
    years = _YEAR_PATTERN.findall(year_text)
    return years[-1] if years else 'No year found'


def get_titles_authors_urls_years_from_html(file_path):
    """Extract title, authors, PDF URL and year for each search result in a saved HTML page.

    Args:
        file_path: Path to the downloaded HTML file (read as UTF-8).

    Returns:
        A list of dicts with the keys 'title', 'authors', 'url' and 'year',
        one per ``div.result-item-container`` element. Results whose title
        is 'Index' (compared case-insensitively) are skipped. A field that
        cannot be found is filled with a 'No ... found' placeholder string.

    Side effects:
        Prints the number of results found and the extracted fields of every
        stored result.
    """
    # Parse inside the context manager, then release the file handle before
    # the (potentially long) extraction loop.
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    articles = soup.find_all('div', class_='result-item-container')
    print("Number of articles found:", len(articles))

    data = []
    for article in articles:
        title = _extract_title(article)
        # Index pages carry no publication data, so they are not stored.
        if title.casefold() == 'index':
            continue

        authors = _extract_authors(article)
        url = _extract_url(article)
        year = _extract_year(article)

        data.append({
            'title': title,
            'authors': authors,
            'url': url,
            'year': year
        })

        # Debug: Print the extracted information for each article
        print(f"Title: {title}\nAuthors: {authors}\nURL: {url}\nYear: {year}\n")

    return data

def save_to_csv(data, filename):
    """Write *data* to *filename* as CSV without the DataFrame index column.

    *data* is anything ``pandas.DataFrame`` accepts; the script passes the
    list of per-article dicts produced by the scraper.
    """
    pd.DataFrame(data).to_csv(filename, index=False)

if __name__ == '__main__':
    # Suffix of the downloaded HTML file name; also used to name the output CSV.
    country_name = 'Mexico2'
    # Replace with the path of the HTML file you downloaded.
    file_path = f'/Users/liting/Desktop/Triatomine{country_name}.html'
    # Where you want to save the CSV file.
    save_file_path = f'/Users/liting/Documents/GitHub/google-scholar-search/science-direct/{country_name}_data.csv'

    data = get_titles_authors_urls_years_from_html(file_path)
    save_to_csv(data, save_file_path)
    # Report the path actually written rather than a second hard-coded copy,
    # so the message cannot drift from the real destination.
    print(f'Data saved to {save_file_path}')


Number of articles found: 12
Title: Molecular and functional basis of high-salt avoidance in a blood-sucking insect
Authors: Gina Pontes, José Manuel Latorre-Estivalis, Romina B. Barrozo
URL: https://www.sciencedirect.com/science/article/pii/S2589004222007738/pdfft?md5=270c1185f1670c2a6bee15f01e5dddb6&pid=1-s2.0-S2589004222007738-main.pdf
Year: 2022

Title: Chapter 8: Panics and pandemics
Authors: Edward P. Rybicki
URL: No URL found
Year: 2023

Title: 69: Immunologic Mechanisms ofAtherosclerosis and Myocarditis
Authors: Peter Libby, Andrew H. Lichtman
URL: https://www.sciencedirect.com/science/article/pii/B9780702081651000691/pdfft?md5=60e48666196b29fd3bb9e8af00780746&pid=3-s2.0-B9780702081651000691-main.pdf
Year: 2023

Title: Structure-based design, synthesis and evaluation of a novel family of PEX5-PEX14 interaction inhibitors againstTrypanosoma
Authors: Valeria Napolitano, Piotr Mróz, Grzegorz Dubin
URL: https://www.sciencedirect.com/science/article/pii/S0223523422006808/pdfft?md5=a

In [12]:
import os
import pandas as pd

def combine_csv_files(directory_path):
    """Merge every '*data.csv' file in a directory into one de-duplicated DataFrame.

    Args:
        directory_path: Directory to scan (not recursive). Only files whose
            name ends with 'data.csv' are read.

    Returns:
        A DataFrame containing the rows of all matching files with exact
        duplicate rows removed and rows whose 'title' starts with 'Chapter'
        dropped. If no file matches, an empty DataFrame with the columns
        'title', 'authors', 'url' and 'year' is returned.

    Side effects:
        Prints the path of each file read, or a notice when none is found.
    """
    # Sort the names so the row order of the result is reproducible;
    # os.listdir returns entries in arbitrary order.
    csv_names = sorted(
        name for name in os.listdir(directory_path) if name.endswith("data.csv")
    )

    df_list = []
    for filename in csv_names:
        file_path = os.path.join(directory_path, filename)
        print(f"Reading file: {file_path}")
        df_list.append(pd.read_csv(file_path))

    # pd.concat raises ValueError on an empty list, so handle that case
    # explicitly and hand back an empty frame with the expected columns.
    if not df_list:
        print(f"No files ending with 'data.csv' found in {directory_path}")
        return pd.DataFrame(columns=['title', 'authors', 'url', 'year'])

    combined_df = pd.concat(df_list, ignore_index=True).drop_duplicates()

    # Drop book chapters; na=False treats missing titles as not matching,
    # so rows without a title are kept.
    is_chapter = combined_df['title'].str.startswith('Chapter', na=False)
    return combined_df[~is_chapter]

def compare_titles(combined_df, reference_file, unique_file, duplication_file):
    """Split *combined_df* by whether each title already appears in a reference CSV.

    Rows whose 'title' exactly matches a 'title' in *reference_file* are
    written to *duplication_file*; all remaining rows go to *unique_file*.
    Both files are written without the index column, and a confirmation
    line is printed for each.
    """
    known_titles = pd.read_csv(reference_file)['title']

    # Compute the membership mask once and use it (and its negation) for both splits.
    already_listed = combined_df['title'].isin(known_titles)
    matched_rows = combined_df[already_listed]
    new_rows = combined_df[~already_listed]

    matched_rows.to_csv(duplication_file, index=False)
    new_rows.to_csv(unique_file, index=False)

    print(f'Duplicates saved to {duplication_file}')
    print(f'Uniques saved to {unique_file}')

if __name__ == '__main__':
    # Folder holding the per-country '*_data.csv' files, the reference list,
    # and the two output files.
    directory_path = '/Users/liting/Documents/GitHub/google-scholar-search/science-direct/'

    # Derive the three file paths from the shared folder.
    reference_file = os.path.join(directory_path, 'reference.csv')
    unique_file = os.path.join(directory_path, 'unique.csv')
    duplication_file = os.path.join(directory_path, 'duplication.csv')

    # Merge the scraped CSVs, then split them against the reference titles.
    compare_titles(
        combine_csv_files(directory_path),
        reference_file,
        unique_file,
        duplication_file,
    )


Reading file: /Users/liting/Documents/GitHub/google-scholar-search/science-direct/Mexico2_data.csv
Reading file: /Users/liting/Documents/GitHub/google-scholar-search/science-direct/UnitedStates_data.csv
Reading file: /Users/liting/Documents/GitHub/google-scholar-search/science-direct/Mexico_data.csv
Reading file: /Users/liting/Documents/GitHub/google-scholar-search/science-direct/Canada_data.csv
Duplicates saved to /Users/liting/Documents/GitHub/google-scholar-search/science-direct/duplication.csv
Uniques saved to /Users/liting/Documents/GitHub/google-scholar-search/science-direct/unique.csv
