In [20]:
import os
os.getcwd()

'C:\\Users\\Hyemi\\Python\\TopicModeling'

# Frontier in Psychology

# 1.background

In [1]:
import requests
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import re

##### Bring URL and HTML
url = "https://www.frontiersin.org/journals/psychology/articles/10.3389/fpsyg.2024.1487146/full"
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

##### Convert BeautifulSoup object to string
html_content = str(soup)

##### Search for title pattern
pattern = "<title.*?>.*?</title.*?>"
match_results = re.search(pattern, html_content, re.IGNORECASE)
title = match_results.group() if match_results else "No title found"
title = re.sub("<.*?>", "", title)  # Remove HTML tags

print(title)

##### https://www.frontiersin.org/journals/psychology/articles?publication-date=25%2F10%2F2024-25%2F10%2F2024

In [2]:
# Set up the WebDriver
driver = webdriver.Chrome()
# Open the page
base_url = "https://www.frontiersin.org/journals/psychology/articles"
url = base_url + "?publication-date=01%2F01%2F2023-31%2F12%2F2023"
driver.get(url)

# Set page zoom to 25%
driver.execute_script("document.body.style.zoom='25%'")

# Accept cookies if the banner appears
try:
    accept_cookies_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
    )
    accept_cookies_button.click()
except:
    print("No cookie banner detected.")

# Wait for the first article to load
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "CardArticle"))
)

# Scroll and hover to trigger loading of new articles
scroll_increment = 200
scroll_pause_time = 1.5
max_retries = 8
retries = 0
prev_article_count = 0

# Scroll until all articles are loaded
actions = ActionChains(driver)
while retries < max_retries:
    # Scroll by a small increment
    driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_increment)
    time.sleep(scroll_pause_time)
    
    # Hover over the footer to trigger lazy loading
    footer = driver.find_element(By.TAG_NAME, "footer")
    actions.move_to_element(footer).perform()
    
    # Check current number of articles loaded
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    articles = soup.find_all("article", class_="CardArticle")
    current_article_count = len(articles)
    
    # Update retry count based on loaded articles
    if current_article_count > prev_article_count:
        prev_article_count = current_article_count
        retries = 0  # Reset retries if new articles loaded
    else:
        retries += 1  # Increment retry count if no new articles load
    print(f"Articles loaded: {current_article_count}, Retries: {retries}")

# Ensure elements are fully loaded
time.sleep(3)

# Extract article information with error handling
data = []
for article in articles:
    try:
        # Find elements with None handling
        title_elem = article.find("h1", class_="CardArticle__title")
        date_elem = article.find("p", class_="CardArticle__date")
        type_elem = article.find("p", class_="CardArticle__type")
        link_elem = article.find("a", class_="CardArticle__wrapper")
        
        # Extract data with None checks
        title = title_elem.get_text(strip=True) if title_elem else None
        publish_date = date_elem.get_text(strip=True) if date_elem else None
        article_type = type_elem.get_text(strip=True) if type_elem else None
        link = link_elem["href"] if link_elem else None
        
        data.append({
            "Title": title,
            "Type": article_type,
            "Published Date": publish_date,
            "Link": link,
        })
        
    except Exception as e:
        print(f"Error processing article: {str(e)}")
        continue

# Close the driver
driver.quit()

# Convert to DataFrame and display
df = pd.DataFrame(data)

Articles loaded: 32, Retries: 0
Articles loaded: 48, Retries: 0
Articles loaded: 64, Retries: 0
Articles loaded: 80, Retries: 0
Articles loaded: 96, Retries: 0
Articles loaded: 112, Retries: 0
Articles loaded: 128, Retries: 0
Articles loaded: 144, Retries: 0
Articles loaded: 160, Retries: 0
Articles loaded: 176, Retries: 0
Articles loaded: 192, Retries: 0
Articles loaded: 208, Retries: 0
Articles loaded: 224, Retries: 0
Articles loaded: 240, Retries: 0
Articles loaded: 256, Retries: 0
Articles loaded: 272, Retries: 0
Articles loaded: 288, Retries: 0
Articles loaded: 304, Retries: 0
Articles loaded: 320, Retries: 0
Articles loaded: 336, Retries: 0
Articles loaded: 352, Retries: 0
Articles loaded: 368, Retries: 0
Articles loaded: 384, Retries: 0
Articles loaded: 400, Retries: 0
Articles loaded: 416, Retries: 0
Articles loaded: 432, Retries: 0
Articles loaded: 448, Retries: 0
Articles loaded: 464, Retries: 0
Articles loaded: 480, Retries: 0
Articles loaded: 496, Retries: 0
Articles loaded

In [3]:
# Clean and process the data
def clean_publication_dates(df):
    # Create a copy to avoid SettingWithCopyWarning
    df = df.copy()
    
    # Clean date strings and convert to datetime
    df['Published Date'] = df['Published Date'].str.replace(
        r'(Published on |Accepted on )', '', 
        regex=True
    )
    
    # Convert to datetime with proper error handling
    df['Published Date'] = pd.to_datetime(
        df['Published Date'],
        dayfirst=True,  # Assuming date format is DD/MM/YYYY
        errors='coerce'
    )
    
    # Extract year and handle missing values
    df['Year'] = df['Published Date'].dt.strftime('%Y')
    df['Year'] = df['Year'].fillna('2014')  # Fill missing years with 2014
    
    return df

def filter_research_articles(df):
    # Filter for Original Research articles
    research_df = df[df['Type'] == 'Original Research'].copy()
    
    # Reset index after filtering
    research_df = research_df.reset_index(drop=True)
    
    return research_df

# Process the dataframe
processed_df = clean_publication_dates(df)

# Filter for Original Research articles
research_df = filter_research_articles(processed_df)

In [82]:
#link = "https://www.frontiersin.org/journals/psychology/articles/10.3389/fpsyg.2024.1487146/full"
#response = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})
#soup = BeautifulSoup(response.text, 'html.parser')
#soup

In [83]:
#abstract_meta = soup.find('meta', attrs={'name': 'citation_abstract'})
#abstract = abstract_meta['content']
#abstract_text = re.search(r'<p>(.*?)</p>', abstract).group(1)
#abstract_text

In [84]:
#sections = ["Data availability statement", "Ethics statement", "Author contributions"]
#extracted_text = []
#found_section = False

# Traverse paragraphs until we reach the first target section
#for paragraph in soup.find_all(['p', 'h2']):
    # If we encounter any target section, stop the extraction
#    if paragraph.name == 'h2' and paragraph.text.strip() in sections:
#        found_section = True
#        break
#    elif paragraph.name == 'p':  # Accumulate text in paragraphs
#        extracted_text.append(paragraph.text.strip())

# Join and print the extracted text
#result = " ".join(extracted_text)
#print(result)

In [85]:
#link = "https://www.frontiersin.org/journals/psychology/articles/10.3389/fpsyg.2024.1435688/full"
#response = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'})
#soup = BeautifulSoup(response.text, 'html.parser')
#paragraphs = soup.find_all('p')
#paragraphs

In [86]:
#abstract_meta = soup.find('meta', attrs={'name': 'citation_abstract'})
#abstract = abstract_meta['content']
#abstract_text = re.search(r'<p>(.*?)</p>', abstract).group(1)
#abstract_text

In [87]:
#sections = ["Data availability statement", "Ethics statement", "Author contributions"]
#extracted_text = []
#found_section = False

# Traverse paragraphs until we reach the first target section
#for paragraph in soup.find_all(['p', 'h2']):
    # If we encounter any target section, stop the extraction
#    if paragraph.name == 'h2' and paragraph.text.strip() in sections:
#        found_section = True
#        break
#    elif paragraph.name == 'p':  # Accumulate text in paragraphs
#        extracted_text.append(paragraph.text.strip())

# Join and print the extracted text
#result = " ".join(extracted_text)
#print(result)

In [5]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def create_session():
    session = requests.Session()
    retries = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[403, 408, 429, 500, 502, 503, 504]
    )
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

def extract_article_details(session, url, retries=3):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    for attempt in range(retries):
        try:
            response = session.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            
            ## Extract authors
            #authors = soup.select('.authors .author-wrapper')
            #author_list = [author.get_text(strip=True) for author in authors]
            
            # Extract abstract
            abstract_meta = soup.find('meta', attrs={'name': 'citation_abstract'})
            abstract = abstract_meta['content'] if abstract_meta else "Abstract not available"
            
            # Extract title from meta tag (more reliable)
            title_meta = soup.find('meta', attrs={'name': 'citation_title'})
            title = title_meta['content'] if title_meta else "Title not available"
            
            return {
                'Title': title,
                #'Authors': '; '.join(author_list) if author_list else "Authors not available",
                'Abstract': abstract
            }
            
        except Exception as e:
            if attempt == retries - 1:
                return {
                    'Title': "Error extracting title",
                    #'Authors': "Error extracting authors",
                    'Abstract': "Error extracting abstract"
                }
            time.sleep(2 ** attempt)

def process_articles(research_df):
    session = create_session()
    processed_data = []
    total_articles = len(research_df)
    
    for idx, row in research_df.iterrows():
        print(f"Processing article {idx + 1}/{total_articles}")
        
        article_data = {
            'Title': row['Title'],
            'Year': row['Year'],
            'Link': row['Link']
        }
        
        if row['Link']:
            details = extract_article_details(session, row['Link'])
            article_data.update({
                #'Authors': details['Authors'],
                'Abstract': details['Abstract']
            })
        else:
            article_data.update({
                #'Authors': "Link not available",
                'Abstract': "Link not available"
            })
        
        processed_data.append(article_data)
        
        time.sleep(2)
    
    return pd.DataFrame(processed_data)

# Process the articles and create final dataset
try:
    if 'Title' not in research_df.columns or 'Year' not in research_df.columns or 'Link' not in research_df.columns:
        raise ValueError("Required columns (Title, Year, Link) not found in research_df")
    
    final_df = process_articles(research_df)
    
    print("\nProcessing completed!")
    print(f"Total articles processed: {len(final_df)}")
    #print(f"Articles with authors: {final_df['Authors'].notna().sum()}")
    print(f"Articles with abstracts: {final_df['Abstract'].notna().sum()}")

    # Display DataFrame using IPython display (for Jupyter)
    from IPython.display import display
    display(final_df)
    
except Exception as e:
    print(f"An error occurred during processing: {str(e)}")
    raise


Processing article 1/3396
Processing article 2/3396
Processing article 3/3396
Processing article 4/3396
Processing article 5/3396
Processing article 6/3396
Processing article 7/3396
Processing article 8/3396
Processing article 9/3396
Processing article 10/3396
Processing article 11/3396
Processing article 12/3396
Processing article 13/3396
Processing article 14/3396
Processing article 15/3396
Processing article 16/3396
Processing article 17/3396
Processing article 18/3396
Processing article 19/3396
Processing article 20/3396
Processing article 21/3396
Processing article 22/3396
Processing article 23/3396
Processing article 24/3396
Processing article 25/3396
Processing article 26/3396
Processing article 27/3396
Processing article 28/3396
Processing article 29/3396
Processing article 30/3396
Processing article 31/3396
Processing article 32/3396
Processing article 33/3396
Processing article 34/3396
Processing article 35/3396
Processing article 36/3396
Processing article 37/3396
Processing

Unnamed: 0,Title,Year,Link,Abstract
0,,2023,https://www.frontiersin.org/journals/psycholog...,<p>This paper analyses the influence that diff...
1,,2023,https://www.frontiersin.org/journals/psycholog...,<p>This study investigates the significance of...
2,,2023,https://www.frontiersin.org/journals/psycholog...,<p>The current research on language teacher ed...
3,,2023,https://www.frontiersin.org/journals/psycholog...,<p>Spatial models dominated memory research th...
4,,2023,https://www.frontiersin.org/journals/psycholog...,"<sec id=""sec1""><title>Introduction</title><p>M..."
...,...,...,...,...
3391,,2023,https://www.frontiersin.org/journals/psycholog...,<sec><title>Background</title><p>Children from...
3392,,2023,https://www.frontiersin.org/journals/psycholog...,<p>We investigate whether firms that rely on m...
3393,,2023,https://www.frontiersin.org/journals/psycholog...,<p>Agricultural retailers face serious challen...
3394,,2023,https://www.frontiersin.org/journals/psycholog...,<p>As the global challenges facing sustainabil...


In [6]:
result_df = final_df
result_df.head()

Unnamed: 0,Title,Year,Link,Abstract
0,,2023,https://www.frontiersin.org/journals/psycholog...,<p>This paper analyses the influence that diff...
1,,2023,https://www.frontiersin.org/journals/psycholog...,<p>This study investigates the significance of...
2,,2023,https://www.frontiersin.org/journals/psycholog...,<p>The current research on language teacher ed...
3,,2023,https://www.frontiersin.org/journals/psycholog...,<p>Spatial models dominated memory research th...
4,,2023,https://www.frontiersin.org/journals/psycholog...,"<sec id=""sec1""><title>Introduction</title><p>M..."


In [7]:
def clean_text(text):
    if not isinstance(text, str):
        return text
        
    # Remove HTML tags (including <p> and </p>)
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove parenthetical content
    text = re.sub(r'\(.*?\)', '', text)
    
    # Remove numbers, asterisks, and daggers
    text = re.sub(r'(\d+|\*|†)', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

# Clean Title and Abstract
#result_df['Title'] = result_df['Title'].apply(clean_text)
result_df['Abstract'] = result_df['Abstract'].apply(clean_text)

In [8]:
#result_df['Authors'] = result_df['Authors'].apply(
#    lambda x: re.sub(r'(\d+|\*|†)', '', x)  # Remove digits, asterisks, and dagger symbols
#                .replace("??", "")          # Remove any question marks
#                .replace(",,", ",")         # Consolidate multiple commas
#                .replace(", ,", ",")        # Remove spaced commas
#                .replace("  ", " ")         # Remove double spaces
#                .replace(", ,", ",")                # Remove any trailing commas or spaces
#)

In [9]:
print(result_df[['Year', 'Abstract']])

      Year                                           Abstract
0     2023  This paper analyses the influence that differe...
1     2023  This study investigates the significance of se...
2     2023  The current research on language teacher educa...
3     2023  Spatial models dominated memory research throu...
4     2023  IntroductionMind wandering, a phenomenon in wh...
...    ...                                                ...
3391  2023  BackgroundChildren from socioeconomically disa...
3392  2023  We investigate whether firms that rely on majo...
3393  2023  Agricultural retailers face serious challenge ...
3394  2023  As the global challenges facing sustainability...
3395  2023  In recent years, quantitative methods have bee...

[3396 rows x 2 columns]


In [10]:
result_df.to_csv("articles_data_23.csv", index=False)

In [None]:
result_df