In [18]:
import pymysql
import time
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup

In [19]:
def connect_to_db(host="localhost", user="root", password="", database="myapp"):
    """Open a PyMySQL connection to the application database.

    The defaults reproduce the previously hard-coded local development
    settings, so a bare ``connect_to_db()`` call behaves as before. Pass
    explicit values (for example read from environment variables) instead of
    editing credentials into the notebook.

    Args:
        host: MySQL server host name.
        user: MySQL account name.
        password: Password for ``user``.
        database: Schema to select after connecting.

    Returns:
        The connection object returned by ``pymysql.connect``. The caller is
        responsible for closing it.
    """
    # ``password`` is the primary keyword; ``passwd`` is only kept by PyMySQL
    # as a legacy alias.
    return pymysql.connect(host=host, user=user, password=password, database=database)

In [20]:
def fetch_teachers(connection):
    """Return one display string per row of the ``users`` table.

    Args:
        connection: An open DB-API connection (as returned by
            ``connect_to_db``). It is left open; only the cursor created
            here is closed.

    Returns:
        A list of strings formatted as
        ``"<fname_en> <lname_en> (<fname_th> <lname_th>)"``, in the order the
        rows are returned by the query.
    """
    query = (
        "SELECT users.id, users.fname_en, users.lname_en, users.fname_th, users.lname_th "
        "FROM users"
    )
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        teachers = cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()
    # Column 0 (users.id) is selected but not part of the formatted name.
    return [f"{t[1]} {t[2]} ({t[3]} {t[4]})" for t in teachers]

In [21]:
def setup_webdriver(driver_path=r'D:/web scraping/chromedriver.exe', headless=True):
    """Create a Chrome WebDriver instance.

    The defaults reproduce the previous hard-coded behaviour (headless Chrome
    driven by the chromedriver binary at the original Windows path), so
    existing ``setup_webdriver()`` calls are unaffected.

    Args:
        driver_path: Path to the chromedriver executable. Pass ``None`` (or
            an empty string) to construct ``Service()`` without a path and
            let Selenium locate the driver itself.
        headless: When true, start Chrome with ``--headless``.

    Returns:
        The ``webdriver.Chrome`` instance. The caller must call ``quit()``
        on it when done.
    """
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    service = Service(driver_path) if driver_path else Service()
    return webdriver.Chrome(service=service, options=chrome_options)

In [22]:
def scrape_tci_data(teacher_names, wait_seconds=5):
    """Scrape TCI advanced-search results for each author name.

    For every name, the advanced-search form is submitted with the
    ``author`` criterion, the page size is raised to 100 when the page-size
    selector is present, and every listed paper is parsed. Each paper's
    detail page is then opened to look for a DOI.

    Args:
        teacher_names: Iterable of author-name strings to search for.
        wait_seconds: Fixed delay (seconds) after each navigation or form
            action, giving the page time to render. Defaults to the original
            5 seconds.

    Returns:
        A list of dicts with the keys ``Year``, ``Title``, ``Authors``,
        ``Document Type``, ``Journals/Transactions``, ``Pages``,
        ``Citations`` and ``DOI``. Fields that cannot be found are empty
        strings (``Document Type`` falls back to ``"Journal"``).
    """
    driver = setup_webdriver()
    results = []

    try:
        for teacher in teacher_names:
            driver.get('https://search.tci-thailand.org/advance_search.html')
            Select(driver.find_element(By.NAME, "criteria[]")).select_by_value('author')

            search_box = driver.find_element(By.NAME, 'keyword[]')
            search_box.clear()
            search_box.send_keys(teacher)
            search_box.send_keys(Keys.ENTER)
            time.sleep(wait_seconds)

            # find_elements returns [] instead of raising, so a result page
            # without the page-size selector does not abort the whole run.
            page_size_selectors = driver.find_elements(By.ID, "limit_num_page")
            if page_size_selectors:
                Select(page_size_selectors[0]).select_by_value('100')
                time.sleep(wait_seconds)

            # The listing is parsed from a static snapshot, so navigating to
            # detail pages below does not invalidate the loop over papers.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            main_container = soup.find('div', {'class': 'filter_panel card col-md-9'})
            if not main_container:
                continue

            content_section = main_container.find('div', {'class': 'content'})
            if not content_section:
                continue

            for paper in content_section.find_all('div', class_='content'):
                parsed = _parse_listing_entry(paper)
                if parsed is None:
                    continue
                record, article_link = parsed
                # Added last so the key order matches the original output.
                record["DOI"] = _fetch_doi(driver, article_link, wait_seconds)
                results.append(record)
    finally:
        # Always release the browser process, even when scraping fails.
        driver.quit()

    return results


def _parse_listing_entry(paper):
    """Extract listing-page fields from one result ``div``.

    Returns ``(record, article_link)`` where ``record`` holds every output
    field except ``DOI``, or ``None`` when the entry contains no ``<p>``
    elements and therefore cannot be parsed.
    """
    paragraphs = paper.find_all("p")
    if not paragraphs:
        return None

    paper_title = paragraphs[0].text.strip()
    # The last paragraph carries the journal link, year and page range.
    journal_section = paragraphs[-1]
    author_section = paper.find("p", class_="authors")
    document_type_section = paper.find("option", {"value": "journal_title"})
    citation_section = paper.find("p", style=re.compile(r"float:right;"))

    journal_link = journal_section.find("a")
    journal_name = journal_link.text.strip() if journal_link else ""

    year_match = re.search(r'\b(19|20)\d{2}\b', journal_section.text)
    publication_year = year_match.group(0) if year_match else ""

    page_match = re.search(r'pp\.\s*(\d+-\d+)', journal_section.text)
    page_numbers = page_match.group(1) if page_match else ""

    authors = ", ".join(a.text for a in author_section.find_all("a")) if author_section else ""
    document_type = document_type_section.text if document_type_section else "Journal"

    # Guard both the paragraph and its link; either may be missing.
    citation_link = citation_section.find("a") if citation_section else None
    citation_count_text = citation_link.text.strip() if citation_link else ""
    citation_match = re.search(r'cited (\d+)', citation_count_text)
    citation_count = citation_match.group(1) if citation_match else ""

    title_section = paper.find("p", style=re.compile(r"margin-left:0;float:left; width:85%;"))
    title_link = title_section.find("a") if title_section else None
    # .get avoids a KeyError for an anchor without an href attribute.
    article_link = title_link.get("href", "") if title_link else ""

    record = {
        "Year": publication_year,
        "Title": paper_title,
        "Authors": authors,
        "Document Type": document_type,
        "Journals/Transactions": journal_name,
        "Pages": page_numbers,
        "Citations": citation_count,
    }
    return record, article_link


def _fetch_doi(driver, article_link, wait_seconds):
    """Open a paper's detail page and return its DOI, or ``""`` if none is found."""
    if not article_link:
        # Nothing to open; skip the pointless page load and wait.
        return ""

    driver.get(f"https://search.tci-thailand.org/{article_link}")
    time.sleep(wait_seconds)

    detail_soup = BeautifulSoup(driver.page_source, "html.parser")
    doi_section = detail_soup.find("span", {"id": "doi_english"})
    doi_link = doi_section.find("a") if doi_section else None
    doi_url = doi_link.get("href", "") if doi_link else ""

    doi_match = re.search(r"10\.\d{4,9}/[\w\-.]+", doi_url)
    return doi_match.group(0) if doi_match else ""

In [1]:
def scrape_single_teacher_data(teacher_name):
    """Scrape TCI results for one author.

    Wraps ``teacher_name`` in a one-element list, delegates to
    ``scrape_tci_data``, and returns whatever that call returns.
    """
    single_name_batch = [teacher_name]
    scraped_records = scrape_tci_data(single_name_batch)
    return scraped_records

In [23]:
if __name__ == "__main__":
    # Demo run for a single, fixed author name.
    # The previous version opened and immediately closed a database
    # connection without reading from it; that unused connection is removed so
    # the demo no longer needs a running MySQL server. To scrape every user
    # instead, build the list with fetch_teachers(connect_to_db()).
    teacher_names = ['Punyaphol Horata']
    scraped_data = scrape_tci_data(teacher_names)
    for data in scraped_data:
        print(data)

{'Year': '2016', 'Title': 'Extended Hierarchical Extreme Learning Machine with Multilayer Perceptron', 'Authors': 'Khanittha Phumrattanaprapin, Punyaphol Horata', 'Document Type': 'Journal', 'Journals/Transactions': 'ECTI Transactions on Computer and Information Technology', 'Pages': '196-204', 'Citations': '0', 'DOI': ''}
