In [6]:
import requests
from bs4 import BeautifulSoup
import time
import json

import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def get_all_links_and_titles(url):
    retries = 3
    for attempt in range(retries):
        try:
          response = requests.get(url, headers=headers)
          response.raise_for_status()  # Raises an HTTPError if the status is 4xx, 5xx
          break
        except (requests.ConnectionError, requests.HTTPError) as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            time.sleep(5)  # Wait for 5 seconds before retrying
    else:
        print("Failed to retrieve the URL after several attempts.")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    a_tags = soup.select('li a')
    links_and_titles = [(a_tag.get('href'), a_tag.text.strip()) for a_tag in a_tags]
    return links_and_titles


def save_to_json(data, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

url = 'https://my.ntu.edu.tw'
links_and_titles = get_all_links_and_titles(url)
if links_and_titles:
    save_to_json(links_and_titles, 'list.json')
    print("Data successfully saved to 'list.json'.")
else:
    print("Failed to retrieve the links and titles.")


Data successfully saved to 'list.json'.


In [7]:
print(links_and_titles)

[('Default.aspx?lang=eng', 'English'), ('login.aspx?actionType=Y', '常用服務'), ('https://www.ntu.edu.tw/contact.html', '聯絡我們'), ('https://ann.cc.ntu.edu.tw/', '最新消息'), ('https://www.ntu.edu.tw/', '臺大首頁'), ('https://www.cc.ntu.edu.tw/', '計中首頁'), ('#', '個人資訊 >'), ('#', '課務資訊 >'), ('#', '生活資訊 >'), ('#', '助學資訊 >'), ('#', '社團活動資訊 >'), ('#', '畢業生資訊 >'), ('https://ntucace.ntu.edu.tw/ ', '跨領域實習 - 臺灣引路人計畫'), ('http://coursemap.aca.ntu.edu.tw/course_map_all/', '臺大課程地圖'), ('https://course.ntu.edu.tw/', '臺大新課程網'), ('https://my.ntu.edu.tw/vote/', '教職員投票'), ('https://my.ntu.edu.tw/ntuhrEService/hrQuery/MeritPay.aspx?func=military', '教師彈性薪資查詢'), ('https://my.ntu.edu.tw/ntuhrEService/login.aspx?func=ecert', '專任教研人員電子聘書'), ('https://ifweb.aca.ntu.edu.tw/ScoreTrace/Index.aspx', '學生成績表現追蹤'), ('https://specom.aca.ntu.edu.tw/', '領域專長查詢系統'), ('https://ntuacare.ntu.edu.tw/', 'NTU aCARE學習預警暨輔導追蹤系統'), ('http://jade.lib.ntu.edu.tw/rapid_frontend/', 'JADE期刊文獻'), ('https://adfs.ntu.edu.tw/oc/', '著作原創性檢查服務'), ('https

In [8]:
def scrape_and_save_html(url, file_name):
    try:
        # Send a GET request to the URL
        response = requests.get(url, headers=headers)

        # Check if the request was successful
        if response.status_code == 200:
            html_content = response.text

            # Save the HTML content to a file
            with open("./html/"+file_name, 'w', encoding='utf-8') as file:
                file.write(html_content)

            print(f"HTML content successfully saved to {file_name}")
        else:
            print(f"Failed to retrieve the page {file_name}. Status code: {response.status_code}")

    except Exception as e:
        print(f"An error occurred: {e}")


for (link, title) in links_and_titles:
    try:
        scrape_and_save_html(link, title + '.html')

    except Exception as e:
        print(f"An error occurred at title {title}: {e}")


#url = 'https://my.ntu.edu.tw'
#file_name = 'myntu.html'  # The name of the file to save the HTML content
#scrape_and_save_html(url, file_name)


An error occurred: Invalid URL 'Default.aspx?lang=eng': No scheme supplied. Perhaps you meant https://Default.aspx?lang=eng?
An error occurred: Invalid URL 'login.aspx?actionType=Y': No scheme supplied. Perhaps you meant https://login.aspx?actionType=Y?
HTML content successfully saved to 聯絡我們.html
HTML content successfully saved to 最新消息.html
HTML content successfully saved to 臺大首頁.html
HTML content successfully saved to 計中首頁.html
An error occurred: Invalid URL '#': No scheme supplied. Perhaps you meant https://#?
An error occurred: Invalid URL '#': No scheme supplied. Perhaps you meant https://#?
An error occurred: Invalid URL '#': No scheme supplied. Perhaps you meant https://#?
An error occurred: Invalid URL '#': No scheme supplied. Perhaps you meant https://#?
An error occurred: Invalid URL '#': No scheme supplied. Perhaps you meant https://#?
An error occurred: Invalid URL '#': No scheme supplied. Perhaps you meant https://#?
Failed to retrieve the page 跨領域實習 - 臺灣引路人計畫.html. Status

In [9]:
scrape_and_save_html('https://ntucace.ntu.edu.tw', '跨領域實習-臺灣引路人計畫.html')

HTML content successfully saved to 跨領域實習-臺灣引路人計畫.html


In [21]:
#!pip install openai

Collecting openai
  Downloading openai-1.38.0-py3-none-any.whl.metadata (22 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.8.2-py3-none-any.whl.metadata (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m721.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting annotated-types>=0.4.0 (from pydantic<3,>=1.9.0->openai)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.20.1 (from pydantic<3,>=1.9.0->openai)
  Downloading pydantic_core-2.20.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading openai-1.38.0-py3-none-any.whl (335 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m335.9/335.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading distro-1.9.0-py3-none-any.whl (20 kB)
Downloading pydantic-2.8.2-py3-no

In [4]:
from openai import OpenAI

client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
MODEL="gpt-4o"


def extract_text_from_html(html_content):
    """Extract text from HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')
    return soup.get_text()

def get_summary_and_keywords(text):
    #"""Get summary and keywords from text using OpenAI API."""
    #prompt_summary = f"Please summarize the following text:\n\n{text}"
    #prompt_keywords = f"Please extract keywords from the following text:\n\n{text}"

    ## Request for summary
    #response_summary = openai.Completion.create(
    #    engine="text-davinci-003",
    #    prompt=prompt_summary,
    #    max_tokens=150
    #)
    #summary = response_summary.choices[0].text.strip()

    ## Request for keywords
    #response_keywords = openai.Completion.create(
    #    engine="text-davinci-003",
    #    prompt=prompt_keywords,
    #    max_tokens=50
    #)
    #keywords = response_keywords.choices[0].text.strip()

    completion = client.chat.completions.create(
      model=MODEL,
      messages=[
        {"role": "user", "content": "Summarize this page in 50 words." + text}
      ]
    )

    summary = completion.choices[0].message.content
    print(summary)

    return summary

def process(filename):
    # Read HTML file
    with open(filename, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Extract text from HTML
    text = extract_text_from_html(html_content)

    # Get summary and keywords
    summary = get_summary_and_keywords(text)

    print("Summary:")
    print(summary)
    #print("\nKeywords:")
    #print(keywords)


process('./html/myntu.html')

myNTU is the portal for National Taiwan University (NTU) users, requiring login with a Computing Center account. It provides various services, including personal information, course details, student activities, payment, and graduate information. Users can access NTU-specific systems, report network issues, and modify passwords for security. All content is legally protected.
Summary:
myNTU is the portal for National Taiwan University (NTU) users, requiring login with a Computing Center account. It provides various services, including personal information, course details, student activities, payment, and graduate information. Users can access NTU-specific systems, report network issues, and modify passwords for security. All content is legally protected.
