In [None]:
# Sample ArchDaily project page URL; later cells assign `url` again before using it.
url = "https://www.archdaily.com/945987/77-washington-workspace-worrell-yeung-architecture?ad_source=search&ad_medium=projects_tab"


In [2]:
import requests
from bs4 import BeautifulSoup
import re
import json

def extract_project_info(url, timeout=30):
    """Scrape the title, spec fields and description of an ArchDaily project page.

    Args:
        url: Address of the project page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        dict mapping lower-cased field labels to their text values. ``'title'``
        is always present (``'Title not found'`` when the heading is missing);
        ``'description'`` is added when the page's JSON-LD block provides one.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    project_info = {}

    # Title
    title = soup.find('h1', class_='afd-title-big')
    project_info['title'] = title.text.strip() if title else 'Title not found'

    # Basic facts (Area, Year, Photographs, ...) rendered as label/value spans
    for item in soup.find_all('li', class_='afd-char-item'):
        label = item.find('span', class_='afd-char-item__label')
        value = item.find('span', class_='afd-char-item__value')
        if label and value:
            key = label.text.strip().lower().replace(':', '')
            project_info[key] = value.text.strip()

    # Additional facts (Manufacturers, Structural, Contractor, ...) as "Label: value" paragraphs
    additional_info = soup.find('div', class_='afd-char-content')
    if additional_info:
        for item in additional_info.find_all('p'):
            # Split on the first colon only, so values that contain a colon
            # themselves (URLs, times) are kept instead of being dropped.
            key, separator, value = item.text.partition(':')
            if separator:
                project_info[key.strip().lower()] = value.strip()

    # JSON-LD metadata (used for the description)
    script = soup.find('script', type='application/ld+json')
    # `.string` is None for an empty tag or one with several children;
    # passing None to json.loads would raise TypeError, not JSONDecodeError.
    if script and script.string:
        try:
            json_data = json.loads(script.string)
        except json.JSONDecodeError:
            print("Error decoding JSON-LD data")
        else:
            # JSON-LD may also be a list of objects; only a dict can carry the key directly.
            if isinstance(json_data, dict) and 'description' in json_data:
                project_info['description'] = json_data['description']

    return project_info

# Smoke test: run the extractor against a known ArchDaily project page.
url = "https://www.archdaily.com/945987/77-washington-workspace-worrell-yeung-architecture?ad_source=search&ad_medium=projects_tab"
result = extract_project_info(url)

# Dump every field that was extracted.
print("Extracted Project Information:")
for key in result:
    value = result[key]
    print(f"{key.capitalize()}: {value}")

# Report which of the expected spec fields were actually captured.
expected_keys = ['area', 'year', 'photographs', 'manufacturers', 'architects']
for key in expected_keys:
    print(
        f"\n{key.capitalize()} found: {result[key]}"
        if key in result
        else f"\n{key.capitalize()} not found in the extracted data"
    )

Extracted Project Information:
Title: 77 Washington Workspace  / Worrell Yeung Architecture
Description: Completed in 2020 in Brooklyn, United States. Images by Naho Kubota. The project consists of the renovation and redesign of 77 Washington, a six-story, 38,000 square-foot, former masonry factory built in the 1920s, as...

Area not found in the extracted data

Year not found in the extracted data

Photographs not found in the extracted data

Manufacturers not found in the extracted data

Architects not found in the extracted data


In [6]:
import requests
from bs4 import BeautifulSoup
import re
import json

def extract_project_info(url, timeout=30):
    """Debug variant of the scraper: collect every "label: value" text on the page.

    Prints a ``DEBUG:`` trace of each step (div classes seen, pairs extracted)
    so the page structure can be inspected.

    Args:
        url: Address of the project page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        dict with ``'title'``, one entry per "label: value" text found in an
        ``<li>``/``<p>`` inside any ``<div>`` (label lower-cased), and
        ``'description'`` when the JSON-LD block provides one.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    project_info = {}

    print("DEBUG: Webpage fetched successfully")

    # Title
    title = soup.find('h1', class_='afd-title-big')
    project_info['title'] = title.text.strip() if title else 'Title not found'
    print(f"DEBUG: Title extracted: {project_info['title']}")

    # Walk every div on the page
    all_divs = soup.find_all('div')
    print(f"DEBUG: Total number of divs found: {len(all_divs)}")

    for div in all_divs:
        # Show the div's classes to help locate the spec container
        classes = div.get('class')
        if classes:
            print(f"DEBUG: Div class found: {' '.join(classes)}")

        # Try to read "label: value" pairs; nested divs mean an element can be
        # visited more than once, later visits overwrite the same key.
        for item in div.find_all(['li', 'p']):
            text = item.text.strip()
            if ':' in text:
                key, value = text.split(':', 1)
                project_info[key.strip().lower()] = value.strip()
                print(f"DEBUG: Extracted - {key.strip()}: {value.strip()}")

    # JSON-LD metadata (used for the description)
    script = soup.find('script', type='application/ld+json')
    # `.string` is None for an empty tag or one with several children;
    # passing None to json.loads would raise TypeError, not JSONDecodeError.
    if script and script.string:
        try:
            json_data = json.loads(script.string)
        except json.JSONDecodeError:
            print("DEBUG: Error decoding JSON-LD data")
        else:
            if isinstance(json_data, dict) and 'description' in json_data:
                project_info['description'] = json_data['description']
            print("DEBUG: JSON-LD data extracted successfully")

    return project_info

# Smoke test: run the debug extractor against a known ArchDaily project page.
url = "https://www.archdaily.com/945987/77-washington-workspace-worrell-yeung-architecture?ad_source=search&ad_medium=projects_tab"
result = extract_project_info(url)

# Dump every field that was extracted.
print("\nExtracted Project Information:")
for key in result:
    value = result[key]
    print(f"{key.capitalize()}: {value}")

# Report which of the expected spec fields were actually captured.
expected_keys = ['area', 'year', 'photographs', 'manufacturers', 'architects']
for key in expected_keys:
    print(
        f"\n{key.capitalize()} found: {result[key]}"
        if key in result
        else f"\n{key.capitalize()} not found in the extracted data"
    )

DEBUG: Webpage fetched successfully
DEBUG: Title extracted: 77 Washington Workspace  / Worrell Yeung Architecture
DEBUG: Total number of divs found: 511
DEBUG: Div class found: js-slot banners--with-background banners--billboard
DEBUG: Div class found: afd-hide
DEBUG: Div class found: jv-header__bottom jv-header__bottom--fixed jv-header__bottom--on-screen
DEBUG: Div class found: jv-container jv-header__bottom-container
DEBUG: Div class found: jv-header__right
DEBUG: Div class found: jv-hamburger-menu__button
DEBUG: Div class found: jv-hamburger-menu__container
DEBUG: Div class found: jv-container
DEBUG: Div class found: jv-hamburger-menu__header
DEBUG: Div class found: jv-header__hider
DEBUG: Div class found: afd-site-selector afd-desktop-e
DEBUG: Div class found: afd-site-selector__country
DEBUG: Div class found: jv-hamburger-menu__footer
DEBUG: Div class found: afd-footer__rights
DEBUG: Div class found: afd-footer__rights__txt-wrap
DEBUG: Div class found: afd-header-main__container a

In [8]:
import requests
from bs4 import BeautifulSoup
import re
import json

def extract_project_info(url, timeout=30):
    """Scrape "label: value" facts from an ArchDaily project page, normalising area and year.

    Prints a ``DEBUG:`` trace of each extracted pair.

    Args:
        url: Address of the project page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        dict with ``'title'``; ``'area'`` as ``"<number> <unit>"`` (unit is
        ``ft²`` or ``m²``) and ``'year'`` as four digits when they can be
        parsed; every other "label: value" text keyed by its lower-cased
        label; and ``'description'`` when the JSON-LD block provides one.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    project_info = {}

    print("DEBUG: Webpage fetched successfully")

    # Title
    title = soup.find('h1', class_='afd-title-big')
    project_info['title'] = title.text.strip() if title else 'Title not found'
    print(f"DEBUG: Title extracted: {project_info['title']}")

    # Number with any count of ","/"." groups, followed by an imperial or metric unit.
    # Accepting only "ft²" would silently drop the area of metric projects.
    area_pattern = re.compile(r'(\d+(?:[.,]\d+)*)\s*(ft²|m²)')

    # Walk every div on the page
    all_divs = soup.find_all('div')
    print(f"DEBUG: Total number of divs found: {len(all_divs)}")

    for div in all_divs:
        # Nested divs mean an element can be visited more than once; later
        # visits overwrite the same key.
        for item in div.find_all(['li', 'p']):
            text = item.text.strip()
            if ':' not in text:
                continue
            key, value = text.split(':', 1)
            key = key.strip().lower()
            value = value.strip()

            # The label text can carry extra tooltip wording, hence substring checks.
            if 'area' in key:
                match = area_pattern.search(value)
                if match:
                    project_info['area'] = f"{match.group(1)} {match.group(2)}"
            elif 'year' in key:
                match = re.search(r'\d{4}', value)
                if match:
                    project_info['year'] = match.group(0)
            else:
                project_info[key] = value

            print(f"DEBUG: Extracted - {key}: {value}")

    # JSON-LD metadata (used for the description)
    script = soup.find('script', type='application/ld+json')
    # `.string` is None for an empty tag or one with several children;
    # passing None to json.loads would raise TypeError, not JSONDecodeError.
    if script and script.string:
        try:
            json_data = json.loads(script.string)
        except json.JSONDecodeError:
            print("DEBUG: Error decoding JSON-LD data")
        else:
            if isinstance(json_data, dict) and 'description' in json_data:
                project_info['description'] = json_data['description']
            print("DEBUG: JSON-LD data extracted successfully")

    return project_info

# Smoke test: run the extractor against a known ArchDaily project page.
url = "https://www.archdaily.com/945987/77-washington-workspace-worrell-yeung-architecture?ad_source=search&ad_medium=projects_tab"
result = extract_project_info(url)

# Dump every field that was extracted.
print("\nExtracted Project Information:")
for key in result:
    value = result[key]
    print(f"{key.capitalize()}: {value}")

# Report which of the expected spec fields were actually captured.
expected_keys = ['area', 'year', 'photographs', 'manufacturers', 'architects']
for key in expected_keys:
    print(
        f"\n{key.capitalize()} found: {result[key]}"
        if key in result
        else f"\n{key.capitalize()} not found in the extracted data"
    )

DEBUG: Webpage fetched successfully
DEBUG: Title extracted: 77 Washington Workspace  / Worrell Yeung Architecture
DEBUG: Total number of divs found: 511
DEBUG: Extracted - architects: Worrell Yeung Architecture
DEBUG: Extracted - area
area of this architecture project
























area: 31500 ft²
DEBUG: Extracted - year
completion year of this architecture project 


















            year: 2020
DEBUG: Extracted - photographs







photographs: Naho Kubota
DEBUG: Extracted - manufacturers
brands with products used in this architecture project 



















manufacturers: Alcon Lighting®
DEBUG: Extracted - elevator: Jenkins & Huntington
DEBUG: Extracted - structural: Silman Structural Engineers, Silman
DEBUG: Extracted - expeditor: RPO
DEBUG: Extracted - contractor: Corcon
DEBUG: Extracted - mep: Engineering Solutions
DEBUG: Extracted - landscape: MVVA
DEBUG: Extracted - lighting: Engineering Solutions
DEBUG: Extracted - security: Lerner Solutions
DEBUG: Extr

In [15]:
import requests
from bs4 import BeautifulSoup
import re
import json
from openai import OpenAI
import os
# from dotenv import load_dotenv

# # 환경 변수 로드
# load_dotenv()

# OpenAI 클라이언트 초기화
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def extract_project_info(url, timeout=30):
    """Scrape title, year and area from an ArchDaily project page, plus its full text.

    Args:
        url: Address of the project page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        ``(project_info, full_text)``. ``project_info`` always holds
        ``'title'`` and, when they can be parsed, ``'year'`` (four digits) and
        ``'area'`` as ``"<number> <unit>"`` where unit is ``ft²`` or ``m²``.
        ``full_text`` is the text of the whole page.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    project_info = {}

    # Title
    title = soup.find('h1', class_='afd-title-big')
    project_info['title'] = title.text.strip() if title else 'Title not found'

    # Number with any count of ","/"." groups, followed by an imperial or metric unit.
    # Accepting only "ft²" would silently drop the area of metric projects.
    area_pattern = re.compile(r'(\d+(?:[.,]\d+)*)\s*(ft²|m²)')

    # Year and area: scan the <li>/<p> text inside every <div> for "label: value".
    # Nested divs mean an element can be visited more than once; later visits
    # overwrite the same key.
    for div in soup.find_all('div'):
        for item in div.find_all(['li', 'p']):
            text = item.text.strip()
            if ':' not in text:
                continue
            key, value = text.split(':', 1)
            key = key.strip().lower()
            value = value.strip()

            # The label text can carry extra tooltip wording, hence substring checks.
            if 'area' in key:
                match = area_pattern.search(value)
                if match:
                    project_info['area'] = f"{match.group(1)} {match.group(2)}"
            elif 'year' in key:
                match = re.search(r'\d{4}', value)
                if match:
                    project_info['year'] = match.group(0)

    # Full page text
    full_text = soup.get_text()

    return project_info, full_text

def extract_keywords_gpt(text, model="gpt-3.5-turbo", max_chars=4000):
    """Ask an OpenAI chat model for the ten most important keywords in ``text``.

    Uses the module-level ``client`` object, which must be defined before this
    function is called (its initialisation is commented out in this cell, so
    calling the function as-is raises ``NameError``).

    Args:
        text: Source text; only the first ``max_chars`` characters are sent.
        model: Chat model name passed to the API.
        max_chars: Upper bound on how much of ``text`` goes into the prompt.

    Returns:
        list of keyword strings, whitespace-trimmed, with empty entries removed.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that extracts key words and phrases from given text."},
            {"role": "user", "content": f"Extract the 10 most important keywords or key phrases from the following text. Provide them as a comma-separated list:\n\n{text[:max_chars]}"}
        ]
    )
    # The message content may be None; treat that as "no keywords".
    content = completion.choices[0].message.content or ""
    # The model does not reliably separate items with exactly ", ", so split
    # on the comma alone and trim each piece.
    return [keyword.strip() for keyword in content.split(',') if keyword.strip()]

# Test run against a sample ArchDaily project page.
url = "https://www.archdaily.com/945987/77-washington-workspace-worrell-yeung-architecture?ad_source=search&ad_medium=projects_tab"
project_info, full_text = extract_project_info(url)

# Keyword extraction through GPT is currently switched off:
# keywords = extract_keywords_gpt(full_text)

# Print the summary in one call; each argument ends up on its own line.
print(
    "\nExtracted Project Information:",
    f"Title: {project_info.get('title', 'Not found')}",
    f"Year: {project_info.get('year', 'Not found')}",
    f"Area: {project_info.get('area', 'Not found')}",
    "\nExtracted Keywords:",
    sep="\n",
)
# print(', '.join(keywords))


Extracted Project Information:
Title: 77 Washington Workspace  / Worrell Yeung Architecture
Year: 2020
Area: 31500 ft²

Extracted Keywords:


In [13]:
import requests
from bs4 import BeautifulSoup
import re
import json
from openai import OpenAI
import os
# from dotenv import load_dotenv

# # 환경 변수 로드
# load_dotenv()

# OpenAI 클라이언트 초기화
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def extract_project_info(url, timeout=30):
    """Scrape title, year and area from an ArchDaily project page, plus its full text.

    Args:
        url: Address of the project page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        ``(project_info, full_text)``. ``project_info`` always holds
        ``'title'`` and, when they can be parsed, ``'year'`` (four digits) and
        ``'area'`` as ``"<number> <unit>"`` where unit is ``ft²`` or ``m²``.
        ``full_text`` is the text of the whole page.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    project_info = {}

    # Title
    title = soup.find('h1', class_='afd-title-big')
    project_info['title'] = title.text.strip() if title else 'Title not found'

    # Number with any count of ","/"." groups, followed by an imperial or metric unit.
    # Accepting only "ft²" would silently drop the area of metric projects.
    area_pattern = re.compile(r'(\d+(?:[.,]\d+)*)\s*(ft²|m²)')

    # Year and area: scan the <li>/<p> text inside every <div> for "label: value".
    # Nested divs mean an element can be visited more than once; later visits
    # overwrite the same key.
    for div in soup.find_all('div'):
        for item in div.find_all(['li', 'p']):
            text = item.text.strip()
            if ':' not in text:
                continue
            key, value = text.split(':', 1)
            key = key.strip().lower()
            value = value.strip()

            # The label text can carry extra tooltip wording, hence substring checks.
            if 'area' in key:
                match = area_pattern.search(value)
                if match:
                    project_info['area'] = f"{match.group(1)} {match.group(2)}"
            elif 'year' in key:
                match = re.search(r'\d{4}', value)
                if match:
                    project_info['year'] = match.group(0)

    # Full page text
    full_text = soup.get_text()

    return project_info, full_text

def extract_keywords_gpt(text, model="gpt-3.5-turbo", max_chars=4000):
    """Ask an OpenAI chat model for the ten most important keywords in ``text``.

    Uses the module-level ``client`` object, which must be defined before this
    function is called (its initialisation is commented out in this cell, so
    calling the function as-is raises ``NameError``).

    Args:
        text: Source text; only the first ``max_chars`` characters are sent.
        model: Chat model name passed to the API.
        max_chars: Upper bound on how much of ``text`` goes into the prompt.

    Returns:
        list of keyword strings, whitespace-trimmed, with empty entries removed.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that extracts key words and phrases from given text."},
            {"role": "user", "content": f"Extract the 10 most important keywords or key phrases from the following text. Provide them as a comma-separated list:\n\n{text[:max_chars]}"}
        ]
    )
    # The message content may be None; treat that as "no keywords".
    content = completion.choices[0].message.content or ""
    # The model does not reliably separate items with exactly ", ", so split
    # on the comma alone and trim each piece.
    return [keyword.strip() for keyword in content.split(',') if keyword.strip()]

# Test run against a sample ArchDaily project page.
url = "https://www.archdaily.com/1019034/dr-cosmo-design-story-mixed-use-space-another-d-studio?ad_source=search&ad_medium=projects_tab"
project_info, full_text = extract_project_info(url)

# Keyword extraction through GPT is currently switched off:
# keywords = extract_keywords_gpt(full_text)

# Print the summary in one call; each argument ends up on its own line.
print(
    "\nExtracted Project Information:",
    f"Title: {project_info.get('title', 'Not found')}",
    f"Year: {project_info.get('year', 'Not found')}",
    f"Area: {project_info.get('area', 'Not found')}",
    "\nExtracted Keywords:",
    sep="\n",
)
# print(', '.join(keywords))


Extracted Project Information:
Title: Dr. COSMO Design Story Mixed Use Space / Another D Studio
Year: 2024
Area: Not found

Extracted Keywords:


In [14]:
import requests
from bs4 import BeautifulSoup
import re
import json
from openai import OpenAI
import os
# from dotenv import load_dotenv

# # 환경 변수 로드
# load_dotenv()

# OpenAI 클라이언트 초기화
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def extract_project_info(url, timeout=30):
    """Scrape title, year and area from an ArchDaily project page, plus its full text.

    Args:
        url: Address of the project page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        ``(project_info, full_text)``. ``project_info`` always holds
        ``'title'`` and, when they can be parsed, ``'year'`` (four digits) and
        ``'area'`` as ``"<number> <unit>"`` where unit is ``ft²`` or ``m²``.
        ``full_text`` is the text of the whole page.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    project_info = {}

    # Title
    title = soup.find('h1', class_='afd-title-big')
    project_info['title'] = title.text.strip() if title else 'Title not found'

    # Number with any count of ","/"." groups, followed by an imperial or metric unit.
    # Accepting only "ft²" would silently drop the area of metric projects.
    area_pattern = re.compile(r'(\d+(?:[.,]\d+)*)\s*(ft²|m²)')

    # Year and area: scan the <li>/<p> text inside every <div> for "label: value".
    # Nested divs mean an element can be visited more than once; later visits
    # overwrite the same key.
    for div in soup.find_all('div'):
        for item in div.find_all(['li', 'p']):
            text = item.text.strip()
            if ':' not in text:
                continue
            key, value = text.split(':', 1)
            key = key.strip().lower()
            value = value.strip()

            # The label text can carry extra tooltip wording, hence substring checks.
            if 'area' in key:
                match = area_pattern.search(value)
                if match:
                    project_info['area'] = f"{match.group(1)} {match.group(2)}"
            elif 'year' in key:
                match = re.search(r'\d{4}', value)
                if match:
                    project_info['year'] = match.group(0)

    # Full page text
    full_text = soup.get_text()

    return project_info, full_text

def extract_keywords_gpt(text, model="gpt-3.5-turbo", max_chars=4000):
    """Ask an OpenAI chat model for the ten most important keywords in ``text``.

    Uses the module-level ``client`` object, which must be defined before this
    function is called (its initialisation is commented out in this cell, so
    calling the function as-is raises ``NameError``).

    Args:
        text: Source text; only the first ``max_chars`` characters are sent.
        model: Chat model name passed to the API.
        max_chars: Upper bound on how much of ``text`` goes into the prompt.

    Returns:
        list of keyword strings, whitespace-trimmed, with empty entries removed.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that extracts key words and phrases from given text."},
            {"role": "user", "content": f"Extract the 10 most important keywords or key phrases from the following text. Provide them as a comma-separated list:\n\n{text[:max_chars]}"}
        ]
    )
    # The message content may be None; treat that as "no keywords".
    content = completion.choices[0].message.content or ""
    # The model does not reliably separate items with exactly ", ", so split
    # on the comma alone and trim each piece.
    return [keyword.strip() for keyword in content.split(',') if keyword.strip()]

# Test run against a sample ArchDaily project page.
url = "https://www.archdaily.com/1017300/5-rooms-hyper?ad_source=search&ad_medium=projects_tab"
project_info, full_text = extract_project_info(url)

# Keyword extraction through GPT is currently switched off:
# keywords = extract_keywords_gpt(full_text)

# Print the summary in one call; each argument ends up on its own line.
print(
    "\nExtracted Project Information:",
    f"Title: {project_info.get('title', 'Not found')}",
    f"Year: {project_info.get('year', 'Not found')}",
    f"Area: {project_info.get('area', 'Not found')}",
    "\nExtracted Keywords:",
    sep="\n",
)
# print(', '.join(keywords))


Extracted Project Information:
Title: 5 Rooms / Hyper
Year: 2024
Area: Not found

Extracted Keywords:


In [16]:
import requests
from bs4 import BeautifulSoup
import re
import json
from openai import OpenAI
import os
# from dotenv import load_dotenv

# # 환경 변수 로드
# load_dotenv()

# OpenAI 클라이언트 초기화
# client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def extract_project_info(url, timeout=30):
    """Scrape title, year and area from an ArchDaily project page, plus its full text.

    Args:
        url: Address of the project page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        ``(project_info, full_text)``. ``project_info`` always holds
        ``'title'`` and, when they can be parsed, ``'year'`` (four digits) and
        ``'area'`` as ``"<number> <unit>"`` where unit is ``ft²`` or ``m²``.
        ``full_text`` is the text of the whole page.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    project_info = {}

    # Title
    title = soup.find('h1', class_='afd-title-big')
    project_info['title'] = title.text.strip() if title else 'Title not found'

    # Number with any count of ","/"." groups, followed by an imperial or metric unit.
    # Accepting only "ft²" would silently drop the area of metric projects.
    area_pattern = re.compile(r'(\d+(?:[.,]\d+)*)\s*(ft²|m²)')

    # Year and area: scan the <li>/<p> text inside every <div> for "label: value".
    # Nested divs mean an element can be visited more than once; later visits
    # overwrite the same key.
    for div in soup.find_all('div'):
        for item in div.find_all(['li', 'p']):
            text = item.text.strip()
            if ':' not in text:
                continue
            key, value = text.split(':', 1)
            key = key.strip().lower()
            value = value.strip()

            # The label text can carry extra tooltip wording, hence substring checks.
            if 'area' in key:
                match = area_pattern.search(value)
                if match:
                    project_info['area'] = f"{match.group(1)} {match.group(2)}"
            elif 'year' in key:
                match = re.search(r'\d{4}', value)
                if match:
                    project_info['year'] = match.group(0)

    # Full page text
    full_text = soup.get_text()

    return project_info, full_text

def extract_keywords_gpt(text, model="gpt-3.5-turbo", max_chars=4000):
    """Ask an OpenAI chat model for the ten most important keywords in ``text``.

    Uses the module-level ``client`` object, which must be defined before this
    function is called (its initialisation is commented out in this cell, so
    calling the function as-is raises ``NameError``).

    Args:
        text: Source text; only the first ``max_chars`` characters are sent.
        model: Chat model name passed to the API.
        max_chars: Upper bound on how much of ``text`` goes into the prompt.

    Returns:
        list of keyword strings, whitespace-trimmed, with empty entries removed.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that extracts key words and phrases from given text."},
            {"role": "user", "content": f"Extract the 10 most important keywords or key phrases from the following text. Provide them as a comma-separated list:\n\n{text[:max_chars]}"}
        ]
    )
    # The message content may be None; treat that as "no keywords".
    content = completion.choices[0].message.content or ""
    # The model does not reliably separate items with exactly ", ", so split
    # on the comma alone and trim each piece.
    return [keyword.strip() for keyword in content.split(',') if keyword.strip()]

# Test run against a sample ArchDaily project page.
url = "https://www.archdaily.com/1016658/common-knowledge-studio-thiss-studio?ad_source=search&ad_medium=projects_tab"
project_info, full_text = extract_project_info(url)

# Keyword extraction through GPT is currently switched off:
# keywords = extract_keywords_gpt(full_text)

# Print the summary in one call; each argument ends up on its own line.
print(
    "\nExtracted Project Information:",
    f"Title: {project_info.get('title', 'Not found')}",
    f"Year: {project_info.get('year', 'Not found')}",
    f"Area: {project_info.get('area', 'Not found')}",
    "\nExtracted Keywords:",
    sep="\n",
)
# print(', '.join(keywords))


Extracted Project Information:
Title: Common Knowledge Studio / THISS Studio
Year: 2023
Area: Not found

Extracted Keywords:
