In [2]:
%%time
import pandas as pd
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup

# Load the original dataset
df = pd.read_csv("url_dataset.csv")

# Feature extraction function (URL + content-based)
def extract_features(url):
    # URL-based features (from previous steps)
    parsed = urlparse(url)
    keywords = ["docs", "api", "reference", "sdk", "developer", "guide", "tutorial"]
    has_keyword = any(kw in url.lower() for kw in keywords)
    url_length = len(url)
    path_depth = len(parsed.path.strip('/').split('/')) if parsed.path else 0
    has_query = 1 if parsed.query else 0
    domain = parsed.netloc.lower()
    subdomain = domain.split('.')[0] if '.' in domain else domain
    has_docs_subdomain = 1 if "docs" in subdomain or "dev" in subdomain else 0

    # Content-based features
    code_count = 0
    tech_keyword_count = 0
    content_length = 0
    has_table = 0

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Code snippet count
        code_count = len(soup.find_all(["code", "pre"]))

        # Technical keyword frequency
        tech_keywords = ["api", "endpoint", "library", "function", "class", "method", "module"]
        text = soup.get_text().lower()
        tech_keyword_count = sum(text.count(kw) for kw in tech_keywords)

        # Content length (word count)
        content_length = len(text.split())

        # Presence of tables
        has_table = 1 if soup.find("table") else 0

        print(f"Fetching completed for {url} - {url.index}")

    except Exception as e:
        print(f"Error fetching {url}: {e}")

    return {
        "url": url,
        "url_length": url_length,
        "path_depth": path_depth,
        "has_query": has_query,
        "has_keyword": has_keyword,
        "has_docs_subdomain": has_docs_subdomain,
        "code_count": code_count,
        "tech_keyword_count": tech_keyword_count,
        "content_length": content_length,
        "has_table": has_table
    }

# Extract features for all URLs
data = [extract_features(url) for url in df["url"]]
df_enhanced = pd.DataFrame(data)

# Merge with original labels
df_enhanced = df_enhanced.merge(df[["url", "is_dev_docs"]], on="url")

# Save enhanced dataset
df_enhanced.to_csv("url_dataset_content.csv", index=False)
print("Enhanced dataset with content-based features saved as url_dataset_content.csv")

len("Len: ", df_enhanced), df_enhanced.head

Error fetching https://infohub.delltechnologies.com/t/reference-architecture-guide-dell-technologies-red-hat-openshift-reference-architecture-for-telecom-1/: 403 Client Error: Forbidden for url: https://infohub.delltechnologies.com/t/reference-architecture-guide-dell-technologies-red-hat-openshift-reference-architecture-for-telecom-1/
Error fetching https://wiki.ubuntu.com/UbuntuDevelopment/KnowledgeBase: HTTPSConnectionPool(host='wiki.ubuntu.com', port=443): Read timed out. (read timeout=5)
Error fetching https://www.amazon.com/aws-certified-developer-study-guide/dp/1394274807: 500 Server Error: Internal Server Error for url: https://www.amazon.com/aws-certified-developer-study-guide/dp/1394274807
Error fetching https://forums.unraid.net/topic/187416-guide-deploy-unraid-vms-with-terraform-and-opentofu/: 403 Client Error: Forbidden for url: https://forums.unraid.net/topic/187416-guide-deploy-unraid-vms-with-terraform-and-opentofu/
Error fetching https://www.ibm.com/docs/en/strategicsm/

TypeError: len() takes exactly one argument (2 given)