<a href="https://colab.research.google.com/github/liljar2004-sudo/Kenjar_DTSC3020/blob/main/assignment6_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# --- Q1 Skeleton (filled) ---
def q1_read_table(html: str) -> pd.DataFrame:
    """Return the first table with >= 3 columns, with flattened headers.

    Args:
        html: Either literal HTML markup or a URL / file path that
            ``pandas.read_html`` can open.

    Returns:
        The first parsed table that has at least three columns. If its
        header is a MultiIndex, the levels are joined with ``_`` (empty
        levels are skipped) so the columns become plain strings.

    Raises:
        ValueError: If no table with at least three columns is found
            (``pandas.read_html`` itself also raises ValueError when the
            document contains no tables at all).
    """
    from io import StringIO

    # Newer pandas versions no longer accept literal markup as a plain
    # string; wrapping it in a file-like object works on old and new
    # versions. URLs and paths do not start with "<" and pass through as-is.
    source = html
    if isinstance(html, str) and html.lstrip().startswith("<"):
        source = StringIO(html)

    for table in pd.read_html(source):
        if table.shape[1] < 3:
            continue
        if isinstance(table.columns, pd.MultiIndex):
            # Collapse each header tuple into one "level1_level2" label.
            table.columns = [
                '_'.join(str(level) for level in col if level)
                for col in table.columns
            ]
        return table

    raise ValueError("No table with >= 3 columns found")


def q1_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a country-code table and drop rows with missing fields.

    Steps:
      1. Strip whitespace from the column labels.
      2. Keep only the Country / Alpha-2 / Alpha-3 / Numeric columns that
         are present.
      3. Strip and upper-case the text columns; missing or blank values
         stay missing.
      4. Convert Numeric to nullable ``Int64`` (unparseable values -> NA).
      5. Drop every row that has a missing value in any kept column.

    Args:
        df: Raw table. It is not modified.

    Returns:
        A new, cleaned DataFrame.
    """
    expected_cols = ['Country', 'Alpha-2', 'Alpha-3', 'Numeric']
    text_cols = ['Country', 'Alpha-2', 'Alpha-3']

    # str() guards against non-string labels (e.g. integer column names).
    df = df.rename(columns=lambda name: str(name).strip())

    # Copy so the assignments below never write into a view of the input.
    df = df.loc[:, df.columns.intersection(expected_cols)].copy()

    for col in text_cols:
        if col not in df.columns:
            continue
        original = df[col]
        cleaned = original.astype(str).str.strip().str.upper()
        # astype(str) turns missing values into the text "nan"/"None";
        # mask them (and blank strings) back to NA so dropna can see them.
        df[col] = cleaned.where(original.notna() & (cleaned != ""))

    if 'Numeric' in df.columns:
        df['Numeric'] = pd.to_numeric(df['Numeric'], errors='coerce').astype('Int64')

    # Only require the mandatory columns that exist, consistent with the
    # "if present" handling above (avoids a KeyError in dropna).
    present = [col for col in expected_cols if col in df.columns]
    return df.dropna(subset=present)


def q1_sort_top(df: pd.DataFrame, top: int = 15) -> pd.DataFrame:
    """Return the first ``top`` rows after ordering by Numeric, largest first."""
    ranked = df.sort_values('Numeric', ascending=False)
    return ranked.head(top)


# --- Example usage ---
if __name__ == "__main__":
    # Q1 pipeline: read a table from the page, clean it, write the cleaned
    # table to CSV and print the 15 rows with the largest Numeric value.
    # NOTE(review): q1_clean keeps only the Country / Alpha-2 / Alpha-3 /
    # Numeric columns -- confirm that this URL actually serves a table with
    # those headers.
    html_file = "https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry.html"
    df_raw = q1_read_table(html_file)  # html_file is a URL, passed on to pd.read_html
    df_clean = q1_clean(df_raw)
    df_top15 = q1_sort_top(df_clean, top=15)

    # Save the full cleaned table (not just the Top-15) to CSV
    df_clean.to_csv("data_q1.csv", index=False)

    # Print Top-15
    print(df_top15)


In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re

# --- Q2 Skeleton (filled) ---
def q2_parse_items(html: str) -> pd.DataFrame:
    """Parse Hacker News front-page stories into a DataFrame.

    Args:
        html: Markup of the front page.

    Returns:
        One row per ``.athing`` story row, with the columns
        ``rank``, ``title``, ``link``, ``points``, ``comments`` and ``user``.
        ``rank`` is the raw text of the rank cell; ``points`` and
        ``comments`` are ints and default to 0; text fields default to "".
        The columns are present even when no story row is found, so the
        result can always be passed to the cleaning / sorting helpers.
    """
    columns = ["rank", "title", "link", "points", "comments", "user"]
    soup = BeautifulSoup(html, "html.parser")

    items = []
    for row in soup.select(".athing"):
        rank_tag = row.select_one(".rank")
        rank = rank_tag.text if rank_tag is not None else ""

        # Newer front-page markup nests the story link inside
        # span.titleline; the older a.storylink class is kept as a fallback.
        title_tag = row.select_one(".titleline > a")
        if title_tag is None:
            title_tag = row.select_one(".storylink")
        title = title_tag.text if title_tag is not None else ""
        link = title_tag.get("href", "") if title_tag is not None else ""

        points = 0
        comments = 0
        user = ""

        # Score, author and comment count live in the row that follows
        # the story row.
        subtext_row = row.find_next_sibling("tr")
        subtext = subtext_row.select_one(".subtext") if subtext_row is not None else None
        if subtext is not None:
            # Extract points
            points_tag = subtext.select_one(".score")
            if points_tag is not None:
                points_match = re.search(r"(\d+)", points_tag.text)
                points = int(points_match.group(1)) if points_match else 0

            # Extract comments: the last link of the subtext is the comment
            # link when its text mentions "comment" (otherwise keep 0).
            anchors = subtext.find_all("a")
            if anchors and "comment" in anchors[-1].text:
                comments_match = re.search(r"(\d+)", anchors[-1].text)
                comments = int(comments_match.group(1)) if comments_match else 0

            # Extract user
            user_tag = subtext.select_one(".hnuser")
            user = user_tag.text if user_tag is not None else ""

        items.append({
            "rank": rank,
            "title": title,
            "link": link,
            "points": points,
            "comments": comments,
            "user": user,
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(items, columns=columns)


def q2_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the parsed items.

    ``rank``, ``points`` and ``comments`` are converted to ints
    (unparseable or missing values become 0); missing ``title``, ``link``
    and ``user`` values become empty strings.

    Args:
        df: Parsed items. It is not modified.

    Returns:
        A new DataFrame with the cleaned columns.
    """
    # Work on a copy so the caller's raw frame keeps its original values.
    df = df.copy()

    # Clean rank, points, comments
    for col in ['rank', 'points', 'comments']:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    # Fill missing text fields
    for col in ['title', 'link', 'user']:
        df[col] = df[col].fillna("")

    return df


def q2_sort_top(df: pd.DataFrame, top: int = 15) -> pd.DataFrame:
    """Return the first ``top`` rows after ordering by points, highest first."""
    ranked = df.sort_values('points', ascending=False)
    return ranked.head(top)


# --- Example usage ---
if __name__ == "__main__":
    # Q2 pipeline: download the front page, parse and clean the items,
    # write the cleaned table to CSV and print the 15 highest-scoring rows.
    url = "https://news.ycombinator.com/"

    # A timeout keeps the cell from hanging on a stalled connection, and
    # raise_for_status() stops here on an HTTP error instead of parsing an
    # error page as if it were the front page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html = response.text

    df_raw = q2_parse_items(html)
    df_clean = q2_clean(df_raw)
    df_top15 = q2_sort_top(df_clean, top=15)

    # Save the full cleaned table (not just the Top-15) to CSV
    df_clean.to_csv("data_q2.csv", index=False)

    # Print Top-15
    print(df_top15)
