In [1]:
!pip install boto3

Collecting boto3
  Downloading boto3-1.42.31-py3-none-any.whl.metadata (6.8 kB)
Collecting botocore<1.43.0,>=1.42.31 (from boto3)
  Downloading botocore-1.42.31-py3-none-any.whl.metadata (5.9 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.17.0,>=0.16.0 (from boto3)
  Downloading s3transfer-0.16.0-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.42.31-py3-none-any.whl (140 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading botocore-1.42.31-py3-none-any.whl (14.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.16.0-py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB

In [1]:
import os
import boto3
from botocore import UNSIGNED
from botocore.config import Config

# S3 location to mirror and the local folder it is mirrored into.
# BUCKET: name of the S3 bucket that is listed and downloaded from.
# PREFIX: only keys starting with this string are downloaded; the prefix is
#         stripped from each key to form the path below DEST_DIR.
# DEST_DIR: local root directory for the downloaded files (relative to the CWD).
BUCKET = "multiconer"
PREFIX = "multiconer2023/"          # trailing slash keeps the match on the "folder" boundary
DEST_DIR = "multiconer2023"         # local folder

def _is_within_dir(root: str, candidate: str) -> bool:
    """Return True when ``candidate`` resolves to a location inside ``root``."""
    root_abs = os.path.abspath(root)
    try:
        return os.path.commonpath([root_abs, os.path.abspath(candidate)]) == root_abs
    except ValueError:
        # commonpath raises when the two paths cannot share a root
        # (e.g. different drives on Windows) -> definitely not inside.
        return False


def download_prefix(bucket: str, prefix: str, dest_dir: str) -> None:
    """Download every object under ``prefix`` in ``bucket`` into ``dest_dir``.

    The bucket is accessed with an unsigned S3 client (no credentials are
    sent). The listing is paginated, and each key is written to
    ``dest_dir`` with ``prefix`` stripped from the front of the key, so the
    remote folder layout is reproduced locally.

    Args:
        bucket: Name of the S3 bucket.
        prefix: Key prefix selecting the objects to download.
        dest_dir: Local directory that receives the files.

    Returns:
        None. One line is printed per downloaded (or skipped) object.
    """
    s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))

    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # "Contents" is absent on pages without objects.
        for obj in page.get("Contents", []):
            key = obj["Key"]

            # Skip "directory marker" keys (rare but possible)
            if key.endswith("/"):
                continue

            rel_path = key[len(prefix):] if key.startswith(prefix) else key
            if not rel_path:
                # The key is exactly the prefix (prefix names a single object):
                # fall back to the object's base name instead of targeting a directory.
                rel_path = key.rsplit("/", 1)[-1]

            local_path = os.path.join(dest_dir, rel_path)

            # Keys come from a remote listing; refuse ones that would land
            # outside dest_dir (".." segments, leading "/", ...).
            if not _is_within_dir(dest_dir, local_path):
                print(f"Skipped s3://{bucket}/{key}: resolves outside {dest_dir!r}")
                continue

            # dirname is "" for a bare filename with an empty dest_dir, and
            # os.makedirs("") raises, so only create a parent when there is one.
            parent_dir = os.path.dirname(local_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            # Download
            s3.download_file(bucket, key, local_path)
            print(f"Downloaded s3://{bucket}/{key} -> {local_path}")

if __name__ == "__main__":
    # Create the destination root up front so it exists even when the prefix
    # matches no objects, then mirror the configured prefix into it.
    os.makedirs(DEST_DIR, exist_ok=True)
    download_prefix(bucket=BUCKET, prefix=PREFIX, dest_dir=DEST_DIR)


Downloaded s3://multiconer/multiconer2023/BN-Bangla/bn_dev.conll -> multiconer2023/BN-Bangla/bn_dev.conll
Downloaded s3://multiconer/multiconer2023/BN-Bangla/bn_test.conll -> multiconer2023/BN-Bangla/bn_test.conll
Downloaded s3://multiconer/multiconer2023/BN-Bangla/bn_train.conll -> multiconer2023/BN-Bangla/bn_train.conll
Downloaded s3://multiconer/multiconer2023/DE-German/de_dev.conll -> multiconer2023/DE-German/de_dev.conll
Downloaded s3://multiconer/multiconer2023/DE-German/de_test.conll -> multiconer2023/DE-German/de_test.conll
Downloaded s3://multiconer/multiconer2023/DE-German/de_train.conll -> multiconer2023/DE-German/de_train.conll
Downloaded s3://multiconer/multiconer2023/EN-English/en_dev.conll -> multiconer2023/EN-English/en_dev.conll
Downloaded s3://multiconer/multiconer2023/EN-English/en_test.conll -> multiconer2023/EN-English/en_test.conll
Downloaded s3://multiconer/multiconer2023/EN-English/en_train.conll -> multiconer2023/EN-English/en_train.conll
Downloaded s3://multic

In [None]:
import numpy as np
import pandas as pd
from collections import Counter

# --- mappings (use your existing dicts) ---
# This cell does not define the id -> label-name mappings; it expects both of
# these names to already exist in the kernel before the export code runs:
# fine_id2label = {...}
# coarse_id2label = {...}

# Maximum number of most frequent tags kept in each exported table.
TOP_K = 25  # set None for all tags (less compact)

def trunc10(s: str) -> str:
    """Return at most the first 10 characters of ``s``."""
    # Slicing past the end of a string is a no-op, so shorter inputs come
    # back unchanged without an explicit length check.
    return s[:10]

def topk_df(flat_ids: np.ndarray, id2label: dict, drop_o=True, top_k=25):
    """Build a frequency table of tag ids, most frequent first.

    Args:
        flat_ids: Array of tag ids to count.
        id2label: Mapping from tag id to label name; ids missing from the
            mapping are labelled with their own number.
        drop_o: When true, tag id 0 is excluded from the table.
        top_k: Keep only this many rows; ``None`` keeps every tag.

    Returns:
        DataFrame with columns ``tag_id``, ``count`` and ``label``, where
        ``label`` is ``"<id>:<name truncated to 10 characters>"``.
    """

    def _label_for(tag_id) -> str:
        # label: "<id>:<name_truncated_to_10>"
        tid = int(tag_id)
        return f"{tid}:{trunc10(id2label.get(tid, str(tid)))}"

    # most_common() is a stable descending sort by count, so ties keep the
    # order in which the ids were first seen.
    ranked = [
        (tag_id, n)
        for tag_id, n in Counter(flat_ids.tolist()).most_common()
        if not (drop_o and tag_id == 0)
    ]
    if top_k is not None:
        ranked = ranked[:top_k]

    table = pd.DataFrame(ranked, columns=["tag_id", "count"])
    table["label"] = table["tag_id"].map(_label_for)

    # Rows stay in descending order in the file; the LaTeX side reverses the
    # y axis to put the largest bar on top.
    return table

# NOTE(review): this cell relies on `ds_test`, `fine_id2label` and
# `coarse_id2label` already existing in the kernel — confirm they are defined
# by an earlier cell so Restart & Run All works.

# Join the per-row tag-id sequences of each column into one flat 1-D array.
flat_fine = np.concatenate(ds_test.ner_tags_index.to_numpy())
flat_coarse = np.concatenate(ds_test.coarse_ner_tag_idx.to_numpy())

# O vs Entity (assumes O == 0)
is_entity = flat_fine > 0
# Convert to Python bools before counting (as topk_df does with its ids):
# feeding the NumPy array straight to Counter boxes one np.bool_ per element.
c_bool = Counter(is_entity.tolist())
df_bool = pd.DataFrame({
    "label": ["O", "Entity"],
    # A Counter returns 0 for a missing key, so an all-O or all-entity split is fine.
    "count": [c_bool[False], c_bool[True]],
})

# Top-K frequency tables for the coarse and fine tag sets, without the O tag.
df_coarse = topk_df(flat_coarse, coarse_id2label, drop_o=True, top_k=TOP_K)
df_fine   = topk_df(flat_fine,   fine_id2label,   drop_o=True, top_k=TOP_K)

# Written to the current working directory, without the DataFrame index.
df_bool.to_csv("o_vs_entity.csv", index=False)
df_coarse.to_csv("coarse_topk.csv", index=False)
df_fine.to_csv("fine_topk.csv", index=False)

print("Wrote: o_vs_entity.csv, coarse_topk.csv, fine_topk.csv")