In [None]:
import pandas as pd
from pathlib import Path
import json
from collections import Counter, defaultdict

# Test-set locations, resolved relative to the current working directory.
TEST_DIR = Path("..", "data", "test")
NER_FILE = TEST_DIR.joinpath("NER-test.tsv")
ST_FILE = TEST_DIR.joinpath("sentiment-topic-test.tsv")

# Stop right away if either input file is missing.
assert NER_FILE.exists(), f"Cannot find {NER_FILE}"
assert ST_FILE.exists(), f"Cannot find {ST_FILE}"

print(f"Found test files in {TEST_DIR.absolute()}")

def _looks_like_ner_tag(value):
    """Return True if *value* is ``O`` or has the form ``<letter>-<TYPE>``."""
    value = value.strip()
    if value == "O":
        return True
    return len(value) > 2 and value[0].isalpha() and value[1] == "-"


def load_ner_tsv(path: Path, *, skip_header=None):
    """
    CoNLL-style reader.

    Keeps first column as token, last column as NER tag.
    A blank line (empty or whitespace-only) and a line starting with '#' or
    '-DOCSTART-' close the current sentence and never produce a token.

    A column-header row (for example one ending in ``BIO_NER_tag``) is not
    data, so it is dropped instead of being counted as a token and a label.

    Args:
        path: Tab-separated file to read, decoded as UTF-8.
        skip_header: ``True`` always drops the first content line, ``False``
            never drops it. ``None`` (default) auto-detects: the first content
            line is dropped when its last column is neither ``O`` nor of the
            form ``<letter>-<TYPE>`` (``B-PERSON``, ``I-ORG``, ...).

    Returns:
        A list of ``(tokens, tags)`` tuples, one per sentence. ``tokens`` and
        ``tags`` are lists of strings of equal length.
    """
    # NOTE(review): sentences are split on blank/comment lines only. If the
    # file marks sentences some other way (e.g. a sentence-id column), every
    # token ends up in one sentence -- confirm the column layout.
    sentences, tokens, tags = [], [], []
    # Only the first content line can be a header.
    header_pending = skip_header is None or bool(skip_header)
    with path.open(encoding="utf8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line.strip() or line.startswith(("#", "-DOCSTART-")):
                if tokens:
                    sentences.append((tokens, tags))
                    tokens, tags = [], []
                continue
            parts = line.split("\t")
            if header_pending:
                header_pending = False
                if skip_header or not _looks_like_ner_tag(parts[-1]):
                    continue
            tokens.append(parts[0])
            tags.append(parts[-1])
    # Flush the last sentence when the file does not end with a blank line.
    if tokens:
        sentences.append((tokens, tags))
    return sentences


def load_st_tsv(path: Path):
    """Return DataFrame with columns: sentence, sentiment, topic.

    The file is tab-separated. If its first line is a column-header row (its
    last two fields are ``sentiment`` and ``topic``, ignoring case), that row
    is consumed as the header instead of being returned as a data row, so it
    no longer shows up as a spurious ``sentiment``/``topic`` label. Files
    without such a header row are read exactly as before.

    Args:
        path: Location of the TSV file.

    Returns:
        A ``pandas.DataFrame`` whose columns are named ``sentence``,
        ``sentiment`` and ``topic``.
    """
    columns = ["sentence", "sentiment", "topic"]
    # Peek at the first line only; "utf-8-sig" tolerates a leading BOM.
    with Path(path).open(encoding="utf-8-sig") as f:
        first_fields = f.readline().rstrip("\r\n").split("\t")
    trailing = [field.strip().strip('"').lower() for field in first_fields[-2:]]
    has_header = trailing == columns[-2:]
    # header=0 discards the file's own header row in favour of `names`;
    # header=None is what read_csv already does when only `names` is given.
    return pd.read_csv(
        path,
        sep="\t",
        names=columns,
        header=0 if has_header else None,
    )

# Read both test sets from disk.
ner_sentences = load_ner_tsv(NER_FILE)
st_df = load_st_tsv(ST_FILE)

# Summary statistics for each test set, keyed by task name.
stats_dict = {}

# NER: sentence/token totals plus the tag inventory and per-tag frequencies.
stats_dict["NER"] = {
    "num_sentences": len(ner_sentences),
    "num_tokens": sum(len(sentence[0]) for sentence in ner_sentences),
    "unique_labels": sorted(
        set(label for sentence in ner_sentences for label in sentence[1])
    ),
    "label_counts": Counter(
        label for sentence in ner_sentences for label in sentence[1]
    ),
}

# Sentiment/topic: row count and how often each label value occurs.
stats_dict["SentimentTopic"] = {
    "num_sentences": len(st_df),
    "sentiment_labels": st_df["sentiment"].value_counts().to_dict(),
    "topic_labels": st_df["topic"].value_counts().to_dict(),
}

# Pretty-print; ensure_ascii=False keeps non-ASCII characters readable.
print(json.dumps(stats_dict, indent=2, ensure_ascii=False))


Found test files in c:\Users\Embrik\Project Text Mining\TextMiningPoster\notebooks\..\data\test
{
  "NER": {
    "num_sentences": 1,
    "num_tokens": 217,
    "unique_labels": [
      "B-LOCATION",
      "B-ORG",
      "B-PERSON",
      "B-WORK_OF_ART",
      "BIO_NER_tag",
      "I-LOCATION",
      "I-ORG",
      "I-PERSON",
      "I-WORK_OF_ART",
      "O"
    ],
    "label_counts": {
      "BIO_NER_tag": 1,
      "O": 159,
      "B-LOCATION": 3,
      "B-ORG": 8,
      "B-WORK_OF_ART": 6,
      "I-WORK_OF_ART": 8,
      "B-PERSON": 12,
      "I-PERSON": 13,
      "I-ORG": 5,
      "I-LOCATION": 2
    }
  },
  "SentimentTopic": {
    "num_sentences": 19,
    "sentiment_labels": {
      "positive": 6,
      "neutral": 6,
      "negative": 6,
      "sentiment": 1
    },
    "topic_labels": {
      "sports": 6,
      "book": 6,
      "movie": 6,
      "topic": 1
    }
  }
}
