In [None]:
# Extract API capabilities: derive up to three keyword tags per API from its name and description

from pathlib import Path
import json
import pandas as pd
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# --- Load JSONL ---
# One JSON object per line; blank or whitespace-only lines are skipped.
API_PATH = Path("../processed/api_catalog_sample10/api_repo.no_qos.jsonl")
# Pin the encoding: JSON text is UTF-8, and relying on the platform's locale
# default can mis-decode (or fail on) non-ASCII names and descriptions.
with API_PATH.open(encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

# Extra words to exclude from tags, on top of scikit-learn's English list
# (extend with further domain-specific terms as needed).
EXTRA_STOPWORDS = {
    "api",
    "get",
    "use",
    "data",
    "information",
    "your",
    "you",
    "service",
    "tool",
    "app",
    "application",
    "access",
    "provide",
    "provides",
    "return",
    "returns",
}
# Combined filter used by the keyword extraction below.
STOPWORDS = ENGLISH_STOP_WORDS | EXTRA_STOPWORDS

def extract_keywords(text, *, stopwords=None, min_length=3):
    """Split *text* into lowercase keyword tokens.

    The text is lowercased and cut into maximal runs of ASCII letters and
    digits; every other character (punctuation, whitespace, non-ASCII
    letters) acts as a separator. Tokens shorter than *min_length* or
    contained in *stopwords* are dropped. Order and duplicates are preserved.

    Args:
        text: Input string. ``None`` or ``""`` yields an empty list.
        stopwords: Container of lowercase words to exclude. When omitted,
            the module-level ``STOPWORDS`` is used.
        min_length: Minimum number of characters a token must have to be
            kept. The default of 3 drops one- and two-character tokens.

    Returns:
        The surviving tokens as a list, in order of appearance.
    """
    if not text:
        return []
    if stopwords is None:
        # Resolved at call time so the default follows the module-level set.
        stopwords = STOPWORDS
    # Equivalent to replacing every non-[a-z0-9] character with a space and
    # splitting on whitespace, but in a single pass.
    tokens = re.findall(r"[a-z0-9]+", text.lower())
    return [tok for tok in tokens if len(tok) >= min_length and tok not in stopwords]

# Combine name and description for tag extraction
# Each entry pairs an API's id with the keyword tokens found in its name and
# description. A record without an "api_id" key raises KeyError.
api_tags = []
for r in records:
    # `or ""` covers both a missing key and an explicit JSON null; a null
    # would otherwise be formatted into the text as the literal word "None".
    name = r.get("name") or ""
    description = r.get("description") or ""
    text = f"{name} {description}"
    tokens = extract_keywords(text)
    api_tags.append({"api_id": r["api_id"], "tags": tokens})

# Flatten and count tag frequency
# `flat_tags` collects every token of every API (duplicates included), so
# `tag_counter` holds occurrence counts across the whole record set.
flat_tags = []
for entry in api_tags:
    flat_tags.extend(entry["tags"])
tag_counter = Counter(flat_tags)

# Attach top 3 tags per API
# For each API, rank its distinct tags by overall frequency (highest first,
# alphabetical on ties), keep the first three, and fill unused slots with "".
output_rows = []
for entry in api_tags:
    ranked = sorted(set(entry["tags"]), key=lambda tag: (-tag_counter[tag], tag))
    first, second, third = (ranked + ["", "", ""])[:3]
    output_rows.append(
        {
            "api_id": entry["api_id"],
            "capability_1": first,
            "capability_2": second,
            "capability_3": third,
        }
    )

# Save
OUT_PATH = Path("../processed/api_capability_tags.csv")
# Name the columns explicitly so that an empty `output_rows` still yields a
# CSV with the expected header rather than a column-less file. For non-empty
# input the column order is the same as the dict key order used above.
df = pd.DataFrame(
    output_rows,
    columns=["api_id", "capability_1", "capability_2", "capability_3"],
)
# index=False keeps the row index out of the file.
df.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH.resolve())
# Trailing expression: previews the first rows when run as a notebook cell.
df.head()

Saved: /Users/ishwaryapns/Documents/Thesis/MAOF/data/processed/api_capability_tags.csv


Unnamed: 0,api_id,capability_1,capability_2,capability_3
0,1_cent_sms_sendsms,the,for,your
1,2factor_authentication_india_send_transactiona...,send,sms,messages
2,31events_send_native_calendar_invites_accountc...,accountcreate,,
3,31events_send_native_calendar_invites_eventsend,eventsend,,
4,31events_send_native_calendar_invites_eventcreate,eventcreate,,
