In [None]:
import time
import requests
import pandas as pd
from pathlib import Path

# --- Semantic Scholar harvest configuration ---
# Venue string interpolated into the search query, and the inclusive range of
# publication years that the harvest loop iterates over.
JOURNAL_NAME = "Transactions of the Association for Computational Linguistics"
YEAR_FROM = 2020
YEAR_TO = 2025

# Output directory is created up front so the final CSV write cannot fail on a
# missing folder.
OUT_DIR = Path("data")
OUT_DIR.mkdir(parents=True, exist_ok=True)
# Build the file name from the year constants so it cannot drift out of sync
# when the range above is edited.
OUT_CSV = OUT_DIR / f"tacl_{YEAR_FROM}_{YEAR_TO}.csv"

# API root, the paper fields requested on every call, and the page size.
BASE = "https://api.semanticscholar.org/graph/v1"
FIELDS = "paperId,title,abstract,year,venue,publicationDate,authors,externalIds,url"
LIMIT = 100

def ss_get(url, params, timeout=60, max_retries=10):
    """GET ``url`` and return the decoded JSON body, retrying while throttled.

    Args:
        url: Endpoint to request.
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Per-request timeout in seconds.
        max_retries: How many times a 429/503 response is retried before the
            error is raised instead of waiting again.

    Returns:
        The parsed JSON payload of the first HTTP 200 response.

    Raises:
        requests.HTTPError: For any 4xx/5xx status other than a 429/503 that
            is still within the retry budget.
        RuntimeError: For a non-200 status that ``raise_for_status`` does not
            treat as an error (previously this re-requested forever).
    """
    attempts = 0
    while True:
        r = requests.get(url, params=params, timeout=timeout)
        if r.status_code == 200:
            return r.json()
        if r.status_code in (429, 503) and attempts < max_retries:
            attempts += 1
            # Honour a numeric Retry-After header; otherwise (missing header or
            # a non-numeric value) fall back to a fixed 10 second pause.
            ra = r.headers.get("Retry-After")
            wait = int(ra) if ra and ra.isdigit() else 10
            time.sleep(wait)
            continue
        # Raises for 4xx/5xx, including a 429/503 whose retries are exhausted.
        r.raise_for_status()
        # Reaching here means a non-200 status that is not an HTTP error;
        # fail loudly rather than looping without any delay.
        raise RuntimeError(f"Unexpected HTTP status {r.status_code} for {url}")

rows = []
search_url = f"{BASE}/paper/search"

# Harvest one publication year at a time, paging through the search results.
for y in range(YEAR_FROM, YEAR_TO + 1):
    # NOTE(review): this assumes the search endpoint interprets `venue:` and
    # `year:` inside the free-text query as filters -- confirm against the API
    # documentation. Venue is not re-checked on the client side below.
    query = f'venue:"{JOURNAL_NAME}" year:{y}'
    offset = 0
    collected = 0
    has_more = True

    while has_more:
        data = ss_get(
            search_url,
            params={"query": query, "limit": LIMIT, "offset": offset, "fields": FIELDS},
        )

        batch = data.get("data", [])
        if not batch:
            break

        for paper in batch:
            # Keep only papers whose reported year matches the requested one.
            if paper.get("year") != y:
                continue

            title = (paper.get("title") or "").strip()
            abstract = (paper.get("abstract") or "").strip()

            # Both a title and an abstract are required for a row to be kept.
            if not (title and abstract):
                continue

            author_names = (a.get("name", "") for a in (paper.get("authors") or []))
            rows.append({
                "paperId": paper.get("paperId"),
                "year": y,
                "venue": paper.get("venue"),
                "title": title,
                "abstract": abstract,
                "text": f"{title} {abstract}".strip(),
                "authors": "; ".join(author_names),
                "url": paper.get("url"),
            })
            collected += 1

        # Advance to the next page; stop once the reported total is covered.
        offset += LIMIT
        has_more = offset < data.get("total", 0)

    print(f"{y}: collected={collected}")

# The same paper can be returned more than once, so de-duplicate on paperId
# before writing the CSV.
deduped = pd.DataFrame(rows).drop_duplicates(subset=["paperId"])
df = deduped.reset_index(drop=True)
df.to_csv(OUT_CSV, index=False)

print("Saved to:", OUT_CSV)
print("Shape:", df.shape)

In [None]:
import requests
import pandas as pd

# ISSN used to look up the journal's source record(s) in OpenAlex.
ISSN = "2307-387X"

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of letting an error payload
# be parsed into an empty table.
src_response = requests.get(
    "https://api.openalex.org/sources",
    params={"filter": f"issn:{ISSN}", "per-page": 200},
    timeout=60,
)
src_response.raise_for_status()
src = src_response.json()

# One row per matching source, so the right OpenAlex id can be picked by eye.
sources = src.get("results", [])
df_sources = pd.DataFrame([{
    "id": s.get("id"),
    "display_name": s.get("display_name"),
    "type": s.get("type"),
    "issn": ",".join(s.get("issn", []) or []),
    "host_organization": s.get("host_organization")
} for s in sources])

df_sources

In [None]:
def reconstruct_abstract(inv):
    """Rebuild plain text from an inverted index mapping word -> positions.

    Returns an empty string when ``inv`` is not a non-empty dict or holds no
    positions. Otherwise the words are joined with single spaces in ascending
    position order; positions with no word are skipped.
    """
    if not (isinstance(inv, dict) and inv):
        return ""

    # Flatten {word: [positions]} into {position: word}; when a position
    # repeats, the word seen last wins.
    slots = {pos: token for token, where in inv.items() for pos in where}
    if not slots:
        return ""

    last = max(slots)
    return " ".join(slots[i] for i in range(last + 1) if i in slots)


In [None]:
import requests
import pandas as pd
from pathlib import Path

# OpenAlex source to harvest and the inclusive publication-year window.
SOURCE_ID = "https://openalex.org/S2729999759"
YEAR_FROM, YEAR_TO = 2020, 2025

# Ensure the output folder exists, then name the CSV after the year window.
OUT_DIR = Path("data")
OUT_DIR.mkdir(exist_ok=True, parents=True)
OUT_CSV = OUT_DIR.joinpath(f"tacl_{YEAR_FROM}_{YEAR_TO}_openalex.csv")

def reconstruct_abstract(inv):
    """Rebuild plain text from an inverted index mapping word -> positions.

    Returns an empty string when ``inv`` is not a non-empty dict or holds no
    positions. Otherwise the words are joined with single spaces in ascending
    position order; positions with no word are skipped.
    """
    if not (isinstance(inv, dict) and inv):
        return ""

    # Flatten {word: [positions]} into {position: word}; when a position
    # repeats, the word seen last wins.
    slots = {pos: token for token, where in inv.items() for pos in where}
    if not slots:
        return ""

    last = max(slots)
    return " ".join(slots[i] for i in range(last + 1) if i in slots)

base = "https://api.openalex.org/works"
cursor = "*"  # initial cursor value for the first page
rows = []

# Restrict to works whose primary location is the chosen source and whose
# publication date falls inside the configured year window.
filt = ",".join([
    f"primary_location.source.id:{SOURCE_ID}",
    f"from_publication_date:{YEAR_FROM}-01-01",
    f"to_publication_date:{YEAR_TO}-12-31"
])

while True:
    # Fail loudly on HTTP errors: an error payload has neither "results" nor
    # "meta", so without this check the loop would end quietly and a truncated
    # CSV would be written. The timeout guards against a stalled connection.
    page = requests.get(
        base,
        params={
            "filter": filt,
            "per-page": 200,
            "cursor": cursor
        },
        timeout=60,
    )
    page.raise_for_status()
    resp = page.json()

    for w in resp.get("results", []):
        title = (w.get("display_name") or "").strip()
        year = w.get("publication_year")
        abstract = reconstruct_abstract(w.get("abstract_inverted_index")).strip()

        # Keep only works that have a title, an abstract and a year.
        if not title or not abstract or year is None:
            continue

        venue = (((w.get("primary_location") or {}).get("source") or {}).get("display_name") or "").strip()
        pub_date = (w.get("publication_date") or "").strip()

        rows.append({
            "openalex_id": w.get("id"),
            "year": int(year),
            "publication_date": pub_date,
            "venue": venue,
            "title": title,
            "abstract": abstract,
            "text": f"{title} {abstract}".strip(),
            "cited_by_count": w.get("cited_by_count"),
        })

    # Follow the cursor until the API stops returning one.
    cursor = resp.get("meta", {}).get("next_cursor")
    if not cursor:
        break

df = pd.DataFrame(rows).drop_duplicates(subset=["openalex_id"]).reset_index(drop=True)
df.to_csv(OUT_CSV, index=False)

print("Saved to:", OUT_CSV)
print("Shape:", df.shape)
print(df["venue"].value_counts().head(10))
print(df["year"].value_counts().sort_index())

In [None]:
# Ten most frequent venue labels in the harvested frame.
df.loc[:, "venue"].value_counts().iloc[:10]

In [None]:
import requests
import pandas as pd

ISSN = "0004-3702"  # Artificial Intelligence journal

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of letting an error payload
# be rendered as an empty table.
sources_response = requests.get(
    "https://api.openalex.org/sources",
    params={"filter": f"issn:{ISSN}"},
    timeout=60,
)
sources_response.raise_for_status()
resp = sources_response.json()

# `or []` guards against a record whose "issn" is present but null, which
# would otherwise make str.join raise a TypeError.
pd.DataFrame([{
    "id": s.get("id"),
    "display_name": s.get("display_name"),
    "issn": ",".join(s.get("issn", []) or []),
    "publisher": s.get("host_organization_name")
} for s in resp.get("results", [])])

In [None]:
import requests
import pandas as pd
from pathlib import Path

# OpenAlex source to harvest and the inclusive publication-year window.
SOURCE_ID = "https://openalex.org/S196139623"
YEAR_FROM, YEAR_TO = 2015, 2025

# Ensure the output folder exists, then name the CSV after the year window.
OUT_DIR = Path("data")
OUT_DIR.mkdir(exist_ok=True, parents=True)
OUT_CSV = OUT_DIR.joinpath(f"artificial_intelligence_{YEAR_FROM}_{YEAR_TO}.csv")

def reconstruct_abstract(inv):
    """Rebuild plain text from an inverted index mapping word -> positions.

    Returns an empty string when ``inv`` is not a non-empty dict or holds no
    positions. Otherwise the words are joined with single spaces in ascending
    position order; positions with no word are skipped.
    """
    if not (isinstance(inv, dict) and inv):
        return ""

    # Flatten {word: [positions]} into {position: word}; when a position
    # repeats, the word seen last wins.
    slots = {pos: token for token, where in inv.items() for pos in where}
    if not slots:
        return ""

    last = max(slots)
    return " ".join(slots[i] for i in range(last + 1) if i in slots)

base = "https://api.openalex.org/works"
cursor = "*"  # initial cursor value for the first page
rows = []

# Restrict to works whose primary location is the chosen source and whose
# publication date falls inside the configured year window.
filt = ",".join([
    f"primary_location.source.id:{SOURCE_ID}",
    f"from_publication_date:{YEAR_FROM}-01-01",
    f"to_publication_date:{YEAR_TO}-12-31"
])

while True:
    # Fail loudly on HTTP errors: an error payload has neither "results" nor
    # "meta", so without this check the loop would end quietly and a truncated
    # CSV would be written. The timeout guards against a stalled connection.
    page = requests.get(
        base,
        params={
            "filter": filt,
            "per-page": 200,
            "cursor": cursor
        },
        timeout=60,
    )
    page.raise_for_status()
    resp = page.json()

    for w in resp.get("results", []):
        title = (w.get("display_name") or "").strip()
        year = w.get("publication_year")
        # Stripped for consistency with how the title is handled.
        abstract = reconstruct_abstract(w.get("abstract_inverted_index")).strip()

        # Skip a work only when it has neither title nor abstract, or has no
        # year; works with just one of the two text fields are kept. The
        # parentheses spell out the precedence the bare and/or already had.
        if (not title and not abstract) or year is None:
            continue

        rows.append({
            "openalex_id": w.get("id"),
            "year": int(year),
            "title": title,
            "abstract": abstract,
            "text": f"{title} {abstract}".strip(),
            "cited_by_count": w.get("cited_by_count"),
        })

    # Follow the cursor until the API stops returning one.
    cursor = resp.get("meta", {}).get("next_cursor")
    if not cursor:
        break

df = pd.DataFrame(rows).drop_duplicates(subset=["openalex_id"]).reset_index(drop=True)
df.to_csv(OUT_CSV, index=False)

print("Saved to:", OUT_CSV)
print("Shape:", df.shape)
print(df["year"].value_counts().sort_index())

In [None]:
# Scope statement of the Artificial Intelligence journal (AIJ), stored as one
# multi-line string. Later cells vectorize/embed this text as-is to score how
# closely each article matches it, so the wording must stay verbatim.
scope_text = """
The journal of Artificial Intelligence (AIJ) welcomes papers on broad aspects of AI that constitute advances in the overall field including, but not limited to, cognition and AI, automated reasoning and inference, case-based reasoning, commonsense reasoning, computer vision, constraint processing, ethical AI, heuristic search, human interfaces, intelligent robotics, knowledge representation, machine learning, multi-agent systems, natural language processing, planning and action, and reasoning under uncertainty. The journal reports results achieved in addition to proposals for new ways of looking at AI problems, both of which must include demonstrations of value and effectiveness.

Papers describing applications of AI are also welcome, but the focus should be on how new and novel AI methods advance performance in application areas, rather than a presentation of yet another application of conventional AI methods. Papers on applications should describe a principled solution, emphasize its novelty, and present an indepth evaluation of the AI techniques being exploited.

Apart from regular papers, the journal also accepts Research Notes, Research Field Reviews, Position Papers, and Book Reviews (see details below). The journal will also consider summary papers that describe challenges and competitions from various areas of AI. Such papers should motivate and describe the competition design as well as report and interpret competition results, with an emphasis on insights that are of value beyond the competition (series) itself.
"""

In [None]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

CSV_PATH = "data/artificial_intelligence_2015_2025.csv"

df = pd.read_csv(CSV_PATH)

# Replace missing titles/abstracts with empty strings and force str dtype.
for text_col in ("title", "abstract"):
    df[text_col] = df[text_col].fillna("").astype(str)

# Rebuild the combined text field, then blank out URL-like tokens and collapse
# runs of whitespace into single spaces.
combined = (df["title"] + " " + df["abstract"]).str.strip()
without_urls = combined.str.replace(r"http\S+", " ", regex=True)
df["text"] = without_urls.str.replace(r"\s+", " ", regex=True).str.strip()

# Unigram + bigram TF-IDF over the articles; terms found in fewer than five
# documents are dropped.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    stop_words="english",
    lowercase=True,
)

# The vocabulary is fitted on the articles only; the scope text is projected
# into that same vocabulary.
X_articles = vectorizer.fit_transform(df["text"])
X_scope = vectorizer.transform([scope_text])

In [None]:
import numpy as np

# Dot product of every article's TF-IDF row with the scope vector, flattened
# to one score per article.
alignment_scores = X_articles.dot(X_scope.T).toarray().ravel()
df["alignment_score"] = alignment_scores

# Summary statistics of the lexical scores.
df["alignment_score"].describe()

In [None]:
# Mean lexical alignment score per publication year.
year_alignment = df["alignment_score"].groupby(df["year"]).mean()

year_alignment

In [None]:
import matplotlib.pyplot as plt

# Line chart of the yearly mean lexical alignment.
fig, ax = plt.subplots()
ax.plot(year_alignment.index, year_alignment.values)
ax.set_xlabel("Year")
ax.set_ylabel("Mean Alignment Score")
ax.set_title("Thematic Alignment to Journal Scope Over Time")
plt.show()

In [None]:
# Ten articles with the lowest lexical alignment score.
(
    df.sort_values(by="alignment_score", ascending=True)
    .loc[:, ["year", "title", "alignment_score"]]
    .iloc[:10]
)

In [None]:
# Ten articles with the highest lexical alignment score.
(
    df.sort_values(by="alignment_score", ascending=False)
    .loc[:, ["year", "title", "alignment_score"]]
    .iloc[:10]
)

In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sentence-embedding model used for both the articles and the scope text.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Options shared by both encode calls: NumPy output with normalized embeddings.
encode_opts = {"convert_to_numpy": True, "normalize_embeddings": True}

# One embedding per article text (progress bar shown), plus a single embedding
# for the scope statement.
article_embeddings = model.encode(
    df["text"].tolist(),
    show_progress_bar=True,
    **encode_opts,
)
scope_embedding = model.encode([scope_text], **encode_opts)

# Cosine similarity of every article against the scope, one score per article.
semantic_scores = cosine_similarity(article_embeddings, scope_embedding).ravel()
df["semantic_alignment"] = semantic_scores

df["semantic_alignment"].describe()

In [None]:
# Mean semantic alignment score per publication year.
year_semantic = df["semantic_alignment"].groupby(df["year"]).mean()

year_semantic

In [None]:
import matplotlib.pyplot as plt

# Overlay the yearly means of both alignment measures on one set of axes.
fig, ax = plt.subplots()
ax.plot(year_alignment.index, year_alignment.values, label="Lexical Alignment")
ax.plot(year_semantic.index, year_semantic.values, label="Semantic Alignment")
ax.set_xlabel("Year")
ax.set_ylabel("Mean Alignment Score")
ax.legend()
ax.set_title("Lexical vs Semantic Alignment Over Time")
plt.show()

In [None]:
# Ten articles with the lowest semantic alignment score.
(
    df.sort_values(by="semantic_alignment", ascending=True)
    .loc[:, ["year", "title", "semantic_alignment"]]
    .iloc[:10]
)

In [None]:
# 30-bin histogram of the semantic alignment scores, drawn on the current axes.
ax = plt.gca()
ax.hist(df["semantic_alignment"], bins=30)
ax.set_title("Distribution of Semantic Alignment Scores")
plt.show()

In [None]:
# Ten articles with the highest semantic alignment score.
(
    df.sort_values(by="semantic_alignment", ascending=False)
    .loc[:, ["year", "title", "semantic_alignment"]]
    .iloc[:10]
)

In [None]:
# Spread (standard deviation) of semantic alignment within each year.
df["semantic_alignment"].groupby(df["year"]).std()

In [None]:
# Label each article by era: years up to and including 2019 versus later.
df["era"] = df["year"].apply(lambda y: "2015-2019" if y <= 2019 else "2020-2025")

# Show mean and std side by side. Only a cell's last expression is displayed,
# so computing them as two separate statements silently discarded the mean.
df.groupby("era")["semantic_alignment"].agg(["mean", "std"])

In [None]:
# Group once, then print both statistics per era.
era_scores = df.groupby("era")["semantic_alignment"]

print("Mean:")
print(era_scores.mean())

print("\nStd:")
print(era_scores.std())

In [None]:
from scipy.stats import ttest_ind

# Semantic alignment scores for each era.
in_early_era = df["era"] == "2015-2019"
in_late_era = df["era"] == "2020-2025"
pre = df.loc[in_early_era, "semantic_alignment"]
post = df.loc[in_late_era, "semantic_alignment"]

# Two-sample t-test that does not assume equal variances (equal_var=False).
ttest_ind(pre, post, equal_var=False)