In [20]:
# import pandas as pd
# !pip install requests pandas

In [22]:
# Journals that make up the "CSS" stratum, identified by their online ISSN (e-ISSN).
CSS_ISSNS = [
    "2432-2725",  # Journal of Computational Social Science
    "2053-9517",  # Big Data & Society
    "2193-1127",  # EPJ Data Science
    "2197-4314",  # Computational Social Networks
]

# Journals that make up the "SS" stratum, identified by their online ISSN (e-ISSN).
SS_ISSNS = [
    "1537-5390",  # American Journal of Sociology
    "1467-954X",  # The Sociological Review
    "1573-7853",  # Theory & Society
    "2003-1998",  # Journal of Digital Social Resources
                  # NOTE(review): title may actually be "Journal of Digital Social Research" - confirm
]

# Publication years covered by the sampling frame: 2020 through 2025 inclusive.
YEARS = [*range(2020, 2026)]

# Number of articles drawn from each stratum.
N_PER_STRATUM = 20

# Base seed for the random draws; the SS stratum uses this value plus one.
RANDOM_SEED_BASE = 20251018

# Contact address sent as the `mailto` query parameter on Crossref requests.
CROSSREF_MAILTO = "shujashakir@gmail.com"

# When true, rows without a Crossref abstract are looked up on Semantic Scholar.
USE_SEMANTIC_SCHOLAR = True

# Seconds to sleep between Crossref requests.
PAUSE = 0.2

# Directory that receives the generated CSV files.
OUT_DIR = "out_sampling"


import random
import time
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import quote, urlencode

import pandas as pd
import requests

# Crossref works endpoint; crossref_query() appends the encoded query string to it.
CR_BASE = "https://api.crossref.org/works"
# Semantic Scholar paper-lookup prefix; ss_abstract() appends the DOI right after "DOI:".
SS_BASE = "https://api.semanticscholar.org/graph/v1/paper/DOI:"

@dataclass
class Row:
    """One candidate article in a sampling frame.

    Instances are built positionally elsewhere in this file, so the field
    order below is part of the interface.
    """
    doi: str       # DOI as returned by Crossref, whitespace-stripped by the frame builders
    title: str     # output of normalize_title(); "" when Crossref gave no title
    journal: str   # output of normalize_journal(); "" when Crossref gave no container title
    year: int      # output of issued_year(); may be None when the issued date cannot be parsed
    abstract: str  # "" when no abstract is available; clean_frame() drops such rows
    stratum: str  # "CSS" or "SS"
    url: str       # the Crossref item's "URL" field, whitespace-stripped; "" when absent

def crossref_query(issn: str, year: int, rows: int = 1000):
    """Fetch journal-article records for one ISSN and one publication year from Crossref.

    Args:
        issn: ISSN used in the Crossref `issn` filter.
        year: Calendar year; the filter spans Jan 1 through Dec 31 of that year.
        rows: Value sent as the `rows` query parameter.

    Returns:
        The `message.items` list of the JSON response, or an empty list when
        either key is absent.

    Raises:
        Whatever `requests.get` or `raise_for_status()` raises on a failed request.

    Only one request is made; no paging is performed here.
    """
    filters = ",".join([
        "type:journal-article",
        f"from-pub-date:{year}-01-01",
        f"until-pub-date:{year}-12-31",
        f"issn:{issn}",
    ])
    fields = ",".join(
        ["DOI", "title", "container-title", "issued", "type", "URL", "abstract"]
    )
    query = urlencode({
        "filter": filters,
        "rows": rows,
        "mailto": CROSSREF_MAILTO,
        "select": fields,
    })
    response = requests.get(CR_BASE + "?" + query, timeout=30)
    response.raise_for_status()
    message = response.json().get("message", {})
    return message.get("items", [])

def ss_abstract(doi: str) -> str:
    """Best-effort lookup of an article abstract on Semantic Scholar.

    Args:
        doi: DOI of the article to look up.

    Returns:
        The abstract text, or "" when the request fails, the response status
        is not 200, the body is not a JSON object, or no abstract is present.
        Request and JSON-decoding failures are never propagated.
    """
    # DOIs may contain characters such as '#', '?' or spaces that would be
    # read as URL syntax if pasted in verbatim; escape everything except '/'.
    safe_doi = quote(str(doi), safe="/")
    url = f"{SS_BASE}{safe_doi}?fields=title,abstract,venue,year"
    try:
        r = requests.get(url, timeout=20)
        if r.status_code != 200:
            return ""
        payload = r.json()
    except (requests.RequestException, ValueError):
        # Network errors and undecodable bodies are expected for a best-effort
        # lookup; the caller simply keeps the row without an abstract.
        return ""
    if not isinstance(payload, dict):
        return ""
    return payload.get("abstract") or ""

def normalize_title(t):
    """Reduce a title value to a single entry.

    A non-empty list yields its first element; any other truthy value is
    returned unchanged; every falsy value (None, "", []) yields "".
    """
    if not t:
        return ""
    return t[0] if isinstance(t, list) else t

def normalize_journal(j):
    """Reduce a journal-name value to a single entry.

    A non-empty list yields its first element; any other truthy value is
    returned unchanged; every falsy value (None, "", []) yields "".
    """
    if not j:
        return ""
    return j[0] if isinstance(j, list) else j

def issued_year(issued):
    """Extract the year from an `issued` mapping shaped like {"date-parts": [[year, ...]]}.

    Returns the first element of the first date part converted with int(),
    or None when that element is missing or cannot be converted.
    """
    try:
        first_part = issued["date-parts"][0]
        year = int(first_part[0])
    except Exception:
        year = None
    return year

def build_frame(issns, label):
    """Collect one Row per unique DOI for every ISSN/year combination.

    Args:
        issns: Iterable of ISSN strings to query.
        label: Stratum label stored on every Row.

    Returns:
        List of Row objects in fetch order. Items whose type is not
        "journal-article", that lack a DOI, or whose DOI was already seen in
        this call are skipped.

    A failed Crossref request does not abort the run, but it is reported on
    stdout: a silently missing ISSN/year would shrink the sampling frame
    without anyone noticing.
    """
    frame, seen = [], set()
    for issn in issns:
        for y in YEARS:
            try:
                items = crossref_query(issn, y)
            except Exception as e:
                print(f"[{label}] ISSN {issn} year {y}: ERROR -> {e}")
                items = []
            # Pause after every attempt, including failed ones, so that a run
            # of errors does not turn into back-to-back requests.
            time.sleep(PAUSE)
            for it in items:
                if it.get("type") != "journal-article":
                    continue
                doi = (it.get("DOI") or "").strip()
                if not doi or doi in seen:
                    continue
                seen.add(doi)
                title    = normalize_title(it.get("title"))
                journal  = normalize_journal(it.get("container-title"))
                year     = issued_year(it.get("issued"))
                abstract = (it.get("abstract") or "").strip()
                url      = (it.get("URL") or "").strip()
                frame.append(Row(doi, title, journal, year, abstract, label, url))
    return frame

def fill_missing_abstracts(rows):
    """Back-fill empty abstracts through ss_abstract().

    When USE_SEMANTIC_SCHOLAR is false the input is returned untouched.
    Otherwise a new list is returned in which every row with an empty
    abstract has been looked up; rows for which the lookup found text are
    replaced by a fresh Row carrying it. Each lookup is followed by a
    0.15 second pause, whether or not it found anything.
    """
    if not USE_SEMANTIC_SCHOLAR:
        return rows

    filled = []
    for row in rows:
        if row.abstract:
            filled.append(row)
            continue
        fetched = ss_abstract(row.doi)
        if fetched:
            row = Row(row.doi, row.title, row.journal, row.year, fetched, row.stratum, row.url)
        time.sleep(0.15)
        filled.append(row)
    return filled

def clean_frame(rows):
    """Keep only complete rows, one per DOI.

    A row survives when its doi, title, journal, year and abstract are all
    truthy and its DOI has not already been accepted. A DOI counts as
    accepted only once a row carrying it passed every check, so a complete
    row can still be kept after an incomplete row with the same DOI.
    """
    kept = []
    accepted = set()
    for row in rows:
        usable = (
            row.doi
            and row.doi not in accepted
            and row.title
            and row.journal
            and row.year
            and row.abstract
        )
        if usable:
            accepted.add(row.doi)
            kept.append(row)
    return kept

def deterministic_sample(frame, n, seed):
    """Draw `n` items from `frame` without replacement using a private RNG seeded with `seed`.

    Raises:
        ValueError: when `frame` holds fewer than `n` items.
    """
    available = len(frame)
    if available < n:
        raise ValueError(f"Stratum has {available} items; need {n}. Add ISSNs or widen years.")
    return random.Random(seed).sample(frame, n)

def write_csv(path: Path, rows, header=None):
    """Write `rows` to `path` as CSV, without the index column.

    Args:
        path: Destination file.
        rows: Either a list (converted with `pd.DataFrame`) or an object that
            already offers `to_csv`, such as a DataFrame.
        header: Optional list of column names to write in place of the
            frame's own column names. None (the default) writes the frame's
            column names unchanged. pandas rejects a list whose length does
            not match the number of columns.
    """
    df = pd.DataFrame(rows) if isinstance(rows, list) else rows
    # `header` used to be accepted and ignored; forward it so callers can
    # actually rename the output columns.
    df.to_csv(path, index=False, header=True if header is None else header)

In [23]:
from pathlib import Path

# Refuse to run while either stratum has no ISSNs configured.
if not (CSS_ISSNS and SS_ISSNS):
    raise SystemExit("Please fill CSS_ISSNS and SS_ISSNS with ONLINE (e-ISSN) values before running.")

# Make sure the output directory exists.
out_dir = Path(OUT_DIR)
out_dir.mkdir(parents=True, exist_ok=True)

# Sampling frames: fetch from Crossref, back-fill abstracts, then drop unusable rows.
css_frame = clean_frame(
    fill_missing_abstracts(
        build_frame(CSS_ISSNS, "CSS")
    )
)
ss_frame = clean_frame(
    fill_missing_abstracts(
        build_frame(SS_ISSNS, "SS")
    )
)

css_pop = len(css_frame)
ss_pop = len(ss_frame)
print(f"Frame sizes → CSS: {css_pop}, SS: {ss_pop}")

# One seeded draw per stratum; SS uses the base seed plus one.
css_sample = deterministic_sample(frame=css_frame, n=N_PER_STRATUM, seed=RANDOM_SEED_BASE)
ss_sample = deterministic_sample(frame=ss_frame, n=N_PER_STRATUM, seed=RANDOM_SEED_BASE + 1)

# candidates table
def row_to_dict(idx, r):
    """Turn row `r` into a record for the candidates table.

    The record holds `idx` under "id", the row's fields under their own
    names, and a "keep" flag preset to 1.
    """
    record = {"id": idx}
    for field in ("doi", "title", "journal", "year", "abstract", "stratum", "url"):
        record[field] = getattr(r, field)
    record["keep"] = 1  # after human review, set to 0 to drop
    return record

# Number the candidates consecutively from 1: the CSS sample first, then the SS sample.
cand = []
i = 1
for sample in (css_sample, ss_sample):
    for r in sample:
        cand.append(row_to_dict(i, r))
        i += 1

pilot_candidates_path = out_dir / "pilot_candidates.csv"
write_csv(path=pilot_candidates_path, rows=cand)
print(
    f"\nWrote: {pilot_candidates_path}\n"
    f"Open it, skim quickly, and set keep=0 for non-research/irrelevant entries.\n"
)


Frame sizes → CSS: 1173, SS: 761

Wrote: out_sampling\pilot_candidates.csv
Open it, skim quickly, and set keep=0 for non-research/irrelevant entries.



In [24]:
# FULL 20+20 papers

from pathlib import Path
import time, random, pandas as pd

# Skip the Semantic Scholar back-fill in this run for speed; missing abstracts
# can be filled later if needed.
USE_SEMANTIC_SCHOLAR = False
N_PER_STRATUM = 20
YEARS = [*range(2020, 2026)]  # 2020 through 2025

# Per-ISSN year override: Computational Social Networks (CSN, e-ISSN 2197-4314)
# is queried for these years only.
YEARS_BY_ISSN = {"2197-4314": [2021, 2020]}

def years_for_issn(issn: str):
    """Return the years to query for `issn`: its YEARS_BY_ISSN entry if one exists, else YEARS."""
    if issn in YEARS_BY_ISSN:
        return YEARS_BY_ISSN[issn]
    return YEARS

def build_frame_with_overrides_verbose(issns, label, per_year_limit=120):
    """Fetch Crossref items per ISSN/year (with optional per-year cap), then clean them.

    Args:
        issns: Iterable of ISSN strings; the years queried for each come from
            years_for_issn().
        label: Stratum label stored on every Row and used in progress output.
        per_year_limit: Passed to crossref_query() as `rows`, i.e. the most
            items requested for a single ISSN/year.

    Returns:
        The frame after fill_missing_abstracts() and clean_frame().

    Progress, request errors and frame sizes are printed. A failed request is
    reported and treated as an empty result. When a year returns as many items
    as the cap allows, a warning is printed because further items for that
    year may exist and are not part of the frame.
    """
    frame, seen = [], set()
    for issn in issns:
        print(f"[{label}] ISSN {issn}")
        for y in years_for_issn(issn):
            try:
                items = crossref_query(issn, y, rows=per_year_limit)
                print(f"  year {y}: fetched {len(items)} items (capped={per_year_limit})")
                if per_year_limit and len(items) >= per_year_limit:
                    # A full page means the cap, not the journal, decided how
                    # many items were seen for this year.
                    print(f"  year {y}: WARNING -> cap reached; the frame may be truncated for this year")
            except Exception as e:
                print(f"  year {y}: ERROR -> {e}")
                items = []
            time.sleep(PAUSE)
            for it in items:
                if it.get("type") != "journal-article":
                    continue
                doi = (it.get("DOI") or "").strip()
                if not doi or doi in seen:
                    continue
                seen.add(doi)
                frame.append(Row(
                    doi=doi,
                    title=normalize_title(it.get("title")),
                    journal=normalize_journal(it.get("container-title")),
                    year=issued_year(it.get("issued")),
                    abstract=(it.get("abstract") or "").strip(),
                    stratum=label,
                    url=(it.get("URL") or "").strip()
                ))
    print(f"[{label}] raw frame size: {len(frame)}")

    # fill_missing_abstracts() already returns its input untouched when
    # USE_SEMANTIC_SCHOLAR is false, so no extra check is needed here.
    rows = fill_missing_abstracts(frame)
    cleaned = clean_frame(rows)
    print(f"[{label}] cleaned frame size: {len(cleaned)}")
    return cleaned

# Make sure the output directory exists.
out_dir = Path(OUT_DIR)
out_dir.mkdir(parents=True, exist_ok=True)

# Build one cleaned frame per stratum, requesting at most 120 items per ISSN/year.
css_frame = build_frame_with_overrides_verbose(issns=CSS_ISSNS, label="CSS", per_year_limit=120)
ss_frame = build_frame_with_overrides_verbose(issns=SS_ISSNS, label="SS", per_year_limit=120)

css_pop = len(css_frame)
ss_pop = len(ss_frame)
print(f"\nFrame sizes → CSS: {css_pop}, SS: {ss_pop}")

# Deterministic stratified samples (raise helpful error if too small)
def must_sample(frame, n, seed):
    """Draw `n` items from `frame` without replacement using a private RNG seeded with `seed`.

    Raises:
        ValueError: when `frame` holds fewer than `n` items.
    """
    if len(frame) >= n:
        return random.Random(seed).sample(frame, n)
    raise ValueError(
        f"Not enough items to sample {n}.\n"
        f"Have {len(frame)} in this stratum. Consider adding another ISSN or widening years."
    )

# One seeded draw per stratum; SS uses the base seed plus one.
css_sample = must_sample(frame=css_frame, n=N_PER_STRATUM, seed=RANDOM_SEED_BASE)
ss_sample = must_sample(frame=ss_frame, n=N_PER_STRATUM, seed=RANDOM_SEED_BASE + 1)

def row_to_dict(idx, r):
    """Turn row `r` into a record for the candidates table.

    The record holds `idx` under "id", the row's fields under their own
    names, and a "keep" flag preset to 1.
    """
    copied_fields = ("doi", "title", "journal", "year", "abstract", "stratum", "url")
    return {
        "id": idx,
        **{name: getattr(r, name) for name in copied_fields},
        "keep": 1,  # set to 0 during manual verification
    }

# Number the candidates consecutively from 1: the CSS sample first, then the SS sample.
cand = []
i = 1
for sample in (css_sample, ss_sample):
    for r in sample:
        cand.append(row_to_dict(i, r))
        i += 1

pilot_candidates_path = out_dir / "pilot_candidates.csv"
candidates_df = pd.DataFrame(cand)
candidates_df.to_csv(pilot_candidates_path, index=False)
print(f"\nWROTE → {pilot_candidates_path}")

# Audit log: one row per run, appended to the existing log file when there is one.
log_path = out_dir / "sampling_log.csv"
log_entry = pd.DataFrame([{
    "phase": "discover_sample",
    "population_css": css_pop,
    "population_ss": ss_pop,
    "random_seed_base": RANDOM_SEED_BASE,
    "timestamp": int(time.time()),
}])
if not log_path.exists():
    log_df = log_entry
else:
    log_df = pd.concat([pd.read_csv(log_path), log_entry], ignore_index=True)
log_df.to_csv(log_path, index=False)
print(
    f"Audit log updated → {log_path}\n"
    f"Next: open pilot_candidates.csv, mark keep=0 where needed, then run FINALIZE cell."
)

[CSS] ISSN 2432-2725
  year 2020: fetched 36 items (capped=120)
  year 2021: fetched 57 items (capped=120)
  year 2022: fetched 41 items (capped=120)
  year 2023: fetched 38 items (capped=120)
  year 2024: fetched 114 items (capped=120)
  year 2025: fetched 79 items (capped=120)
[CSS] ISSN 2053-9517
  year 2020: fetched 86 items (capped=120)
  year 2021: fetched 105 items (capped=120)
  year 2022: fetched 93 items (capped=120)
  year 2023: fetched 120 items (capped=120)
  year 2024: fetched 108 items (capped=120)
  year 2025: fetched 100 items (capped=120)
[CSS] ISSN 2193-1127
  year 2020: fetched 36 items (capped=120)
  year 2021: fetched 57 items (capped=120)
  year 2022: fetched 62 items (capped=120)
  year 2023: fetched 63 items (capped=120)
  year 2024: fetched 77 items (capped=120)
  year 2025: fetched 75 items (capped=120)
[CSS] ISSN 2197-4314
  year 2021: fetched 22 items (capped=120)
  year 2020: fetched 6 items (capped=120)
[CSS] raw frame size: 1375
[CSS] cleaned frame size:

In [29]:
# FINALIZE VERIFIED SAMPLE

from pathlib import Path
import pandas as pd, time

# Input: the reviewed candidates table. Output: the final sample file.
out_dir = Path(OUT_DIR)
cand_path = out_dir / "pilot_candidates.csv"
final_path = out_dir / "pilot_40.csv"

print(f"Loading: {cand_path}")
df = pd.read_csv(cand_path)

# Rows whose keep flag is still 1 are the manually approved ones.
approved = df["keep"] == 1
df_keep = df.loc[approved].copy()
print(f"Rows kept: {len(df_keep)} of {len(df)}")

# Order by stratum, then journal, then year, and renumber the index.
df_keep = df_keep.sort_values(by=["stratum", "journal", "year"])
df_keep = df_keep.reset_index(drop=True)

# Persist the final dataset.
df_keep.to_csv(final_path, index=False)
print(f"WROTE → {final_path}")

# Audit log: append this run's counts, creating the log file if it is missing.
log_path = out_dir / "sampling_log.csv"
entry = pd.DataFrame([{
    "phase": "finalize",
    "timestamp": int(time.time()),
    "num_kept": len(df_keep),
    "num_discarded": len(df) - len(df_keep)
}])
if not log_path.exists():
    entry.to_csv(log_path, index=False)
else:
    old = pd.read_csv(log_path)
    pd.concat([old, entry], ignore_index=True).to_csv(log_path, index=False)
print(f"Audit log updated → {log_path}")


Loading: out_sampling\pilot_candidates.csv
Rows kept: 40 of 40
WROTE → out_sampling\pilot_40.csv
Audit log updated → out_sampling\sampling_log.csv
