In [1]:
import json
from typing import Dict, Iterator, List
from pathlib import Path

import pandas as pd # type: ignore

In [2]:
# Directory holding the notebook's input files, resolved to an absolute path
# relative to the current working directory.
ASSETS = (Path("..") / "assets").resolve()

# One flat output row: column name -> text value.
Tabular = Dict[str, str]

# One parsed JSONL line, as consumed by `transform`.
Matches = Dict[
    str,  # document id
    Dict[
        str,  # pattern name
        List[Dict[str, str]],  # match records found for that pattern
    ],
]

In [3]:
def read_jsonl(p: Path) -> Iterator[Matches]:
    """Lazily decode a JSON Lines file.

    Args:
        p: Path of the file to read; it is opened as UTF-8 text.

    Yields:
        The result of ``json.loads`` for each line, in file order.
    """
    with p.open("r", encoding="utf-8") as handle:
        # The file stays open while the caller consumes the generator and is
        # closed once the last line has been decoded.
        yield from map(json.loads, handle)

In [33]:
def transform(
    line: Matches,
    pattern: str = "patterns-SOV-ruinf",
) -> Iterator[Tabular]:
    """Flatten one parsed JSONL line into tabular rows, one row per match.

    Args:
        line: Mapping of document id to that document's matches, grouped by
            pattern name.
        pattern: Name of the pattern group to extract from every document.
            Defaults to ``"patterns-SOV-ruinf"``.

    Yields:
        A dict with the keys ``document_id``, ``match``, ``sentence`` and
        ``paragraph``. ``match`` joins the match's ``subject``,
        ``anchor_influence`` and ``influence_object`` with single spaces;
        ``paragraph`` carries the match's ``fulltext`` value.

    Raises:
        KeyError: If a document has no ``pattern`` entry, or a match record
            lacks one of the fields read here.
    """
    # This is a generator, so the return annotation is Iterator[Tabular]
    # rather than a single Tabular row.
    for document_id, data in line.items():
        for match in data[pattern]:
            yield {
                "document_id": document_id,
                "match": f'{match["subject"]} {match["anchor_influence"]} {match["influence_object"]}',
                "sentence": match["sentence"],
                "paragraph": match["fulltext"],
            }

In [30]:
# Collect every row produced from every line of the matches file into one list.
data = []
for line in read_jsonl(ASSETS / "matches.jsonl"):
    data.extend(transform(line))

In [36]:
# Inspect a single transformed record to eyeball the row layout.
data[4]

{'document_id': '03d3ca77557b42e19bb49ccb26c43ddb',
 'match': 'canada urge russia',
 'sentence': '2 Canada should urge Russia to make a case for membership in NATO, through a series of slow and discreet steps.',
 'paragraph': "2 Canada should urge Russia to make a case for membership in NATO, through a series of slow and discreet steps. It should work out a strategy for convincing its 'old NATO' partners that Russian membership is the logical culmination of the course on which Canada is now embarked and point out that an all-inclusive NATO is a better guarantee of security than one that perpetuates new divisions in Europe. Russia's inclusion would be the final and conclusive step in NATO reform, in changing it from a collective security to a co-operative security organization ."}

In [38]:
# Build the table once and bind it to a name so later cells can reuse it.
# The explicit column list matches the keys of the rows in `data` and keeps
# the schema stable even when no matches were collected.
matches_df = pd.DataFrame(
    data,
    columns=["document_id", "match", "sentence", "paragraph"],
)
matches_df