In [23]:
import sqlite3
import pandas as pd

# Open the Wikipedia sentence database read-only. With a plain path,
# sqlite3.connect() silently creates an empty database when the file is
# missing, which only surfaces later as a "no such table" error; mode=ro
# fails at connect time instead and guards the corpus against accidental
# writes (this notebook only issues SELECTs against it).
con = sqlite3.connect("file:../../wikipedia.sqlite?mode=ro", uri=True)

## Training data

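The training data is stored as metadata: each claim points to its evidence by Wikipedia page title and sentence index instead of containing the sentence text itself.
1. `train.json` has the following format: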
```json
  "75397": {
    "claim": "Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.",
    "label": "SUPPORTS",
    "evidence": [
      [
        "Fox_Broadcasting_Company",
        0
      ],
      [
        "Nikolaj_Coster-Waldau",
        7
      ]
    ]
  },
```

This metadata is used to look up each evidence sentence in the Wikipedia database; the retrieved sentences are then used to train the classifier.

In [24]:
# Cursor used for the single sanity-check lookup in this cell; it is closed
# again at the end of the cell.
cur = con.cursor()
def select_sentence(title: str, idx: int) -> str:
    """Build the SQL statement that fetches one evidence sentence.

    Args:
        title: Wikipedia page title, compared against the ``title`` column.
        idx: Sentence index, compared against the ``idx`` column.

    Returns:
        A ``SELECT`` statement (as a string) against the ``Wikipedia`` table
        that yields at most one ``sentence`` row.

    Raises:
        ValueError: If ``idx`` is a string that is not an integer literal.
        TypeError: If ``idx`` cannot be converted to an integer at all.
    """
    title = title.replace("'", "''")        # escape single quotes
    # idx is spliced into the statement unquoted, so force it to be a real
    # integer; anything else (e.g. "0 OR 1=1") is rejected here instead of
    # being handed to SQLite as SQL text.
    idx = int(idx)
    return f"SELECT sentence FROM Wikipedia WHERE title='{title}' AND idx={idx} LIMIT 1;"

# Sanity check: look up one known evidence reference (page title plus
# sentence index) and show what comes back.
row = cur.execute(select_sentence("Fox_Broadcasting_Company", 0)).fetchone()
cur.close()
row  # last expression of the cell, so the notebook displays the fetched row

#### Use the metadata to build the training data as a DataFrame.

The columns are:
```text
claim, label, evidence_title, evidence_sentence
```

In [3]:
import json
# JSON text is UTF-8; state the encoding explicitly so loading does not depend
# on the platform's default locale encoding.
with open('../../dataset/train.json', 'r', encoding='utf-8') as h:
    meta_train = json.load(h)

# Every entry in a claim's 'evidence' list becomes one training example, so the
# example count is the total number of evidence references across all claims.
n_examples = sum(len(j.get('evidence')) for j in meta_train.values())

print(f"Number of claims: {len(meta_train)}")
print(f"Number of exmaples: {n_examples}")

Number of claims: 145449
Number of exmaples: 204041


In [25]:
from typing import List
from tqdm import tqdm
# Parallel lists with one entry per (claim, evidence sentence) pair. All four
# are appended to together, so they always have the same length.
claims: List[str] = list()
labels: List[str] = list()
ev_titles: List[str] = list()
ev_sents: List[str] = list()

# Messages for evidence references that have no matching row in the database.
skipped = list()

# Bound parameters instead of string-built SQL: the titles come straight from
# train.json and must never be parsed as SQL text.
SENTENCE_QUERY = "SELECT sentence FROM Wikipedia WHERE title=? AND idx=? LIMIT 1;"

cur = con.cursor()
try:
    for data in tqdm(meta_train.values()):
        for ev in data.get('evidence'):
            # Each evidence reference is a [page title, sentence index] pair.
            title = ev[0]
            idx = ev[1]
            cur.execute(SENTENCE_QUERY, (title, idx))
            res = cur.fetchone()
            if res is None:
                skipped.append(f"-- Title:{title} Idx:{idx} skipped. No results found.")
                continue

            # The claim and label are repeated for every evidence sentence found.
            claims.append(data.get('claim'))
            labels.append(data.get('label'))
            ev_titles.append(title)
            ev_sents.append(res[0])
finally:
    # Release the cursor even if a lookup raises part-way through the loop.
    cur.close()

print(f"Claims: {len(claims)}, Labels: {len(labels)}")
print(f"Skipped: {len(skipped)}")

100%|██████████| 145449/145449 [00:04<00:00, 33441.27it/s]

Claims: 202563, Labels: 202563
Skipped: 1478





In [26]:
# Column names, listed in the same order as the parallel lists zipped below.
cols = ['claim', 'label', 'evidence_title', 'evidence_sentence']

# One DataFrame row per (claim, label, evidence title, evidence sentence) tuple.
df = pd.DataFrame(
    data=zip(claims, labels, ev_titles, ev_sents),
    columns=cols,
)

# Row count of the assembled frame.
df.shape[0]

202563

In [27]:
# Preview the first rows to eyeball that claims, labels and evidence line up.
df.head()

Unnamed: 0,claim,label,evidence_title,evidence_sentence
0,Nikolaj Coster-Waldau worked with the Fox Broa...,SUPPORTS,Fox_Broadcasting_Company,The Fox Broadcasting Company -LRB- often short...
1,Nikolaj Coster-Waldau worked with the Fox Broa...,SUPPORTS,Nikolaj_Coster-Waldau,He then played Detective John Amsterdam in the...
2,Roman Atwood is a content creator.,SUPPORTS,Roman_Atwood,"He is best known for his vlogs , where he post..."
3,Roman Atwood is a content creator.,SUPPORTS,Roman_Atwood,He also has another YouTube channel called `` ...
4,"History of art includes architecture, dance, s...",SUPPORTS,History_of_art,The subsequent expansion of the list of princi...


In [28]:
# Persist the training set. The frame's index is just the default row counter,
# so leave it out; otherwise it is written as a header-less first column that
# comes back as "Unnamed: 0" when the file is read with pd.read_csv.
df.to_csv('../../train.csv', index=False)