<a href="https://colab.research.google.com/github/kingsdigitallab/lwm-davizct/blob/main/notebooks/Export.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Export data for _frontend_ visualisations
## Instructions

- [Download data from Zooniverse](https://github.com/Living-with-machines/zoonyper/blob/v0.1.0/docs/source/getting-started/tutorials/setting-up-your-first-project.rst)
- Set the data paths in the Settings section below
- Run the rest of the notebook


## Settings

In [1]:
# Directory handed to Project() when the project is loaded below.
in_path = "../data/0_raw"
# Directory the newspaper CSV files are read from.
ext_path = "../data/0_external"
# Directory handed to Project.disambiguate_subjects() below.
downloads_path = "../data/downloads"
# Directory every JSON file produced by this notebook is written to.
out_path = "../data/1_interim"

## System set up

In [2]:
import base64
import json
import os
import zlib
from pprint import pprint

import pandas as pd

from zoonyper.project import Project

In [3]:
# Load the project from the raw data directory configured in Settings.
project = Project(in_path)
# NOTE(review): presumably resolves duplicated subjects using the files in
# downloads_path and provides the `subject_id_disambiguated` field that the
# Subjects section renames to `sid` - confirm against the zoonyper docs.
subjects_df = project.disambiguate_subjects(downloads_path)

disambiguated. If you want to do so, run the
`.disambiguate_subjects(<download-dir>)` method.[0m


100%|████████████████████████████████████████████| 51488/51488 [00:34<00:00, 1504.46it/s]


## Workflows

In [4]:
workflows_df = project.workflows


def workflow_entry(workflows, workflow_id):
    """Build the ``{workflow_id, name}`` record for one workflow.

    Args:
        workflows: frame indexed by workflow id with a ``display_name`` column.
        workflow_id: id of the workflow to describe.

    Returns:
        dict with the workflow id and the first ``display_name`` found for it.

    Raises:
        KeyError: if ``workflow_id`` is not in the index of ``workflows``.
    """
    # Selecting with a list always yields a Series, so `.iloc[0]` works both
    # when the id occurs once and when it occurs several times in the index.
    names = workflows.loc[[workflow_id], "display_name"]
    return dict(workflow_id=workflow_id, name=names.iloc[0])


# Frontend projects and the workflows that belong to each of them.
project_workflow_ids = {
    "Language of accidents": [18831, 20921, 21525, 23457],
    "Language of mechanisation": [23672, 23681, 23628, 23452],
}

projects = [
    dict(
        title=title,
        workflows=[workflow_entry(workflows_df, wid) for wid in workflow_ids],
    )
    for title, workflow_ids in project_workflow_ids.items()
]

# Flat list of every exported workflow id, used as a filter by later cells.
projects_workflows = [w["workflow_id"] for p in projects for w in p["workflows"]]
pprint(projects)

# This is the first cell that writes to out_path; make sure it exists.
os.makedirs(out_path, exist_ok=True)
with open(os.path.join(out_path, "projects.json"), "w") as f:
    json.dump(projects, f)

[{'title': 'Language of accidents',
  'workflows': [{'name': 'Accident detail: age and gender of victims',
                 'workflow_id': 18831},
                {'name': 'How did machines change accidents?',
                 'workflow_id': 20921},
                {'name': 'Accident detail: in which sites were accidents '
                         'reported?',
                 'workflow_id': 21525},
                {'name': 'Accident detail: where were accidents reported?',
                 'workflow_id': 23457}]},
 {'title': 'Language of mechanisation',
  'workflows': [{'name': 'Bicycle or motorcycle?', 'workflow_id': 23672},
                {'name': "How did the word 'coach' change over time and place?",
                 'workflow_id': 23681},
                {'name': "How did the word 'car' change over time and place?",
                 'workflow_id': 23628},
                {'name': "How did the word 'trolley' change over time and "
                         'place?',
              

In [5]:
timelines = project.get_workflow_timelines()


def get_date(workflow_id, date_field):
    """Look up one date of a workflow in the module-level ``timelines``.

    Args:
        workflow_id: value compared against each timeline's ``"workflow_id"``.
        date_field: key of the date to read from the matching timeline.

    Returns:
        The ``date_field`` value of the first timeline whose workflow id
        matches, or ``None`` when no timeline matches.
    """
    matches = list(filter(lambda t: t["workflow_id"] == workflow_id, timelines))
    return matches[0][date_field] if matches else None


def parse_tasks(row):
    """Return the ids of the non-drawing tasks of a workflow row.

    Args:
        row: mapping (e.g. a DataFrame row) whose ``"tasks"`` entry is a JSON
            object string mapping task ids to task definitions; every task
            definition must have a ``"type"`` key.

    Returns:
        list of task ids, in the order they appear in the JSON object,
        excluding tasks whose type is ``"drawing"``.
    """
    tasks = json.loads(row["tasks"])

    return [task_id for task_id, task in tasks.items() if task["type"] != "drawing"]


# Columns left out of the exported workflows file.
dropped_workflow_columns = [
    "pairwise",
    "grouped",
    "prioritized",
    "primary_language",
    "tutorial_subject_id",
    "retired_set_member_subjects_count",
    "retirement",
    "aggregation",
    "strings",
]

workflows_df = project.workflows
# A workflow id can occur several times in the index; keep its last row only.
workflows_df = workflows_df[~workflows_df.index.duplicated(keep="last")]
workflows_df = workflows_df.drop(columns=dropped_workflow_columns)
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise SettingWithCopyWarning.
workflows_df = workflows_df[workflows_df.index.isin(projects_workflows)].copy()
workflows_df["start_date"] = workflows_df.index.map(lambda x: get_date(x, "start_date"))
workflows_df["end_date"] = workflows_df.index.map(lambda x: get_date(x, "end_date"))
# Replace the raw tasks JSON with the list of non-drawing task ids.
workflows_df["tasks"] = workflows_df.apply(parse_tasks, axis=1)
workflows_df.to_json(os.path.join(out_path, "workflows.json"), orient="index")
workflows_df.head()

Unnamed: 0_level_0,display_name,version,active,classifications_count,first_task,tasks,minor_version,start_date,end_date
workflow_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
18831,Accident detail: age and gender of victims,52,True,12251,T2,"[T1, T2, T3, T4, T6]",157,2021-06-21,2023-04-18
20921,How did machines change accidents?,10,True,54720,T0,[T0],81,2022-02-16,2023-04-18
21525,Accident detail: in which sites were accidents...,20,True,18744,T0,"[T0, T1]",56,2022-05-25,2023-04-18
23452,How did the word 'trolley' change over time an...,26,False,3346,T0,[T0],77,2023-02-06,2023-03-19
23457,Accident detail: where were accidents reported?,13,True,4952,T1,"[T0, T1, T2, T3]",54,2023-02-05,2023-04-18


## Participants

In [6]:
def _user_counts(group):
    """Map every user name in ``group`` to the value of its ``count`` column."""
    return group.set_index("user_name")["count"].to_dict()


# One row per (workflow, user) pair with the number of classification rows.
classified_by = project.classifications[["workflow_id", "user_name"]]
exported_only = classified_by["workflow_id"].isin(projects_workflows)
participants_df = (
    classified_by[exported_only]
    .value_counts(["workflow_id", "user_name"])
    .reset_index(name="count")
)

# Nested mapping: workflow id -> {user name -> count}.
participants = participants_df.groupby("workflow_id").apply(_user_counts).to_dict()

participants_file = os.path.join(out_path, "participants.json")
with open(participants_file, "w") as f:
    json.dump(participants, f)

participants_df.head()

Unnamed: 0,workflow_id,user_name,count
0,20921,99c3512d,5588
1,20921,6dc0ad7d,3017
2,20921,0f1402a3,2844
3,23672,5f4f6fc2,1983
4,20921,bab0f6bb,1864


## Newspapers

In [7]:
def _split_on_sep(value):
    """Split a string on the ``<SEP>`` marker."""
    return value.split("<SEP>")


source_links_df = pd.read_csv(os.path.join(ext_path, "newspapers_links.csv"))

# The filter on subjects["nlp"] is disabled here: `subjects` is only created
# in the Subjects section further down.
# source_df = source_df[source_df["nlp"].isin(subjects["nlp"])]
source_df = (
    pd.read_csv(os.path.join(ext_path, "Mitchell_1846_1920.csv"))
    # Attach the NLP id of every directory entry that has a link.
    .merge(
        source_links_df[["NLP", "link_to_mpd"]],
        left_on="ID",
        right_on="link_to_mpd",
    )
    .drop(columns=["Unnamed: 0", "link_to_mpd"])
    .rename(columns=str.lower)
    .rename(
        columns={
            "established_date": "established",
            "publised_date": "published",
            "place_pub": "place",
            "place_pub_wiki": "place_id",
            "place_pub_coord": "coord",
        }
    )
    .drop(columns=["persons", "organizations", "places", "places_tres", "text"])
    # Keep the first entry for every NLP id.
    .drop_duplicates(subset="nlp")
    # Turn the <SEP>-separated politics and price strings into lists.
    .assign(
        politics=lambda df: df["politics"].fillna("").astype(str).apply(_split_on_sep),
        price=lambda df: df["price"].astype(str).apply(_split_on_sep),
    )
)

newspapers_file = os.path.join(out_path, "newspapers.json")
source_df.set_index("nlp").to_json(newspapers_file, orient="index")

source_df.head()

Unnamed: 0,id,chain_id,year,title,price,politics,established,published,place,place_id,coord,nlp
0,MPD_1846_65,CID_000001,1846,LLOYD'S WEEKLY LONDON NEWSPAPER .,[3d],"[democratic, anti-poor-law]","November , 1842",Saunday,london,Q84,"(51.507222, -0.1275)",78
6,MPD_1847_73,CID_000001,1847,LLOYD'S WEEKLY LONDON NEWSPAPER .,[3d],"[democratic, anti-poor-law]","November , 1842",Sunday,london,Q84,"(51.507222, -0.1275)",79
60,MPD_1846_156,CID_000004,1846,BRIGHTON HERALD .,[5d],[liberal],"September , 1806",Saturday,brighton,Q131491,"(50.824167, -0.133889)",1663
65,MPD_1846_170,CID_000006,1846,KENTISH GAZETTE .,[4d],[conservative],1718<SEP>1768,Tuesday,canterbury,Q29303,"(51.278333, 1.0775)",235
141,MPD_1920_705,CID_000006,1920,"KENTISH GAZETTE , AND CANTERBURY TIMES .",[1 ½ d],[independent],1717,Friday<SEP>afternoons,canterbury,Q29303,"(51.278333, 1.0775)",1834


## Subjects

In [8]:
subjects = subjects_df.reset_index()

# Keep the subjects of the exported workflows that have been classified.
subjects = subjects[subjects["workflow_id"].isin(projects_workflows)]
subjects = subjects[subjects["classifications_count"] > 0]
subjects = subjects.rename(
    columns={
        "subject_id_disambiguated": "sid",
        "!inferred_nlp": "nlp",
        "!text": "text",
        "!image": "img",
    }
)
subjects["sid"] = pd.to_numeric(subjects["sid"]).astype("Int64")
# Values that cannot be parsed as a number become missing, then 0.
subjects["nlp"] = pd.to_numeric(subjects["nlp"], errors="coerce").astype("Int64")
subjects["nlp"] = subjects["nlp"].fillna(0)

# Drop every column whose name matches "!original_metadata" or "newspaper".
subjects = subjects[
    subjects.columns.drop(list(subjects.filter(regex="!original_metadata")))
]
subjects = subjects[subjects.columns.drop(list(subjects.filter(regex="newspaper")))]
subjects = subjects.drop(
    columns=[
        "!article_id",
        "!crop_rectangle",
        "!issue",
        "!myid",
        "!origin",
        "!original_id",
        "!page",
        "!text_source",
        "!textblock_id",
        "!zooniverse_file_md5",
        "!zooniverse_file_stem",
        "attribution",
        "locations",
        "subject_set_id",
        "origin",
        "project_id",
        "",
    ]
)
subjects = subjects.sort_values(by="sid")

# links between subject ids and disambiguated subject ids
subjects[["sid", "subject_id"]].drop_duplicates().set_index("subject_id").to_json(
    os.path.join(out_path, "subjects_ids.json"), orient="index"
)

# subjects metadata, one entry per disambiguated subject id
subjects.drop_duplicates(subset="sid").drop(
    columns=["subject_id", "text", "img"]
).set_index("sid").to_json(os.path.join(out_path, "subjects.json"), orient="index")

# subjects texts, zlib-compressed then base64-encoded
# set_index returns a new frame: its result has to be kept, otherwise the file
# is keyed by the leftover row index instead of by sid.
subjects_text = (
    subjects[["sid", "text"]].drop_duplicates(subset="sid").set_index("sid")
)
subjects_text["text"] = (
    subjects_text["text"]
    .fillna("")
    # decode so the column holds str rather than bytes before serialising
    .apply(
        lambda x: base64.b64encode(zlib.compress(x.encode("utf-8"))).decode("ascii")
    )
)
subjects_text.to_json(os.path.join(out_path, "subjects_text.json"), orient="index")

# subjects images
# NOTE(review): orient="records" does not write the index, so the sid set as
# index here is absent from the file; confirm whether orient="index" was meant.
subjects[["sid", "img"]].drop_duplicates(subset="sid").set_index("sid").to_json(
    os.path.join(out_path, "subjects_image.json"), orient="records"
)

subjects.head()

Unnamed: 0,subject_id,sid,workflow_id,classifications_count,retired_at,retirement_reason,created_at,updated_at,img,nlp,text
288213,86518535,2,23681,3,2023-03-31,classification_count,2023-03-22,2023-04-01,0003075-18951026-0005-coach_pa0005103.png,3075,ortune in fuer seet — t — a:coinpativing indus...
243648,82929991,3,20921,4,2023-01-28,consensus,2022-11-23,2023-04-01,0003089-18540516-art0053-pa0004079-accidents-1...,3089,"Flats.—On Saturdav night, shortly after ten o'..."
153781,82894129,5,20921,4,2022-12-06,consensus,2022-11-22,2023-04-02,0002597-18740523-art0004-pa0001014-accidents-1...,2597,aREENHOUSES anti - glass 10 ft. by 5 ft. £6. ‘...
227582,82927959,6,20921,5,2023-01-30,consensus,2022-11-23,2023-04-02,0003089-18460924-art0005-pa0001011-accidents-1...,3089,_ scramble through the residue of the late Sir...
284381,85830671,9,23628,3,2023-03-26,classification_count,2023-02-27,2023-04-02,0003040-19010622-0003-car_pa0003046.png,3040,The Ravens had a very pleasant game on Saturda...


## Annotations

In [9]:
def agg_task_values(task):
    """Join the usable answers of one task into a comma separated string.

    Every value is stripped of surrounding whitespace *before* it is
    validated, so blank answers and padded ``"None"`` markers are dropped
    instead of ending up as empty entries in the result.

    Args:
        task: iterable of answer strings.

    Returns:
        The stripped, valid values joined with ``","``; an empty string when
        there is none.
    """
    stripped = (value.strip() for value in task)
    return ",".join(value for value in stripped if is_value_valid(value))


def is_value_valid(value):
    """Tell whether ``value`` is an answer, i.e. neither ``""`` nor ``"None"``."""
    return value not in ["", "None"]


def convert_to_unique_list(value):
    """Split a comma separated string into a sorted list without duplicates.

    Items are compared case-insensitively (``str.casefold``); when several
    items differ only by case, the last one in the string is kept. The result
    is ordered by the casefolded items.

    Args:
        value: comma separated string, or a falsy value.

    Returns:
        list of unique items, or ``None`` when ``value`` is falsy.
    """
    if not value:
        return None

    latest_by_key = {}
    for item in value.split(","):
        latest_by_key[item.casefold()] = item

    return [latest_by_key[key] for key in sorted(latest_by_key)]


# Task columns present in the flattened annotations: T0 ... T6.
task_columns = [f"T{number}" for number in range(0, 7)]

annotations_df = project.annotations_flattened
workflow_mask = annotations_df["workflow_id"].isin(projects_workflows)
annotations_df = (
    annotations_df[workflow_mask]
    # Bring in the disambiguated subject id of every annotation.
    .merge(
        subjects[["subject_id", "sid"]],
        how="left",
        left_on="subject_ids",
        right_on="subject_id",
    )
    .drop(columns=["workflow_version", "subject_ids", "subject_id"])
    # One row per workflow and subject, answers joined per task.
    .groupby(["workflow_id", "sid"])
    .agg(agg_task_values)
    .reset_index()
)

for column in task_columns:
    annotations_df[column] = annotations_df[column].apply(convert_to_unique_list)

# Discard the rows that have no answer in any task.
annotations_df = annotations_df.dropna(subset=task_columns, how="all")

annotations_file = os.path.join(out_path, "annotations.json")
annotations_df.to_json(annotations_file, orient="records")
annotations_df.head()

Unnamed: 0,workflow_id,sid,T0,T1,T2,T3,T4,T5,T6
1,18831,42,,[Male],[One],[57],,,
3,18831,70,,,[More than one],,[All male],,"[Adults (the default), Mixed age groups]"
5,18831,93,,[Male],"[More than one, One]",[Adult],[Majority male group],,[Adults (the default)]
6,18831,102,,,[More than one],,[Majority male group],,"[Adults (the default), Mixed age groups]"
7,18831,112,,,[More than one],,[All male],,[Adults (the default)]
