In [1]:
import numpy as np
import pandas as pd

# Extract sample of 30+ texts:

In [2]:
N = 30

## Load Incidents:

In [3]:
# load data:
incidents_raw = pd.read_csv("../incidents.csv").drop(columns=["Unnamed: 0"])

# fix faulty entries:
incidents_raw.loc[incidents_raw.url == 'http://www.cfs.gov.hkhttps://www.cfs.gov.hk/english/press/20210827_8853.html', 'url'] = 'https://www.cfs.gov.hk/english/press/20210827_8853.html'

# print unique counts:
for c in incidents_raw.columns:
    print(f'{c}:{" "*(20-len(c))}{len(incidents_raw[c].drop_duplicates()):5d}')

date:                 3314
originalTitle:        7405
description:          7619
product:              1941
hazard:                409
productCategory:        30
hazardCategory:         11
supplier:             5319
url:                  7549


## Split into data and labels:

In [4]:
incidents_data = incidents_raw[["date", "url", "originalTitle", "description"]]
incidents_data = pd.DataFrame(
    data=[s[1].sort_values(by='date', ascending=False).values[0] for s in incidents_data.groupby(by='url')],
    columns=["date", "url", "originalTitle", "description"]
)

incidents_data

Unnamed: 0,date,url,originalTitle,description
0,2011-12-13T00:00:00,http://healthycanadians.gc.ca/recall-alert-rap...,Certain cheese and dairy products produced by ...,Certain cheese and dairy products produced by ...
1,2011-11-16T00:00:00,http://healthycanadians.gc.ca/recall-alert-rap...,Certain GINGER BEEF CHOICE brand READY-TO-EAT ...,Certain GINGER BEEF CHOICE brand READY-TO-EAT ...
2,2011-11-16T00:00:00,http://healthycanadians.gc.ca/recall-alert-rap...,Certain Ginger Beef Choice brand ready-to-eat ...,Certain Ginger Beef Choice brand ready-to-eat ...
3,2011-09-21T00:00:00,http://healthycanadians.gc.ca/recall-alert-rap...,Certain Gatorade brand products are being reca...,Certain Gatorade brand products are being reca...
4,2011-12-06T00:00:00,http://healthycanadians.gc.ca/recall-alert-rap...,Undeclared peanut in certain Kawartha brand ic...,Undeclared peanut in certain Kawartha brand ic...
...,...,...,...,...
7544,2019-07-09T00:00:00,https://www.sfa.gov.sg/docs/default-source/def...,"Recall of ""Crottin De Pays"" Goat Cheese from F...",MEDIA RELEASE \r\n \r\n \r\n \r\nPage 1 of 2 \...
7545,2020-11-03T00:00:00,https://www.sfa.gov.sg/docs/default-source/def...,Recall of “Bellamy's Organic - Organic Brown R...,MEDIA RELEASE \r\n \r\n \r\nPage 1 of 2 \r\n \...
7546,2020-06-02T00:00:00,https://www.sfa.gov.sg/docs/default-source/def...,Recall of “Chang Soda Water” due to presence o...,MEDIA RELEASE \r\n \r\n \r\n \r\nPage 1 of 2 \...
7547,2020-12-30T00:00:00,https://www.sfa.gov.sg/docs/default-source/def...,"Recall of Woolworths Macro Almond, Brazil & Ca...",MEDIA RELEASE \r\n \r\n \r\nPage 1 of 2 \r\n \...


## Find unique values:

In [5]:
incidents_sample = incidents_data[["url", "originalTitle", "description"]].copy()

In [6]:
incidents_sample['year'] = [int(date[:4]) for date in incidents_data.date]
years = incidents_sample.year.unique()
years = {y:1/len(years) for y in years}
years

{2011: 0.034482758620689655,
 2012: 0.034482758620689655,
 2017: 0.034482758620689655,
 2018: 0.034482758620689655,
 2020: 0.034482758620689655,
 2009: 0.034482758620689655,
 2002: 0.034482758620689655,
 2005: 0.034482758620689655,
 2004: 0.034482758620689655,
 2006: 0.034482758620689655,
 2010: 0.034482758620689655,
 2003: 0.034482758620689655,
 2007: 0.034482758620689655,
 2008: 0.034482758620689655,
 2015: 0.034482758620689655,
 2016: 0.034482758620689655,
 2013: 0.034482758620689655,
 2014: 0.034482758620689655,
 2019: 0.034482758620689655,
 2021: 0.034482758620689655,
 2022: 0.034482758620689655,
 1998: 0.034482758620689655,
 1999: 0.034482758620689655,
 2001: 0.034482758620689655,
 2000: 0.034482758620689655,
 1994: 0.034482758620689655,
 1995: 0.034482758620689655,
 1996: 0.034482758620689655,
 1997: 0.034482758620689655}

In [7]:
incidents_sample['domain'] = [(url[:8] + url[8:].split('/')[0]).split('//')[1] for url in incidents_data.url]
domains = incidents_sample.domain.unique()

drop = ['de','at','lu','gr','it']
prob = 1/(len(domains)-len(drop))
domains = {d:(0 if d[-2:] in drop else prob) for d in domains}

domains

{'healthycanadians.gc.ca': 0.05263157894736842,
 'portal.efet.gr': 0,
 'tna.europarchive.org': 0.05263157894736842,
 'wayback.archive-it.org': 0.05263157894736842,
 'www.cfs.gov.hk': 0.05263157894736842,
 'www.collectionscanada.gc.ca': 0.05263157894736842,
 'www.fda.gov': 0.05263157894736842,
 'www.food.gov.uk': 0.05263157894736842,
 'www.foodstandards.gov.au': 0.05263157894736842,
 'www.fsis.usda.gov': 0.05263157894736842,
 'www.inspection.gc.ca': 0.05263157894736842,
 'www.lebensmittelwarnung.de': 0,
 'inspection.canada.ca': 0.05263157894736842,
 'recalls-rappels.canada.ca': 0.05263157894736842,
 'securite-alimentaire.public.lu': 0,
 'webarchive.nationalarchives.gov.uk': 0.05263157894736842,
 'www.accessdata.fda.gov': 0.05263157894736842,
 'www.ages.at': 0,
 'www.foedevarestyrelsen.dk': 0.05263157894736842,
 'www.foodstandards.gov.scot': 0.05263157894736842,
 'www.fsai.ie': 0.05263157894736842,
 'www.productsafety.gov.au': 0.05263157894736842,
 'www.salute.gov.it': 0,
 'www.sfa.gov.s

In [8]:
incidents_sample['hazard'] = [str(incidents_raw.hazardCategory[incidents_raw.url == url].unique()) for url in incidents_data.url]
hazards = incidents_sample.hazard.unique()
hazards = {h:1/len(hazards) for h in hazards}
hazards

{"['biological']": 0.0625,
 "['organoleptic aspects']": 0.0625,
 "['allergens']": 0.0625,
 "['chemical']": 0.0625,
 "['foreign bodies']": 0.0625,
 "['fraud']": 0.0625,
 "['packaging defect']": 0.0625,
 "['food additives and flavourings']": 0.0625,
 "['other hazard']": 0.0625,
 "['food contact materials']": 0.0625,
 "['migration']": 0.0625,
 "['chemical' 'foreign bodies' 'biological']": 0.0625,
 "['biological' 'chemical' 'foreign bodies']": 0.0625,
 "['fraud' 'biological']": 0.0625,
 "['biological' 'foreign bodies' 'allergens']": 0.0625,
 "['fraud' 'allergens']": 0.0625}

In [9]:
incidents_sample

Unnamed: 0,url,originalTitle,description,year,domain,hazard
0,http://healthycanadians.gc.ca/recall-alert-rap...,Certain cheese and dairy products produced by ...,Certain cheese and dairy products produced by ...,2011,healthycanadians.gc.ca,['biological']
1,http://healthycanadians.gc.ca/recall-alert-rap...,Certain GINGER BEEF CHOICE brand READY-TO-EAT ...,Certain GINGER BEEF CHOICE brand READY-TO-EAT ...,2011,healthycanadians.gc.ca,['biological']
2,http://healthycanadians.gc.ca/recall-alert-rap...,Certain Ginger Beef Choice brand ready-to-eat ...,Certain Ginger Beef Choice brand ready-to-eat ...,2011,healthycanadians.gc.ca,['biological']
3,http://healthycanadians.gc.ca/recall-alert-rap...,Certain Gatorade brand products are being reca...,Certain Gatorade brand products are being reca...,2011,healthycanadians.gc.ca,['organoleptic aspects']
4,http://healthycanadians.gc.ca/recall-alert-rap...,Undeclared peanut in certain Kawartha brand ic...,Undeclared peanut in certain Kawartha brand ic...,2011,healthycanadians.gc.ca,['allergens']
...,...,...,...,...,...,...
7544,https://www.sfa.gov.sg/docs/default-source/def...,"Recall of ""Crottin De Pays"" Goat Cheese from F...",MEDIA RELEASE \r\n \r\n \r\n \r\nPage 1 of 2 \...,2019,www.sfa.gov.sg,['biological']
7545,https://www.sfa.gov.sg/docs/default-source/def...,Recall of “Bellamy's Organic - Organic Brown R...,MEDIA RELEASE \r\n \r\n \r\nPage 1 of 2 \r\n \...,2020,www.sfa.gov.sg,['chemical']
7546,https://www.sfa.gov.sg/docs/default-source/def...,Recall of “Chang Soda Water” due to presence o...,MEDIA RELEASE \r\n \r\n \r\n \r\nPage 1 of 2 \...,2020,www.sfa.gov.sg,['chemical']
7547,https://www.sfa.gov.sg/docs/default-source/def...,"Recall of Woolworths Macro Almond, Brazil & Ca...",MEDIA RELEASE \r\n \r\n \r\nPage 1 of 2 \r\n \...,2020,www.sfa.gov.sg,['allergens']


## Take uniform sample:

In [10]:
data = incidents_sample.copy()
sample = pd.DataFrame(columns=["url", "title", "text"])

def update_probs(dict, key):
    dict[key] = max(dict[key] - 1/N, 0.0001)

for i in range(N):
    data['p'] = [years[y] * domains[d] * hazards[h] for y,d,h in data[['year', 'domain', 'hazard']].values]
    data['p'] = data.p.values / data.p.values.sum()

    url = np.random.choice(data.url.values, size=1, p=data.p.values)[0]
    row = data[data.url == url]

    sample.loc[len(sample)] = [url, row.originalTitle.values[0], row.description.values[0]]

    update_probs(years,   row.year.values[0])
    update_probs(domains, row.domain.values[0])
    update_probs(hazards, row.hazard.values[0])

    print(years, domains, hazards)

sample.values

{2011: 0.034482758620689655, 2012: 0.034482758620689655, 2017: 0.034482758620689655, 2018: 0.034482758620689655, 2020: 0.034482758620689655, 2009: 0.034482758620689655, 2002: 0.034482758620689655, 2005: 0.034482758620689655, 2004: 0.034482758620689655, 2006: 0.034482758620689655, 2010: 0.034482758620689655, 2003: 0.034482758620689655, 2007: 0.034482758620689655, 2008: 0.034482758620689655, 2015: 0.034482758620689655, 2016: 0.034482758620689655, 2013: 0.034482758620689655, 2014: 0.034482758620689655, 2019: 0.0011494252873563218, 2021: 0.034482758620689655, 2022: 0.034482758620689655, 1998: 0.034482758620689655, 1999: 0.034482758620689655, 2001: 0.034482758620689655, 2000: 0.034482758620689655, 1994: 0.034482758620689655, 1995: 0.034482758620689655, 1996: 0.034482758620689655, 1997: 0.034482758620689655} {'healthycanadians.gc.ca': 0.05263157894736842, 'portal.efet.gr': 0, 'tna.europarchive.org': 0.05263157894736842, 'wayback.archive-it.org': 0.05263157894736842, 'www.cfs.gov.hk': 0.05263

        'Parador Plus brand Bouillon Chicken Concentrate recalled due to yeast and mould',
        'Notification - Parador Plus brand Bouillon Chicken Concentrate recalled due to yeast and mould Recall / advisory date: September 20, 2019 Reason for recall / advisory: Microbiological - Non harmful (Quality/Spoilage) Hazard classification: Class 3 Company / Firm: Les Aliments Parador Foods Inc. Distribution: Quebec Extent of the distribution: Hotel/Restaurant/Institutional Reference number: 13227 Contents Affected products Public enquiries and media Affected products Brand Name Common Name Size UPC Code(s) on Product Parador Plus Bouillon Chicken Concentrate 2 L 7 72288 00075 0 08824 Public enquiries and media Public enquiries Toll-free: 1-800-442-2342 (Canada and U.S.) Telephone: 1-613-773-2342 (local or international) Email: cfia.enquiries-demandederenseignements.acia@canada.ca Media relations Telephone: 613-773-6600 Email: cfia.media.acia@canada.ca Report a problem on this page Date m

## Add empty columns:

In [11]:
sample['year'] = [incidents_sample.year[incidents_data.url == url].values[0] for url in sample.url]

sample['productCategory'] = "..."
sample['productSpan'] = "..."

sample['hazardCategory'] = "..."
sample['hazardSpan'] = "..."

sample


Unnamed: 0,url,title,text,year,productCategory,productSpan,hazardCategory,hazardSpan
0,http://www.inspection.gc.ca/about-the-cfia/new...,Parador Plus brand Bouillon Chicken Concentrat...,Notification - Parador Plus brand Bouillon Chi...,2019,...,...,...,...
1,https://www.food.gov.uk/news-alerts/alert/fsa-...,KP Snacks recalls Popchips Veg Vibes Sea Salt ...,KP Snacks is recalling Popchips Veg Vibes Sea ...,2022,...,...,...,...
2,https://www.fda.gov/Safety/Recalls/ucm528271.htm,SHRI SHIVA Foods Inc. Recalls,"Long Island City, NY - SHRI SHIVA Foods Inc. i...",2016,...,...,...,...
3,https://www.productsafety.gov.au/recall/kerry-...,Kerry Pinnacle Pty Ltd—Choc Cherry Slice & Cho...,PRA No. 2011/12883 Date published 25 Oct 2011 ...,2011,...,...,...,...
4,https://www.fsai.ie/news_centre/food_alerts/se...,Recall of Batches of Seven Seas Omega-3 and Im...,Recall of Batches of Seven Seas Omega-3 and Im...,2021,...,...,...,...
5,https://www.fsai.ie/news_centre/food_alerts/Wi...,Plastic fragment in prepared dishes by Asian A...,Recall of a Batch of The Wild Wok Vegan Gyoza ...,2020,...,...,...,...
6,http://www.fsis.usda.gov/wps/wcm/connect/FSIS-...,Oscar's Hickory House Recalls Beef Jerky Produ...,"WASHINGTON, March 1, 2015 – Oscar’s Hickory Ho...",2015,...,...,...,...
7,http://www.foodstandards.gov.au/industry/foodr...,Foster Clark’s dairy desserts – foreign matter...,Page Content Cerebos (Australia) Ltd has recal...,2012,...,...,...,...
8,http://tna.europarchive.org/content/2011040522...,The Co-operative Group recalls certain batch c...,The Co-operative Group recalls certain batch c...,2010,...,...,...,...
9,https://www.fsis.usda.gov/wps/portal/fsis/topi...,"Star Natural Meats, LLC Recalls Raw Pork Sausa...","WASHINGTON, Aug. 3, 2018 – Star Natural Meats,...",2018,...,...,...,...


## Create ground truth:

In [12]:
ground_truth = sample.copy()

ground_truth['hazardCategory'] = [str(incidents_raw.hazardCategory[incidents_raw.url == url].unique()) for url in ground_truth.url]
ground_truth['productCategory'] = [str(incidents_raw.productCategory[incidents_raw.url == url].unique()) for url in ground_truth.url]

ground_truth

Unnamed: 0,url,title,text,year,productCategory,productSpan,hazardCategory,hazardSpan
0,http://www.inspection.gc.ca/about-the-cfia/new...,Parador Plus brand Bouillon Chicken Concentrat...,Notification - Parador Plus brand Bouillon Chi...,2019,"['soups, broths, sauces and condiments']",...,['biological'],...
1,https://www.food.gov.uk/news-alerts/alert/fsa-...,KP Snacks recalls Popchips Veg Vibes Sea Salt ...,KP Snacks is recalling Popchips Veg Vibes Sea ...,2022,['prepared dishes and snacks'],...,['allergens'],...
2,https://www.fda.gov/Safety/Recalls/ucm528271.htm,SHRI SHIVA Foods Inc. Recalls,"Long Island City, NY - SHRI SHIVA Foods Inc. i...",2016,['herbs and spices'],...,['biological'],...
3,https://www.productsafety.gov.au/recall/kerry-...,Kerry Pinnacle Pty Ltd—Choc Cherry Slice & Cho...,PRA No. 2011/12883 Date published 25 Oct 2011 ...,2011,['cereals and bakery products'],...,['allergens'],...
4,https://www.fsai.ie/news_centre/food_alerts/se...,Recall of Batches of Seven Seas Omega-3 and Im...,Recall of Batches of Seven Seas Omega-3 and Im...,2021,"['dietetic foods, food supplements, fortified ...",...,['chemical'],...
5,https://www.fsai.ie/news_centre/food_alerts/Wi...,Plastic fragment in prepared dishes by Asian A...,Recall of a Batch of The Wild Wok Vegan Gyoza ...,2020,['prepared dishes and snacks'],...,['foreign bodies'],...
6,http://www.fsis.usda.gov/wps/wcm/connect/FSIS-...,Oscar's Hickory House Recalls Beef Jerky Produ...,"WASHINGTON, March 1, 2015 – Oscar’s Hickory Ho...",2015,['meat and meat products (other than poultry)'],...,['other hazard'],...
7,http://www.foodstandards.gov.au/industry/foodr...,Foster Clark’s dairy desserts – foreign matter...,Page Content Cerebos (Australia) Ltd has recal...,2012,['ices and desserts'],...,['foreign bodies'],...
8,http://tna.europarchive.org/content/2011040522...,The Co-operative Group recalls certain batch c...,The Co-operative Group recalls certain batch c...,2010,['non-alcoholic beverages'],...,['organoleptic aspects'],...
9,https://www.fsis.usda.gov/wps/portal/fsis/topi...,"Star Natural Meats, LLC Recalls Raw Pork Sausa...","WASHINGTON, Aug. 3, 2018 – Star Natural Meats,...",2018,['meat and meat products (other than poultry)'],...,['fraud'],...


# Save sample and ground truth:

In [13]:
for i in sample.index:
    def caption(txt):
        return f"\n\n{'#'*100}\n{txt}:\n{'#'*100}\n\n"

    tick = "\n\t\t - "

    out = f'[{sample.year.values[i]:d} - {sample.url.values[i]}]'

    out += caption("Text") + sample.text.values[i]

    out += caption("Labelling instructions") + f"""\t1.\tPlease assign labels to the columns "productCategory" and
    \t\t"hazardCategory" describing the contents of the text.
    \t\tValues for the field "productCategory" should be chosen from the following:
    {tick}{tick.join(incidents_raw.productCategory.unique())}

    \t\tValues for the field "hazardCategory" should be chosen from the following:
    {tick}{tick.join(incidents_raw.hazardCategory.unique())}

    \t2.\tPlease copy and paste the text span(s) from the text that
    \t\tare most descriptive of the respective assigned label into the columns
    \t\t"productSpan" and "hazardSpan".Spans should be separated by linefeeds."""

    out += caption("productCategory") + sample.productCategory.values[i]
    out += caption("productSpan") + sample.productSpan.values[i]
    out += caption("hazardCategory") + sample.hazardCategory.values[i]
    out += caption("hazardSpan") + sample.hazardSpan.values[i]

    with open(f'samples/#{i:d}.txt', 'w', encoding='utf-8') as f:
        f.write(out)

In [15]:
sample.to_csv('sample.csv', sep=';', encoding='utf-8')
ground_truth.to_csv('ground_truth.csv', sep=';', encoding='utf-8')