In [8]:
import _base_path
import json
import numpy as np
import pandas as pd

import matplotlib
# `init_done` is only assigned at the end of this cell, so the branch below is skipped the
# first time the cell runs in a fresh kernel and taken on every later execution of the cell.
if 'init_done' in globals():
    # select the PGF backend and have text rendered through LaTeX (pdflatex):
    matplotlib.use("pgf")
    matplotlib.rcParams.update({
        "pgf.texsystem": "pdflatex",
        'font.family': 'serif',
        'text.usetex': True,
        'pgf.rcfonts': False,
    })
import matplotlib.pyplot as plt

from resources.spans import SpanCollection

# flag checked by the `if` above when this cell is executed again:
init_done = True

# Load data:

In [9]:
# load data (the exported index column and the raw text columns are dropped):
incidents = pd.read_csv("../data/incidents/incidents_final.csv").drop(columns=["Unnamed: 0", "text", "product_text", "hazard_text"])

# parse products ('|'-separated strings -> lists of labels; missing values become ['']):
incidents['product']          = [p.split('|') for p in incidents['product'].fillna('')]
incidents['product_category'] = [p.split('|') for p in incidents['product_category'].fillna('')]
#incidents['product_title']    = [SpanCollection.parse(p) for p in incidents['product_title'].fillna('')]

# parse hazards:
incidents['hazard']           = [h.split('|') for h in incidents['hazard'].fillna('')]
incidents['hazard_category']  = [h.split('|') for h in incidents['hazard_category'].fillna('')]
incidents['hazard_title']     = [SpanCollection.parse(h) for h in incidents['hazard_title'].fillna('')]

# parse suppliers:
incidents['supplier_title']   = [SpanCollection.parse(s) for s in incidents['supplier_title'].fillna('')]

# fill nan-values:
# The result is assigned back instead of calling `fillna(..., inplace=True)` on the column
# selection: the in-place form acts on an intermediate object, is deprecated in pandas 2.x
# and no longer updates `incidents` once Copy-on-Write is active.
incidents['country'] = incidents['country'].fillna('na')

def print_column(column:str, n:int=10):
    """Print the ``n`` most frequent values of ``incidents[column]`` with their row counts.

    Cells holding a container (list, tuple, set, frozenset, numpy array) are treated as
    multi-label cells: each contained label is counted once per row. Any other cell is
    counted as a single value by equality, so numeric columns work and string columns
    are not matched by substring.

    Args:
        column: Name of the column of the module-level ``incidents`` frame to summarise.
        n:      Maximum number of values to list (most frequent first).

    Output:
        A header line with the number of distinct values, an empty line, then one line
        per value. Ties in frequency are ordered by the string form of the value.
    """
    from collections import Counter

    def _labels(cell):
        # multi-label cells contribute each distinct label once per row
        if isinstance(cell, (list, tuple, set, frozenset, np.ndarray)):
            return set(cell)
        return {cell}

    # single pass over the column instead of one full scan per distinct value:
    counts = Counter()
    for cell in incidents[column]:
        counts.update(_labels(cell))

    # most frequent first; ties resolved deterministically by the value's text:
    ranked = sorted(counts.items(), key=lambda item: (-item[1], str(item[0])))

    print(f'Column "{column}" (n = {len(ranked):d}):\n')
    for value, count in ranked[:n]:
        # value and colon are padded to 51 characters, the count is right-aligned in 5
        print(f'  {str(value) + ":":<51}{count:5d}')

# number of distinct entries per column (name and colon padded to 21 characters):
for c in incidents.columns:
    print(f'  {c + ":":<21}{incidents[c].drop_duplicates().shape[0]:5d}')

  year:                   29
  month:                  12
  day:                    31
  url:                  7546
  title:                7389
  product:              1931
  product_category:       60
  hazard:                409
  hazard_category:        12
  supplier_title:       7619
  supplier_text:        7181
  language:                6
  country:                15
  product_title:        3337
  hazard_title:         7619


In [10]:
# sanity check after parsing: (rows, columns) of the frame, then its first rows
print(incidents.shape)
incidents.head()

(7619, 15)


Unnamed: 0,year,month,day,url,title,product,product_category,hazard,hazard_category,supplier_title,supplier_text,language,country,product_title,hazard_title
0,2015,5,26,https://www.fda.gov/Safety/Recalls/ArchiveReca...,2015 - House of Spices (India) Inc. Issues Ale...,[dried apricots],[fruits and vegetables],[undeclared sulphite],[allergens],"(slice(7, 35, None))","(33,47)",en,us,"(49,50)","(slice(16, 22, None), slice(43, 51, None))"
1,2022,5,25,https://www.fda.gov/safety/recalls-market-with...,Supplier J.M. Smucker Co.’s Jif Recall Prompts...,[peanuts],"[nuts, nut products and seeds]",[salmonella],[biological],"(slice(53, 62, None))","(0,9)|(129,137)|(360,368)|(448,456)|(665,673)|...",en,us,"(14,20)|(28,30)|(85,90)","(slice(0, 8, None), slice(47, 52, None), slice..."
2,2020,6,2,http://www.cfs.gov.hk/english/whatsnew/whatsne...,*(Updated on 2 June 2020) Not to consume a bat...,[apple juice],[non-alcoholic beverages],[patulin],[chemical],(),"(354,365)|(581,592)|(1616,1620)",en,hk,"(72,76)","(slice(30, 32, None), slice(96, 103, None))"
3,2022,7,5,http://www.cfs.gov.hk/english/whatsnew/whatsne...,*(Updated on 5 July 2022) Not to consume smoke...,[chilled smoked salmon],[fish and fish products],[listeria monocytogenes],[biological],(),"(484,491)|(1197,1217)",en,hk,"(41,67)","(slice(48, 63, None), slice(101, 109, None))"
4,2021,3,20,http://www.fsis.usda.gov/recalls-alerts/avanza...,"Avanza Pasta, LLC Recalls Beef and Poultry Pro...",[pasta products],[other food product / mixed],[inspection issues],[fraud],"(slice(0, 17, None))","(294,310)|(654,671)|(767,771)|(2266,2283)|(334...",en,us,,"(slice(43, 51, None), slice(62, 77, None))"


# Plots:

Histograms:

In [11]:
def _wrap_label(text, max_len=19):
    """Break ``text`` at whitespace into lines of at most ``max_len`` characters.

    Runs of whitespace are collapsed to single spaces. A word longer than ``max_len``
    is kept intact on a line of its own (no leading empty line is produced).
    """
    import textwrap

    return '\n'.join(textwrap.wrap(
        ' '.join(str(text).split()),
        width=max_len,
        break_long_words=False,
        break_on_hyphens=False,
    ))


def plot_hist(ax, values, title, n_named=3):
    """Draw a descending label-frequency histogram and shade its support zones.

    The count of a label is the number of entries of ``values`` that contain it.
    The ``n_named`` most frequent labels get an individually coloured bar with a
    legend entry; all other labels are drawn as grey bars. For more than 80 labels
    the named bars are widened by ``n/80`` so they stay visible.

    Two grey bands are drawn:
      * high support: the leading labels for which the summed count of the label
        and all rarer labels is at least 67% of the total count;
      * low support: the trailing labels for which that sum is at most 33%.

    Args:
        ax:      Axes-like object providing ``bar``, ``set_title``, ``legend``,
                 ``set_xticks`` and ``axvspan``.
        values:  Iterable of label collections, one collection per sample.
        title:   Axes title; also prefixes the printed summary line.
        n_named: Number of top labels to highlight. Clamped to the number of
                 distinct labels.

    Returns:
        ``(high_support_labels, low_support_labels)`` as two lists. Both are empty
        if ``values`` contains no label at all; the low list is empty whenever no
        label falls into the low-support zone.
    """
    from collections import Counter

    # count label occurrences (each label at most once per sample) in a single pass:
    membership = Counter(label for sample in values for label in set(sample))

    if not membership:
        # nothing to draw: keep the axes consistent and report empty zones
        ax.set_title(title)
        ax.set_xticks([])
        print(f'{title}: n_high = 0/0, n_low = 0/0')
        return [], []

    unique_labels = sorted(membership)
    labels = np.array(unique_labels)
    counts = np.array([membership[label] for label in unique_labels], dtype=int)

    # sort by counts (descending):
    idx = np.argsort(counts)[::-1]
    labels = labels[idx]
    counts = counts[idx]

    # plot:
    n = len(labels)
    x = np.arange(n, dtype=float)

    # never try to name more bars than there are labels:
    n_named = max(0, min(n_named, n))

    # rescale factor of named bars for better visibility in crowded histograms:
    f = 1. if n < 80 else n/80.

    for i in range(n_named):
        # shift everything right of this bar to make room for its extra width; after the
        # last named bar only half the extra width is needed (grey bars are not widened):
        x[i+1:] += (f - 1.) if (i+1) < n_named else (f - 1.)/2.

        ax.bar(x[i], counts[i], 0.8*f,
            label=_wrap_label(labels[i]))

    ax.bar(x[n_named:], counts[n_named:], color='grey')
    ax.set_title(title)
    if n_named > 0:
        ax.legend(prop={'size': 8}, loc='upper right')
    ax.set_xticks([])

    # plot support based class sets:
    # values_accumulated[i] = summed count of label i and all rarer labels
    values_accumulated = np.cumsum(counts[::-1])[::-1]

    high_support = np.nonzero(values_accumulated >= values_accumulated[0] * .67)[0]
    last_high = high_support[-1]
    if last_high + 1 < n:
        # band ends halfway between the last high-support bar and its right neighbour
        high_right = .5 * (x[last_high] + x[last_high + 1])
    else:
        # no right neighbour (single label): end just past the last bar
        high_right = x[last_high] + .5
    ax.axvspan(
        x[high_support[0]] - .5 * f,
        high_right,
        facecolor='grey',
        alpha=0.5,
        zorder=0
    )

    low_support = np.nonzero(values_accumulated <= values_accumulated[0] * .33)[0]
    if len(low_support) > 0:
        # index 0 always carries the full total, so low_support[0] >= 1 here
        ax.axvspan(
            .5 * (x[low_support[0] - 1] + x[low_support[0]]),
            x[low_support[-1]] + f,
            facecolor='grey',
            alpha=0.5,
            zorder=0
        )

    n_high = int(counts[high_support].sum())
    n_low = int(counts[low_support].sum())
    print(f'{title}: n_high = {n_high:d}/{len(high_support):d}, n_low = {n_low:d}/{len(low_support):d}')

    return labels[high_support].tolist(), labels[low_support].tolist()


In [12]:
fig, axs = plt.subplots(2, 2, figsize=[7, 4])

# (column, panel title) pairs in row-major panel order of the 2x2 grid:
hist_panels = [
    ('hazard_category',  '\\texttt{hazard\\_category}'),
    ('product_category', '\\texttt{product\\_category}'),
    ('hazard',           '\\texttt{hazard}'),
    ('product',          '\\texttt{product}'),
]

# draw one histogram per column and collect its (high, low) support label sets:
support_zones = {
    column: plot_hist(panel, incidents[column].values, panel_title)
    for panel, (column, panel_title) in zip(axs.flat, hist_panels)
}

# store the support zones next to the incident data:
with open('../data/incidents/support_zones.json', 'w') as file:
    json.dump(support_zones, file)

plt.tight_layout()
plt.savefig('plots/class_distribution.pdf')

\texttt{hazard\_category}: n_high = 2579/1, n_low = 2487/9
\texttt{product\_category}: n_high = 2852/3, n_low = 2297/21
\texttt{hazard}: n_high = 2654/3, n_low = 2490/392
\texttt{product}: n_high = 2538/73, n_low = 2528/1522


Language Distribution:

In [13]:
def plot(ax, values):
    """Draw one curve per distinct second-column entry of ``values``.

    ``values`` is a 2-D array whose first column supplies the x-positions and whose
    second column names the curve; each curve shows how many rows share that
    (x-position, name) pair. A legend with the curve names is added to ``ax``.
    """
    # distinct x-positions in ascending order:
    x = np.sort(np.unique(values[:, 0]))

    # one curve per distinct name:
    labels = np.unique(values[:, 1])

    # tally the rows per (curve, x-position) cell:
    counts = np.zeros((labels.shape[0], x.shape[0]), dtype=int)
    for x_value, name in zip(values[:, 0], values[:, 1]):
        counts[labels == name, x == x_value] += 1

    for row, name in enumerate(labels):
        ax.plot(x, counts[row], label=name)

    ax.legend()

In [14]:
# one curve per language showing the number of incidents per year, stored as PDF:
fig, ax = plt.subplots(1, 1, figsize=[4, 2])
plot(ax, incidents[['year','language']].values)
plt.savefig('plots/language_per_year.pdf')