In [1]:
import _base_path
import json
import numpy as np
import pandas as pd
from resources.spans import SpanCollection

Setting base bath to "c:\Users\Korbi\Desktop\CICLe"


  from tqdm.autonotebook import tqdm, trange


In [2]:
# name of the dataset; used below as the data directory and as prefix of the csv file name:
DATA   = 'incidents'
# name of the label column; also the key that is read from support_zones.json:
LABEL  = 'hazard-category'

# Load data:

In [3]:
# load data:
incidents = pd.read_csv(f"{DATA}/{DATA}_final.csv", index_col=0)

# NOTE: missing values are replaced by assigning the filled column back to the
# frame. `incidents[col].fillna(..., inplace=True)` is an in-place call on an
# intermediate Series (chained assignment): pandas 2.x warns about it and with
# Copy-on-Write enabled it no longer updates `incidents`.

# parse products:
incidents['product']          = incidents['product'].fillna('')
incidents['product-category'] = incidents['product-category'].fillna('')
incidents['product-title']    = [SpanCollection.parse(p) for p in incidents['product-title'].fillna('')]
incidents['product-text']     = [SpanCollection.parse(p) for p in incidents['product-text'].fillna('')]

# parse hazards:
incidents['hazard']           = incidents['hazard'].fillna('')
incidents['hazard-category']  = incidents['hazard-category'].fillna('')
incidents['hazard-title']     = [SpanCollection.parse(h) for h in incidents['hazard-title'].fillna('')]
incidents['hazard-text']      = [SpanCollection.parse(h) for h in incidents['hazard-text'].fillna('')]

# parse suppliers:
incidents['supplier-title']   = [SpanCollection.parse(s) for s in incidents['supplier-title'].fillna('')]
incidents['supplier-text']    = [SpanCollection.parse(s) for s in incidents['supplier-text'].fillna('')]

# fill nan-values:
incidents['country']          = incidents['country'].fillna('na')

def print_column(column:str, n:int=10):
    """Print the most frequent values of a column of `incidents`.

    Args:
        column: Name of the column of the global `incidents` frame.
        n:      Maximum number of values to print (most frequent first).

    The count of a value is the number of rows for which `value in entry`
    holds, i.e. membership for collection-valued entries and substring
    containment for string entries.
    """
    # collection-valued columns are flattened first; if numpy refuses to
    # concatenate the entries (ValueError) the raw entries are used as values:
    try:
        values = np.unique(np.concatenate(incidents[column].values))
    except ValueError:
        values = np.unique(incidents[column].values)

    counts = np.array([sum([v in label for label in incidents[column].values]) for v in values])

    # order by descending count:
    idx = np.argsort(counts)[::-1]
    values = values[idx]
    counts = counts[idx]

    print(f'Column "{column}" (n = {len(values):d}):\n')

    # reuse the counts computed above (the loop variable no longer shadows `n`):
    for value, count in zip(values[:n], counts[:n]):
        print(f'  {value}:{" "*(50-len(value))}{count:5d}')

# show, for every column, how many distinct entries it holds:
for c in incidents.columns:
    print('  ' + f'{c}:'.ljust(21) + f'{len(incidents[c].drop_duplicates()):5d}')

  year:                   29
  month:                  12
  day:                    31
  url:                  7500
  title:                7331
  text:                 7548
  product:              1258
  product-raw:          1880
  product-category:       23
  product-title:        7548
  product-text:         7548
  hazard:                262
  hazard-raw:            408
  hazard-category:        11
  hazard-title:         7548
  hazard-text:          7548
  supplier-title:       7548
  supplier-text:        7548
  language:                6
  country:                15


# Create table:

## Extract columns:

In [4]:
# one boolean row mask per distinct value of the label column:
labels = dict(
    (l, incidents[LABEL].values == l)
    for l in incidents[LABEL].unique()
)

labels

{'allergens': array([ True, False, False, ..., False, False,  True]),
 'biological': array([False,  True, False, ..., False, False, False]),
 'chemical': array([False, False,  True, ..., False, False, False]),
 'fraud': array([False, False, False, ...,  True,  True, False]),
 'other hazard': array([False, False, False, ..., False, False, False]),
 'foreign bodies': array([False, False, False, ..., False, False, False]),
 'packaging defect': array([False, False, False, ..., False, False, False]),
 'organoleptic aspects': array([False, False, False, ..., False, False, False]),
 'food additives and flavourings': array([False, False, False, ..., False, False, False]),
 'migration': array([False, False, False, ..., False, False, False]),
 'food contact materials': array([False, False, False, ..., False, False, False])}

In [5]:
# read the groups of classes ("support zones") stored for the current label:
with open(f'{DATA}/support_zones.json', 'r') as file:
    support_zones = json.loads(file.read())[LABEL]

support_zones

[['biological'],
 ['foreign bodies',
  'chemical',
  'fraud',
  'other hazard',
  'packaging defect',
  'organoleptic aspects',
  'food additives and flavourings',
  'migration',
  'food contact materials']]

Most and least frequent class:

In [6]:
# (label, mask) of the classes with the most / fewest samples;
# on ties the label that is encountered first is kept:
most_frequent_class = ('', None)
least_frequent_class = ('', None)

for l in labels:
    # count the set entries in numpy instead of iterating the mask with the builtin sum():
    n = np.count_nonzero(labels[l])

    if most_frequent_class[1] is None or n > np.count_nonzero(most_frequent_class[1]):
        most_frequent_class = (l, labels[l])

    if least_frequent_class[1] is None or n < np.count_nonzero(least_frequent_class[1]):
        least_frequent_class = (l, labels[l])

print(most_frequent_class)
print(least_frequent_class)

('biological', array([False,  True, False, ..., False, False, False]))
('food contact materials', array([False, False, False, ..., False, False, False]))


In [7]:
# rows whose class belongs to the first / second support zone:
high_support = (
    '$C_{high}$',
    np.bitwise_or.reduce(np.array([labels[l] for l in support_zones[0]]), axis=0)
)
low_support = (
    '$C_{low}$',
    np.bitwise_or.reduce(np.array([labels[l] for l in support_zones[1]]), axis=0)
)

# rows that are in neither of the two zones:
mid_support = ('$C_{medium}$', ~high_support[1] & ~low_support[1])

print(high_support)
print(mid_support)
print(low_support)

('$C_{high}$', array([False,  True, False, ..., False, False, False]))
('$C_{medium}$', array([ True, False, False, ..., False, False,  True]))
('$C_{low}$', array([False, False,  True, ...,  True,  True, False]))


## LaTeX conversion functions:

In [8]:
def to_cell(mask, hl=False):
    """Render one table entry (two LaTeX columns) for the rows selected by `mask`.

    Args:
        mask: Boolean mask over the rows of the global `incidents` frame.
        hl:   If True, both columns are shaded with a light gray cell color.

    Returns:
        A string of two column separators, each followed by its content:
        the number of selected rows and, in parentheses, the mean length of
        their titles. Both columns are left empty if no row is selected.
    """
    sep = ' & '
    # escaped backslash: '\c' is not a valid escape sequence in a plain string literal
    if hl: sep += '\\cellcolor{gray!10}'

    count = int(np.count_nonzero(mask))

    # return early for empty selections, so that no mean of an empty list is
    # computed (numpy emits a RuntimeWarning and yields nan in that case):
    if count == 0:
        return f'{sep}{sep}'

    size = np.mean([len(s) for s in incidents['title'][mask].values])
    return f'{sep}${count:d}${sep}$({size:.1f})$'

def to_row(title, mask, hl=False):
    """Render one table row for the rows of `incidents` selected by `mask`.

    Args:
        title: LaTeX text of the row header cell.
        mask:  Boolean mask over the rows of the global `incidents` frame.
        hl:    Passed on to `to_cell` for all entries except the total,
               which is always highlighted.

    Returns:
        The row as LaTeX source, terminated by a double backslash.
    """
    # escaped backslash: '\c' is not a valid escape sequence in a plain string literal
    row_ltx = '\\cellcolor{gray!25} ' + title

    # most and least frequent class first, then the three support groups:
    column_masks = (
        most_frequent_class[1],
        least_frequent_class[1],
        high_support[1],
        mid_support[1],
        low_support[1],
    )
    for column_mask in column_masks:
        row_ltx += to_cell(mask & column_mask, hl=hl)

    # add total:
    row_ltx += to_cell(mask, hl=True)

    return row_ltx + '\\\\'

def to_block(title, values):
    """Render a group of table rows that share one rotated title cell.

    Args:
        title:  Text of the rotated, bold title spanning all rows of the block.
        values: Iterable of (row name, row mask) pairs, one per row.

    Returns:
        The rows joined by newlines, followed by an empty line and a hline.
    """
    rendered = [to_row(name, mask) for name, mask in values]

    # the multirow title cell is attached to the last row (negative row count) ...
    title_cell = (
        '\\cellcolor{gray!25} \\multirow{-' + str(len(rendered)) +
        '}*{\\rotatebox{90}{\\textbf{' + title + '}}} & '
    )

    # ... all rows before it only get a shaded, empty first cell:
    lines = ['\\cellcolor{gray!25} & ' + row for row in rendered[:-1]]
    lines.append(title_cell + rendered[-1])

    return '\n'.join(lines) + '\n\n\\hline\n'

## Add rows:

In [9]:
# LaTeX fragments of the table body; the cells below append to it and the final cell joins it:
table = []

### Time:

In [10]:
# one ('first - last', row mask) pair per group of years:
time = [
    (f'{years[0]:d} - {years[-1]:d}', np.isin(incidents.year.values, years))
    for years in [
        [1994, 1995, 1996, 1997, 1998],
        [1999, 2000, 2001, 2002],
        [2003, 2004, 2005, 2006],
        [2007, 2008, 2009, 2010],
        [2011, 2012, 2013, 2014],
        [2015, 2016, 2017, 2018],
        [2019, 2020, 2021, 2022]
    ]
]

# order the groups by their title:
time.sort(key=lambda entry: entry[0])

time

[('1994 - 1998', array([False, False, False, ..., False, False, False])),
 ('1999 - 2002', array([False, False, False, ..., False, False, False])),
 ('2003 - 2006', array([False, False, False, ..., False, False, False])),
 ('2007 - 2010', array([False, False, False, ..., False, False, False])),
 ('2011 - 2014', array([False, False, False, ..., False, False, False])),
 ('2015 - 2018', array([ True, False, False, ..., False, False, False])),
 ('2019 - 2022', array([False,  True,  True, ...,  True,  True,  True]))]

In [11]:
# add the block with one row per group of years:
table.append(to_block('By Year', time))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


### Languages:

In [12]:
# one (upper-cased language code, row mask) pair per language, ordered by code:
language = sorted(
    ((l.upper(), incidents.language.values == l) for l in incidents.language.unique()),
    key=lambda entry: entry[0]
)

language

[('DE', array([False, False, False, ..., False, False, False])),
 ('DK', array([False, False, False, ..., False, False, False])),
 ('EN', array([ True,  True,  True, ...,  True,  True,  True])),
 ('FR', array([False, False, False, ..., False, False, False])),
 ('GR', array([False, False, False, ..., False, False, False])),
 ('IT', array([False, False, False, ..., False, False, False]))]

In [13]:
# add the block with one row per language:
table.append(to_block('By Language', language))

### Total:

In [14]:
# add the summary row over all incidents (all-true mask); the extra closing brace
# at the end of the title closes the \multicolumn{2}{|c||}{ that is opened in front of the row:
table.append('\\multicolumn{2}{|c||}{' + to_row('\\textbf{Total}}', np.ones(len(incidents), dtype=bool), hl=True))

In [15]:
# print the header line (label name, most and least frequent class) followed by
# all collected blocks; backslashes are escaped, since '\m' is not a valid
# escape sequence in a plain string literal:
print(
    '\\multicolumn{2}{l}{\\texttt{' + LABEL + '}} &\n' +
    '\\multicolumn{2}{c}{\\tiny{' + most_frequent_class[0] + '}} &\n' +
    '\\multicolumn{2}{c}{\\tiny{' + least_frequent_class[0] + '}} &\n' +
    '\\multicolumn{8}{c}{} \\\\\n' +
    '\n\\hline\n\n' +
    '\n'.join(table)
)

\multicolumn{2}{l}{\texttt{hazard-category}} &
\multicolumn{2}{c}{\tiny{biological}} &
\multicolumn{2}{c}{\tiny{food contact materials}} &
\multicolumn{8}{c}{} \\

\hline

\cellcolor{gray!25} & \cellcolor{gray!25} 1994 - 1998 & $34$ & $(33.6)$ &  &  & $34$ & $(33.6)$ & $4$ & $(71.0)$ & $20$ & $(34.6)$ & \cellcolor{gray!10}$58$ & \cellcolor{gray!10}$(36.5)$\\
\cellcolor{gray!25} & \cellcolor{gray!25} 1999 - 2002 & $50$ & $(50.8)$ &  &  & $50$ & $(50.8)$ & $21$ & $(52.9)$ & $59$ & $(46.0)$ & \cellcolor{gray!10}$130$ & \cellcolor{gray!10}$(49.0)$\\
\cellcolor{gray!25} & \cellcolor{gray!25} 2003 - 2006 & $53$ & $(52.8)$ &  &  & $53$ & $(52.8)$ & $61$ & $(60.5)$ & $78$ & $(58.1)$ & \cellcolor{gray!10}$192$ & \cellcolor{gray!10}$(57.4)$\\
\cellcolor{gray!25} & \cellcolor{gray!25} 2007 - 2010 & $158$ & $(95.3)$ &  &  & $158$ & $(95.3)$ & $49$ & $(81.0)$ & $112$ & $(72.5)$ & \cellcolor{gray!10}$319$ & \cellcolor{gray!10}$(85.1)$\\
\cellcolor{gray!25} & \cellcolor{gray!25} 2011 - 2014 & $308$ &