In [None]:
# import tensorflow as tf
import numpy as np
import pandas as pd

from utils import utils

from collections import Counter
# from math import isnan

import matplotlib.pyplot as plt

import re

In [None]:
# import os
# print (os.name)
# 'posix'

In [None]:
# The tensorflow import is commented out in the imports cell, so referencing
# `tf` unconditionally raises NameError on a fresh kernel. Report its version
# only when a `tf` module is actually present in the namespace.
tf_module = globals().get('tf')
if tf_module is not None:
    print('tensorflow version is {}'.format(tf_module.__version__))
else:
    print('tensorflow is not imported')
print('numpy version is {}'.format(np.__version__))
print('pd version is {}'.format(pd.__version__))

In [None]:
# Options come from the project helper; the two '*_path' entries are passed
# straight to pd.read_csv below.
user_opt = utils.user_opt_gen()

# Both CSV files are parsed with identical settings:
# ';'-separated, first row is the header, cp850-encoded.
csv_read_kwargs = dict(sep=';', header=0, encoding='cp850')

main_data = pd.read_csv(user_opt['data_path'], **csv_read_kwargs)
atc_conversion_data = pd.read_csv(user_opt['atc_conversion_data_path'], **csv_read_kwargs)

In [None]:
# Preview the first 15 rows of the main dataset.
main_data.head(15)

In [None]:
# sum(main_data['FREETXT'] == main_data['INGR']) / float(main_data.shape[0])
# sum(main_data['FREETXT'] == main_data['BestOutput']) / float(main_data.shape[0])

In [None]:
# Tally how often each value of the ATC column occurs.
atc_counts_all = Counter(main_data['ATC'])

# Keep only string keys; this filters out 'nan' entries, which are not str.
atc_freq = {
    code: count
    for code, count in atc_counts_all.items()
    if isinstance(code, str)
}
# atc_freq

In [None]:
# Summary statistics of the ATC label counts.
n_observations = sum(atc_freq.values())
n_labels = len(atc_freq)
mean_occurrences = np.mean(list(map(float, atc_freq.values())))

# Three separate print arguments: print() joins them with a single space,
# so each line break is preceded by one space in the output.
summary_parts = (
    'Number of observations: {}.'.format(n_observations),
    '\nNumber of ATC labels: {}.'.format(n_labels),
    '\nMean number of occurrences per ATC code: {:.3}.'.format(mean_occurrences),
)
print(*summary_parts)

In [None]:
# Distribution of label frequencies: for every count k, how many ATC labels
# occur exactly k times.
atc_label_freq = list(atc_freq.values())

# np.histogram(..., bins=<int>) cuts [min, max] into that many equal-width
# bins, whose width is (max - min) / bins rather than 1, so the bars end up
# shifted away from the integer counts they represent. Use explicit edges at
# k - 0.5 instead: one unit-wide bin centred on every integer count.
bins = np.arange(np.min(atc_label_freq), np.max(atc_label_freq) + 2) - 0.5
hist, bins = np.histogram(atc_label_freq, bins=bins)
width = bins[1] - bins[0]
center = (bins[:-1] + bins[1:]) / 2
plt.bar(center, hist, align='center', width=width)
plt.xlabel('Number of label appearance')
plt.ylabel('Frequency')
plt.title('Histogram of all label appearance')
plt.show()

# fig, ax = plt.subplots()
# ax.bar(center, hist, align='center', width=width)
# fig.savefig("1.png")

In [None]:
# Same histogram, restricted to labels that occur 10 times or less.
atc_label_freq_filtered = [v for v in atc_freq.values() if v <= 10]

# np.histogram(..., bins=<int>) cuts [min, max] into that many equal-width
# bins, whose width is (max - min) / bins rather than 1, so the bars end up
# shifted away from the integer counts they represent. Use explicit edges at
# k - 0.5 instead: one unit-wide bin centred on every integer count.
bins = np.arange(np.min(atc_label_freq_filtered),
                 np.max(atc_label_freq_filtered) + 2) - 0.5
hist, bins = np.histogram(atc_label_freq_filtered, bins=bins)
width = bins[1] - bins[0]
center = (bins[:-1] + bins[1:]) / 2
plt.bar(center, hist, align='center', width=width)
plt.xlabel('Number of label appearance')
plt.ylabel('Frequency')
plt.title('Histogram of label appearance, appearing 10 times or less')
plt.show()

# How many labels occur exactly once, at least 5 times, and at least 10 times
# (computed over atc_label_freq, i.e. all label counts).
appearing_once_labels = sum(1 for freq in atc_label_freq if freq == 1)
appearing_5more_labels = sum(1 for freq in atc_label_freq if freq >= 5)
appearing_10more_labels = sum(1 for freq in atc_label_freq if freq >= 10)

once_message = 'There are {} labels that appear only once.'.format(appearing_once_labels)
remaining_message = (
    '\nThis leaves {} labels, {} of which appear 5 times or more and {} appearing at least 10 times'
    .format(len(atc_label_freq) - appearing_once_labels,
            appearing_5more_labels,
            appearing_10more_labels)
)
# Two print arguments, joined by print() with a single space.
print(once_message, remaining_message)

In [None]:
# Same histogram, restricted to labels that occur at least 10 times.
atc_label_freq_high_filtered = [v for v in atc_freq.values() if v >= 10]

# np.histogram(..., bins=<int>) cuts [min, max] into that many equal-width
# bins, whose width is (max - min) / bins rather than 1, so the bars end up
# shifted away from the integer counts they represent. Use explicit edges at
# k - 0.5 instead: one unit-wide bin centred on every integer count.
bins = np.arange(np.min(atc_label_freq_high_filtered),
                 np.max(atc_label_freq_high_filtered) + 2) - 0.5
hist, bins = np.histogram(atc_label_freq_high_filtered, bins=bins)
width = bins[1] - bins[0]
center = (bins[:-1] + bins[1:]) / 2
plt.bar(center, hist, align='center', width=width)
plt.xlabel('Number of label appearance')
plt.ylabel('Frequency')
plt.title('Histogram of label appearance, appearing at least 10 times')
plt.show()

In [None]:
# Preview the first 15 rows of the ATC conversion table.
atc_conversion_data.head(15)

In [None]:
# Build a dict mapping the first space-separated word of each INGR entry to
# the ATC code of that row. A first word that occurs on more than one row is
# ambiguous and is dropped from the dict entirely.
atc_conversion_ingr_atc = {}
seen_keys = []          # ambiguous keys, in the order their repeat was found
ambiguous_keys = set()  # same content as seen_keys, for O(1) membership tests

# Iterate the two columns positionally rather than via label lookups with
# range(len(...)), which would only be correct for a default RangeIndex.
for ingr, atc in zip(atc_conversion_data['INGR'], atc_conversion_data['ATC']):
    key = ingr.split(' ')[0]
    # already known to be ambiguous: skip
    if key in ambiguous_keys:
        continue
    # second occurrence: remove the earlier entry and remember the key
    if key in atc_conversion_ingr_atc:
        del atc_conversion_ingr_atc[key]
        seen_keys.append(key)
        ambiguous_keys.add(key)
        continue
    # first occurrence: add
    atc_conversion_ingr_atc[key] = atc

In [None]:
# Strip separator characters such as '(' and ')' from the keys: each key is
# split on ; , ( ) and replaced by its first non-empty piece.
key_separators = re.compile(r"[;,()]")

cleaned_ingr_atc = {}
for raw_key, atc_code in atc_conversion_ingr_atc.items():
    key_parts = [part for part in key_separators.split(raw_key) if part]
    if not key_parts:
        # The key consisted only of separator characters; there is nothing
        # usable to look up, and indexing [0] would raise IndexError.
        continue
    # NOTE(review): two raw keys can clean to the same text; as before, the
    # later one silently overwrites the earlier one.
    cleaned_ingr_atc[key_parts[0]] = atc_code
atc_conversion_ingr_atc = cleaned_ingr_atc

# can verify:
# for key in atc_conversion_ingr_atc.keys():
#     print(key)
# or both key and ATC code
# for key in atc_conversion_ingr_atc.keys():
#     print('{}: {}'.format(key, atc_conversion_ingr_atc[key]))

In [None]:
# Trying to find out if we can expand the dataset in a relatively cheap way:
# can any of the FREETXT entries with CNT < 100 (currently unlabelled) be found
# in the ATC conversion table? (not accurate, only indicative)
freetxt_separators = re.compile(r"[;,()]")  # hoisted: loop-invariant

supplementary_candidates = {}
for txt in main_data['FREETXT'][main_data['CNT'] < 100]:
    freetxt_elements = [part for part in freetxt_separators.split(txt) if part]
    # Take the first element, in text order, that is a key of the conversion
    # dict. Picking list(set)[0] from a set intersection instead would depend
    # on set iteration order, which for strings changes between interpreter
    # runs (hash randomisation), making the result non-reproducible.
    matched_key = next(
        (part for part in freetxt_elements if part in atc_conversion_ingr_atc),
        None,
    )
    if matched_key is not None:
        supplementary_candidates[txt] = atc_conversion_ingr_atc[matched_key]

# this list should be verified manually
# possible to generate an automatic URL
# to make verification a bit easier
# (replace ATCCODE)
# URL=https://www.whocc.no/atc_ddd_index/?code=ATCCODE&showdescription=no
# consult with Carsten and Patrick, a physician should manually verify
# if so, export (to CSV?), clean some obvious mistakes and generate URLs
# 
# supplementary_candidates

In [None]:
# Create a character dictionary: every distinct character found in the
# FREETXT entries with CNT >= 100 gets the next free integer id, in order of
# first appearance.
char_dict = {}
for txt in main_data['FREETXT'][main_data['CNT'] >= 100]:
    for char in txt:
        # setdefault only inserts when the character is new; len(char_dict)
        # is evaluated before the insertion, so ids run 0, 1, 2, ...
        char_dict.setdefault(char, len(char_dict))

In [None]:
# Number of distinct characters collected in char_dict.
len(char_dict)