In [14]:
import numpy as np

from utils import init_data, init_data_suggest, dict_addition, nice_dict, train_validation_split, unscale, scale_permute_data

from collections import Counter


In [15]:
def sliding_window(input_str, width):
    """
    Returns a list with a sliding window
    over the string with given width
    """
    assert len(input_str) >= width, 'Cannot slide with width larger than the string!'
    return [input_str[i:i + width] for i in range(len(input_str) - width + 1)]


def join_sliding_window(input, width):
    """
    Joins a list of strings (by applying a sliding window)
    into a list of contiguously joined strings
    """
    return [''.join(ngram) for ngram 
            in sliding_window(input, width)]

In [16]:
kwargs_data_process = nice_dict({'mk_chars': True, 
                                 'char_filter': 100, 'allowed_chars': None, 
                                 'mk_ngrams': True, 'ngram_width': 5, 
                                 'ngram_filter': 10, 'allowed_ngrams': None, 
                                 'keep_infreq_labels': False, 'label_count_thresh': 10, 
                                 'valid_ratio': 0.25, 
                                 'scale_func': unscale, 'to_permute': True, })

In [17]:
# initialize data from main (original) CSV file
x, y, n, main_data = init_data()
freq = [i for i in main_data['CNT'][:n]]  # frequencies, turned into a list
# initialize data from suggestions CSV file
x_suggest, y_suggest, freq_suggest = init_data_suggest()

In [18]:
# global counter: characters
if kwargs_data_process.mk_chars:
    char_counter = dict_addition([Counter(obs) for obs in x])
    allowed_chars = [key for key,value in char_counter.items() 
                     if value >= kwargs_data_process.char_filter]
    allowed_chars.sort()

    # replacing unknown characters with UNKNOWN symbol
    unknown_char = '<unk-char>'
    x_unk = [list(obs) for obs in x]
    x_unk = [[char if char in allowed_chars 
              else unknown_char for char in obs]
             for obs in x_unk]
    x_suggest_unk = [list(obs) for obs in x_suggest]  # same for x_suggest
    x_suggest_unk = [[char if char in allowed_chars 
                      else unknown_char for char in obs]
                     for obs in x_suggest_unk]
else:
    allowed_chars = list({char for obs in x for char in obs})
    allowed_chars.sort()
    x_unk = x
    x_suggest_unk = x_suggest

In [19]:
# global counter: ngrams
# note: AFTER applying 'unknown' to characters
if kwargs_data_process.mk_ngrams:
    ngram_counter = dict_addition(
        [Counter(join_sliding_window(obs, kwargs_data_process.ngram_width))
         for obs in x_unk])
    allowed_ngrams = [key for key,value in ngram_counter.items() 
                     if value >= kwargs_data_process.ngram_filter]
    allowed_ngrams.sort()  # len(.) 1925
else:
    ngrams_list = [join_sliding_window(obs, kwargs_data_process.ngram_width) 
                   for obs in x_unk]
    allowed_ngrams = list({ngram for obs in ngrams_list for ngram in obs})
    allowed_ngrams.sort()  # len(.) 17495

In [20]:
# merging
x_merge, y_merge, freq_merge = \
    x_unk + x_suggest_unk, \
    y + y_suggest, \
    freq + freq_suggest

In [21]:
# discard infrequent labels
# train-validation split
x_val, x_train, y_val, y_train, freq_val, freq_train, valid_index, statistics_dict = \
    train_validation_split(x=x_merge, y=y_merge, freq=freq_merge, 
                           label_count_thresh=kwargs_data_process.label_count_thresh, 
                           valid_ratio=kwargs_data_process.valid_ratio, 
                           keep_rare_labels=kwargs_data_process.keep_infreq_labels)

The are 2919 observations
Sampling from allowed 82 labels
82 labels in the validation set, with
1587 potential observation to draw from.
365 observations sampled for validation
1222 observations for training
The ratio of validation to *training* is about 0.299


In [47]:
# apply ngrams if needed
    unknown_ngram = '<unk-ngram>'
    [obs + [ngram 
            if ngram in allowed_ngrams
            else unknown_ngram
            for ngram in join_sliding_window(obs, 
                kwargs_data_process.ngram_width)]
     for obs in x_train]


[['N',
  'a',
  'C',
  'l',
  ' ',
  '0',
  '.',
  '<unk-char>',
  '%',
  'NaCl ',
  'aCl 0',
  'Cl 0.',
  'l 0.<unk-char>',
  ' 0.<unk-char>%'],
 ['D',
  'a',
  'f',
  'a',
  'l',
  'g',
  'a',
  'n',
  ' ',
  '(',
  'F',
  'i',
  'l',
  'm',
  't',
  'a',
  'b',
  'l',
  ' ',
  '1',
  ' ',
  'g',
  ')',
  ' ',
  '/',
  ' ',
  'P',
  'a',
  'r',
  'a',
  'c',
  'e',
  't',
  'a',
  'm',
  'o',
  'l',
  ' ',
  '1',
  '0',
  '0',
  '0',
  'm',
  'g',
  'Dafal',
  'afalg',
  'falga',
  'algan',
  'lgan ',
  'gan (',
  'an (F',
  'n (Fi',
  ' (Fil',
  '(Film',
  'Filmt',
  'ilmta',
  'lmtab',
  'mtabl',
  'tabl ',
  'abl 1',
  'bl 1 ',
  '<unk-ngram>',
  ' 1 g)',
  '1 g) ',
  '<unk-ngram>',
  'g) / ',
  ') / P',
  ' / Pa',
  '/ Par',
  ' Para',
  'Parac',
  'arace',
  'racet',
  'aceta',
  'cetam',
  'etamo',
  'tamol',
  'amol ',
  '<unk-ngram>',
  'ol 10',
  'l 100',
  ' 1000',
  '1000m',
  '000mg'],
 ['P',
  'e',
  'r',
  'f',
  'a',
  'l',
  'g',
  'a',
  'n',
  ' ',
  '(',
  'I',
  '

In [None]:
# here can output to linear classifier
# apply ngrams if needed
# output:
# x_val, x_train, y_val, y_train

In [None]:
# continue for NN model
# scale permute
# padding (text_filter_pad) ?
# lookup_dicts_chars_labels
# index_transorm_xy
# output:
# X_train, Y_train, X_val, Y_val

In [None]:
# scale data (proportional to frequency)
# training data
x_train_scaled, y_train_scaled, _ = \
    scale_permute_data(x=x_train, 
                       y=y_train, 
                       freq=freq_train, 
                       scale_func=kwargs_data_process.scale_func, 
                       to_permute=kwargs_data_process.to_permute)

# validation data
# x_val_scaled, y_val_scaled, _ = \
#     scale_permute_data(x=x_val, 
#                        y=y_val, 
#                        freq=freq_val, 
#                        scale_func=kwargs_data_process.scale_func, 
#                        to_permute=kwargs_data_process.to_permute)

In [None]:
# discard infrequent labels
if not kwargs_data_process.keep_rare_labels:
    # count each label occurence, filter out those less frequent than label_count_thresh
    label_freq_dict = Counter(y_merge)
    label_freq_dict = {label:count for 
                       label,count in label_freq_dict.items() 
                       if count >= kwargs_data_process.label_count_thresh}
    # create a dict with a list of label:list(index), for filtered labels
    y_enum = [(i,label) for (i,label) in enumerate(y) if label in label_freq_dict.keys()]
    label_index_dict = {label:[] for label in label_freq_dict.keys()}
    for (i,label) in y_enum:
        label_index_dict[label].append(i)    


    
    
    
# kwargs_data_process.label_count_thresh

In [None]:
# train-validation split




In [None]:
# scale / unscale (for LSTM)




In [None]:
# LSTM:
# padding
# create look-up dictionaries (and inverse) for an index representation
# index_transorm_xy