In [None]:
import numpy as np
import pandas as pd

from utils.utils import *
from utils.utils_baseline_svm import *

import csv

# from collections import Counter

from math import inf
from time import time
# from utils.utils_nn import *

In [None]:
# Feature-extraction options, read below via attribute access
# (e.g. `kwargs_lin_clf.ngram_width`).
kwargs_lin_clf = nice_dict(dict(
    # character features: build them, and the minimum number of occurrences
    # in the labeled texts for a character to be kept
    mk_chars=True,
    char_filter=100,
    allowed_chars=None,   # not read in this notebook's visible cells
    # n-gram features: build them, the sliding-window width, and the minimum
    # number of occurrences for an n-gram to be kept
    mk_ngrams=True,
    ngram_width=5,
    ngram_filter=10,
    allowed_ngrams=None,  # not read in this notebook's visible cells
))

In [None]:
def jaccard_sim(d1, d2):
    """
    Weighted (multiset) Jaccard similarity between two count mappings.

    Computes sum_k min(d1[k], d2[k]) / sum_k max(d1[k], d2[k]), where a key
    missing from one mapping counts as zero.

    Parameters:
        d1, d2: mappings from a hashable feature to a numeric count
                (e.g. collections.Counter objects).

    Returns:
        The similarity as a float. Returns 0.0 when the denominator is zero
        (e.g. both mappings are empty) instead of raising ZeroDivisionError.
    """
    # Merge with unpacking rather than dict(d1, **d2): the keyword form only
    # accepts string keys and raises TypeError for e.g. tuple n-grams.
    union_keys = {**d1, **d2}
    union_total = sum(max(d1.get(k, 0), d2.get(k, 0)) for k in union_keys)
    if union_total == 0:
        # nothing to compare: treat as "no similarity"
        return 0.0
    # only keys present in both mappings contribute to the intersection
    intersect_total = sum(min(count, d2[k]) for k, count in d1.items() if k in d2)
    return float(intersect_total / union_total)

In [None]:
def labeld_unlabeled_similarities(*, sim_thereshold, 
                                  x_unlabeled, X_unlabeled, X_labeled, y):
    """
    Compare every unlabeled observation against every labeled observation
    using Jaccard similarity and suggest the best-matching label.

    Parameters (keyword-only):
        sim_thereshold: minimum similarity for a (label, sim) pair to be kept.
        x_unlabeled: iterable of unlabeled texts, used as dictionary keys.
        X_unlabeled: feature mappings of the unlabeled texts, aligned with
                     x_unlabeled.
        X_labeled: feature mappings of the labeled observations.
        y: labels, indexed by position so that y[i] belongs to X_labeled[i].

    Returns:
        (sim_result_list, sim_dict_filtered, sim_dict), where
        sim_result_list is a list of [text, label, sim] rows holding, for
            each text in sim_dict_filtered, the single pair with the highest
            similarity (the first such pair on ties);
        sim_dict_filtered is a dict from unlabeled text to the (label, sim)
            tuples with sim >= sim_thereshold; texts with no such tuple are
            omitted;
        sim_dict is a dict from unlabeled text to (label, Jaccard sim)
            tuples for all labeled observations.

    Note: texts that occur more than once in x_unlabeled share one dictionary
    key, so only the last occurrence is kept.
    """
    sim_dict = {}
    timenow = time()
    for text, features in zip(x_unlabeled, X_unlabeled):
        sim_dict[text] = [(y[i], jaccard_sim(obs, features))
                          for i, obs in enumerate(X_labeled)]

    print('Calculating all Jaccard similarities took {:.2f} seconds'.format(time() - timenow))

    # filtering out entries with similarity below threshold
    timenow = time()
    sim_dict_filtered = {}
    for text, pairs in sim_dict.items():
        kept = [(label, sim) for (label, sim) in pairs if sim >= sim_thereshold]
        # drop texts for which no labeled observation meets the criteria
        if kept:
            sim_dict_filtered[text] = kept
    print('Removing entries with Jaccard sim lower than the threshold took {:.2f} seconds'.
          format(time() - timenow))

    # return only the top result: exactly one row per text
    sim_result_list = []
    for text, pairs in sim_dict_filtered.items():
        # max() returns the first maximal pair, so ties keep the earliest
        # labeled observation
        top_label, top_sim = max(pairs, key=lambda pair: pair[1])
        sim_result_list.append([text, top_label, top_sim])

    return sim_result_list, sim_dict_filtered, sim_dict

In [None]:
# Load the data; the 'FREETXT' rows from position n onward are used as the
# unlabeled texts.
x, y, n, main_data = init_data()
x_unlabeled = main_data['FREETXT'][n:]
# keep only text fields at least as long as the sliding window width
x_unlabeled = x_unlabeled.loc[
    [kwargs_lin_clf.ngram_width <= len(text) for text in x_unlabeled]
]

### global counter: characters ###
# Build the character vocabulary from the labeled texts and turn both the
# labeled and the unlabeled texts into lists of character tokens.
if kwargs_lin_clf.mk_chars:
    # per-text character counts combined by dict_addition
    # (presumably an element-wise sum - confirm in utils)
    char_counter = dict_addition([Counter(obs) for obs in x])
    # keep characters whose combined count reaches the filter threshold
    allowed_chars = sorted(char for char, count in char_counter.items()
                           if count >= kwargs_lin_clf.char_filter)
    # set copy so the per-character membership tests below are O(1)
    allowed_chars_set = set(allowed_chars)

    # replacing unknown characters with UNKNOWN symbol
    unknown_char = '<unk-char>'
    # for labeled x
    x_unk = [[char if char in allowed_chars_set else unknown_char
              for char in obs]
             for obs in x]
    # for unlabeled x
    x_unk_unlabeled = [[char if char in allowed_chars_set else unknown_char
                        for char in obs]
                       for obs in x_unlabeled]
else:
    # no filtering: every character seen in the labeled texts is allowed
    allowed_chars = sorted({char for obs in x for char in obs})
    # Still produce lists of characters (as the branch above does) so that
    # the n-gram step can append tokens with `obs + [...]`; raw strings
    # would raise TypeError there.
    x_unk = [list(obs) for obs in x]
    x_unk_unlabeled = [list(obs) for obs in x_unlabeled]

### global counter: ngrams ###
# Build the n-gram vocabulary from the labeled token lists and append each
# observation's n-grams to its tokens.
if kwargs_lin_clf.mk_ngrams:
    ngram_counter = dict_addition(
        [Counter(join_sliding_window(obs, kwargs_lin_clf.ngram_width))
         for obs in x_unk])
    # keep n-grams whose combined count reaches the filter threshold
    allowed_ngrams = sorted(ngram for ngram, count in ngram_counter.items()
                            if count >= kwargs_lin_clf.ngram_filter)
    # set copy: membership is tested once per n-gram of every observation,
    # which is quadratic against a list
    allowed_ngrams_set = set(allowed_ngrams)

    ### apply ngrams ###
    unknown_ngram = '<unk-ngram>'

    def _with_ngrams(obs):
        """Return obs's tokens followed by its n-grams, with n-grams outside
        the vocabulary replaced by the unknown-ngram symbol."""
        # list(obs) keeps the concatenation valid even if obs is a string
        return list(obs) + [ngram if ngram in allowed_ngrams_set
                            else unknown_ngram
                            for ngram in join_sliding_window(
                                obs, kwargs_lin_clf.ngram_width)]

    x_unk = [_with_ngrams(obs) for obs in x_unk]
    x_unk_unlabeled = [_with_ngrams(obs) for obs in x_unk_unlabeled]

# Bag-of-tokens features: one token -> count mapping per observation.
X_labeled = list(map(Counter, x_unk))
X_unlabeled = list(map(Counter, x_unk_unlabeled))

In [None]:
# Compare every unlabeled text with every labeled observation, keeping
# matches with a Jaccard similarity of at least 0.5.
sim_result_list, sim_dict_filtered, sim_dict = labeld_unlabeled_similarities(
    sim_thereshold=0.5,
    x_unlabeled=x_unlabeled,
    X_unlabeled=X_unlabeled,
    X_labeled=X_labeled,
    y=y,
)

# save to file (python pickle format)
for name, obj in zip(
        ('sim_result_list', 'sim_dict', 'sim_dict_filtered'),
        (sim_result_list, sim_dict, sim_dict_filtered)):
    save(name, obj)

In [None]:
# # remove duplicate entries
# seen_results = set()
# uniqe_results = [line for line in sim_result_list if 
#                  line[0] not in seen_results and not seen_results.add(line[0])]  
# # sort by text (0 element in list)
# uniqe_results.sort(key=lambda line: line[0])

# # save result to file
# with open('similarity_labels_suggestion.csv', 'w', 
#           encoding='cp850') as f:
#     writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_ALL)
#     writer.writerows(uniqe_results)