In [None]:
import numpy as np
import pandas as pd

from utils.utils import user_opt_gen, nice_dict, seed, init_data, print_source, pcp1, pcp2, pcp3, pcp4
from utils.utils_baseline_svm import *

import csv

from collections import Counter

from math import inf
from time import time
# from utils.utils_nn import *

In [None]:
def jaccard_sim(d1, d2):
    """Weighted Jaccard similarity between two feature dictionaries.

    Both arguments map a feature key to a numeric weight. The result is

        sum(min(d1[k], d2[k]) for k in both) / sum(max(d1[k], d2[k]) for k in either)

    where a key missing from one dictionary counts as weight 0 in the
    denominator.

    Args:
        d1: mapping of feature key -> weight.
        d2: mapping of feature key -> weight.

    Returns:
        The similarity as a float. When the denominator is zero (both
        dictionaries empty, or every weight is 0) there is nothing to
        compare, so 0.0 is returned instead of raising ZeroDivisionError.

    Note:
        Weights are presumably non-negative counts; the [0, 1] range of the
        result depends on that -- TODO confirm against the feature builder.
    """
    # Numerator: only keys present in both dictionaries contribute.
    overlap = sum(min(d1[key], d2[key]) for key in d1 if key in d2)
    # Denominator: every key from either side. The union is built with
    # {**d1, **d2} rather than dict(d1, **d2), because the latter only accepts
    # string keys and fails for e.g. tuple-valued n-gram keys.
    total = sum(max(d1.get(key, 0), d2.get(key, 0)) for key in {**d1, **d2})
    if not total:
        return 0.0
    return float(overlap / total)


In [None]:
def labeld_unlabeled_similarities(*, sim_thereshold, x_unlabeled, X_unlabeled, y, X_labeled=None):
    """Suggest a label for each unlabeled text from its most similar labeled observation.

    Every unlabeled feature dict is compared with every labeled feature dict
    using ``jaccard_sim``. Pairs whose similarity is below ``sim_thereshold``
    are discarded, and for each remaining text the single best match is
    reported.

    Args:
        sim_thereshold: minimum similarity (inclusive) for a match to be kept.
        x_unlabeled: iterable of unlabeled texts; used as dictionary keys, so
            a repeated text overwrites its earlier entry.
        X_unlabeled: iterable of feature dicts, aligned with ``x_unlabeled``.
        y: labels, indexed as ``y[i]`` for the i-th labeled observation.
        X_labeled: iterable of feature dicts for the labeled observations.
            Optional for backward compatibility: when omitted, the
            module-level ``X_labeled`` is used, as earlier versions did.

    Returns:
        A 3-tuple ``(sim_result_list, sim_dict_filtered, sim_dict)``:
        * ``sim_result_list`` -- one ``[text, label, similarity]`` row per text
          that has at least one match, holding its highest-similarity match
          (the first one wins on ties);
        * ``sim_dict_filtered`` -- ``{text: [(label, similarity), ...]}`` with
          only the pairs that meet the threshold; texts without any are absent;
        * ``sim_dict`` -- ``{text: [(label, similarity), ...]}`` for all pairs.

    Raises:
        NameError: if ``X_labeled`` is neither passed nor defined at module level.
    """
    if X_labeled is None:
        # Older callers relied on a global defined by the driver cell; keep
        # that working, but fail with the same error type when it is absent.
        if 'X_labeled' not in globals():
            raise NameError("name 'X_labeled' is not defined")
        X_labeled = globals()['X_labeled']

    timenow = time()
    sim_dict = {}
    for text, features in zip(x_unlabeled, X_unlabeled):
        sim_dict[text] = [(y[i], jaccard_sim(obs, features)) for i, obs in enumerate(X_labeled)]

    print('Calculating all Jaccard similarities took {:.2f} seconds'.format(time() - timenow))

    # Keep only the pairs that meet the threshold; drop texts left with none.
    sim_dict_filtered = {}
    for text, pairs in sim_dict.items():
        kept = [(label, sim) for (label, sim) in pairs if sim >= sim_thereshold]
        if kept:
            sim_dict_filtered[text] = kept

    # One row per text: the best match. max() returns the first maximal pair,
    # so ties resolve to the earliest labeled observation.
    sim_result_list = []
    for text, pairs in sim_dict_filtered.items():
        top_label, top_sim = max(pairs, key=lambda pair: pair[1])
        sim_result_list.append([text, top_label, top_sim])

    return sim_result_list, sim_dict_filtered, sim_dict


In [None]:
if __name__ == '__main__':
    # Driver: build features with the same settings as the linear classifier,
    # score every unlabeled text against the labeled ones, and write the
    # suggested labels to a CSV file.
    x, y, n, main_data = init_data()
    # Rows from position n onward are treated as the unlabeled texts
    # (presumably the first n rows are the labeled ones -- verify in init_data).
    x_unlabeled = main_data['FREETXT'][n:]

    # Feature-extraction settings for the labeled data. The filter keys are
    # None here and are returned by the first lin_clf_features call below.
    kwargs_lin_clf = nice_dict({'input_data': x,
                                'width': 5,
                                'mk_ngrams': True, 'ngram_filter': 10, 'filter_keys_ngrams': None,
                                'mk_chars': True, 'char_filter': 100, 'filter_keys_chars': None})

    X_labeled, filter_keys_ngrams, filter_keys_chars = lin_clf_features(**kwargs_lin_clf)

    # Drop text fields shorter than the sliding-window width.
    x_unlabeled = x_unlabeled.loc[[len(k) >= kwargs_lin_clf.width for k in x_unlabeled]]

    # Featurise the unlabeled texts with the filter keys obtained from the
    # labeled data, overriding only the input and the two key filters.
    X_unlabeled, _, _ = lin_clf_features(**{**kwargs_lin_clf,
                                            **{'input_data': x_unlabeled,
                                               'filter_keys_ngrams': filter_keys_ngrams,
                                               'filter_keys_chars': filter_keys_chars}})

    sim_result_list, sim_dict_filtered, sim_dict = \
        labeld_unlabeled_similarities(
            sim_thereshold=0.85,
            x_unlabeled=x_unlabeled,
            X_unlabeled=X_unlabeled,
            y=y)

    # Keep the first row seen for each text (element 0 of every row).
    seen_results = set()
    uniqe_results = []
    for line in sim_result_list:
        if line[0] not in seen_results:
            seen_results.add(line[0])
            uniqe_results.append(line)
    # Sort by text.
    uniqe_results.sort(key=lambda line: line[0])

    # Save the result. newline='' lets the csv writer control line endings,
    # which avoids blank rows on Windows and keeps embedded newlines intact.
    with open('similarity_labels_suggestion.csv', 'w', newline='') as f:
        writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_ALL)
        writer.writerows(uniqe_results)