In [2]:
import blocking_hash as hash 
import blocking_ngram as ngram
import blocking_structured_and_sort as ss
import blocking_length as vl
import matchers as m
import similarity as sim
import cluster as c
import csv

In [3]:
import pandas as pd
dblp_csv = '../CSV-files/dblp_stem.csv'
dblp = pd.read_csv(dblp_csv)

acm_csv = '../CSV-files/acm_stem.csv'
acm = pd.read_csv(acm_csv)

In [6]:
def similar_pairs_to_csv(similar_pairs, output_csv_file):
    header = ['dblp_index', 'acm_index']
    with open(output_csv_file, mode='w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        for pair in similar_pairs:
            writer.writerow(pair)

def evaluate_similarity(baseline, comparison):
    baseline_set, comparison_set = set(baseline), set(comparison)

    tp = len(baseline_set.intersection(comparison_set))
    fp = len(comparison_set - baseline_set)
    fn = len(baseline_set - comparison_set)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f_measure = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    result = {'precision': precision, 'recall': recall, 'f_measure': f_measure}

    return str(result)

For the Baselines we make a row wise comparison where we compare respective columns
The idea is also to not use id, because both datasets have different ids even for corresponding enteties (no need to run saved as csv (instead run reconstructed method))

In [15]:
# Baselines 0.7, 0.85 

dblp['year'] = dblp['year'].astype(str)
acm['year'] = acm['year'].astype(str)

selected_columns = ['author_names','paper_title', 'year', 'publication_venue']
base_7_jac = m.apply_similarity_baseline(dblp, acm, 0.7, selected_columns, sim.jaccard_similarity)
similar_pairs_to_csv(base_7_jac,'../baselines/base_7_jac_stem.csv')

selected_columns = ['author_names','paper_title', 'year', 'publication_venue']
base_85_jac = m.apply_similarity_baseline(dblp, acm, 0.85, selected_columns, sim.jaccard_similarity)
similar_pairs_to_csv(base_85_jac,'../baselines/base_85_jac_stem.csv')

In [None]:
# too many pairs therefore we dont try to compare with a blocking tehcnique cause they would only achieve few thousands

selected_columns = ['author_names','paper_title', 'year', 'publication_venue']
base_7_n = m.apply_similarity_baseline(dblp, acm, 0.7, selected_columns, sim.n_gram_similarity)
similar_pairs_to_csv(base_7_n,'../baselines/base_7_n_stem.csv')

selected_columns = ['author_names','paper_title', 'year', 'publication_venue']
base_85_n = m.apply_similarity_baseline(dblp, acm, 0.85, selected_columns, sim.n_gram_similarity)
similar_pairs_to_csv(base_85_n,'../baselines/base_85_n_stem.csv')

In [9]:
selected_columns = ['author_names','paper_title', 'year', 'publication_venue']
base_7_n = m.apply_similarity_baseline(dblp, acm, 0.7, selected_columns, sim.exact_length_similarity)
similar_pairs_to_csv(base_7_n,'../baselines/base_7_l_stem.csv')

# 0 pairs therefore not used
selected_columns = ['author_names','paper_title', 'year', 'publication_venue']
base_85_n = m.apply_similarity_baseline(dblp, acm, 0.85, selected_columns, sim.exact_length_similarity)
similar_pairs_to_csv(base_85_n,'../baselines/base_85_l_stem.csv')

In [17]:

selected_columns = ['author_names','paper_title', 'year', 'publication_venue']
base_7_lev = m.apply_similarity_baseline(dblp, acm, 0.7, selected_columns, sim.levensthein_distance)
similar_pairs_to_csv(base_7_lev,'../baselines/base_7_lev_stem.csv')

selected_columns = ['author_names','paper_title', 'year', 'publication_venue']
base_85_lev = m.apply_similarity_baseline(dblp, acm, 0.85, selected_columns, sim.levensthein_distance)
similar_pairs_to_csv(base_85_lev,'../baselines/base_85_lev_stem.csv')

In [5]:
# the baselines take a time to compute so to not compute it again save csv and transform to list pairs
def reconstructed_pairs(path):
    df_pairs = pd.read_csv(path)
    return list(zip(df_pairs['dblp_index'], df_pairs['acm_index']))

base_7_jac = reconstructed_pairs('../baselines/base_7_jac_stem.csv')
base_85_jac = reconstructed_pairs('../baselines/base_85_jac_stem.csv')

base_7_l = reconstructed_pairs('../baselines/base_7_l_stem.csv')
base_85_l = reconstructed_pairs('../baselines/base_85_l_stem.csv')

base_7_n = reconstructed_pairs('../baselines/base_7_n_stem.csv')
base_85_n = reconstructed_pairs('../baselines/base_85_n_stem.csv')

base_7_lev = reconstructed_pairs('../baselines/base_7_lev_stem.csv')
base_85_lev = reconstructed_pairs('../baselines/base_85_lev_stem.csv')

In [4]:
# when using sorted bloking based on year or year & publication_venue we can block created blocks for example with ngram or hash
# therefore we would transform blocks to dataframes witg blocks_to_df and with blocks_df apply on all the df the blocking function like ngram

def blocks_to_df(blocks):
    dfs = []
    for block in blocks:
        df_block = pd.DataFrame(block)
        dfs.append(df_block)
    return dfs

def block_dfs(dataframes, blocking_function, *args):
    blocks = {}
    for df in dataframes:
        block = blocking_function(df, *args)
        blocks.update(block)
    return blocks

Below are many possible matches, but sorted blocking with additional blocking like ngram will not be used when columns year and publisher are used, because this would lead to more matches due to sorted each block compared block has the same year and publisher -> influences match negatively more pairs

example it sorts trough years lets say 1995 but then for 1995 we would have two blocks one with vldb and another with sigmod as venue 

In [35]:
dblp_csv = '../CSV-files/dblp_stem.csv'
dblp = pd.read_csv(dblp_csv)

acm_csv = '../CSV-files/acm_stem.csv'
acm = pd.read_csv(acm_csv)



threshold = 0.7

year_block = [1995,1996,1997, 1998, 1999,2000,2001, 2002, 2003, 2004,2005]
labels = ["1995", "1996", "1997", "1998", "1999", "2000", "2001", "2002", "2003", "2004"]

hash_indices = ['hash_value']
ngram_indices = ['ngram_values']


selected_columns = ['author_names']

# initial n-gram with n = 2,3 
dblp_n2 = ngram.initial_ngram(dblp, 2, selected_columns)
acm_n2 = ngram.initial_ngram(acm, 2, selected_columns)
initial_n2_a_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

dblp_n3 = ngram.initial_ngram(dblp, 3, selected_columns)
acm_n3 = ngram.initial_ngram(acm, 3, selected_columns)
initial_n3_a_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# n-gram blocking with n = 2, 3
dblp_n2 = ngram.n_gram_blocking(dblp, 2, selected_columns)
acm_n2 = ngram.n_gram_blocking(acm, 2, selected_columns)
n2_a_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, selected_columns)

dblp_n3 = ngram.n_gram_blocking(dblp, 3, selected_columns)
acm_n3 = ngram.n_gram_blocking(acm, 3, selected_columns)
n3_a_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# initial hash blocking
dblp_h = hash.initial_hash(dblp, selected_columns)
acm_h = hash.initial_hash(acm, selected_columns)
initial_h_a_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# hash blocking
dblp_h = hash.hash_blocking(dblp, selected_columns)
acm_h = hash.hash_blocking(acm, selected_columns)
h_a_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# length blocking
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_a = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.jaccard_similarity, 'lengths')

# sorted blocking by year and publisher
dblp_s = ss.block_by_year_and_publisher(dblp, year_block, labels)
acm_s = ss.block_by_year_and_publisher(acm, year_block, labels)
sorted_a = m.apply_similarity_sorted(dblp_s, acm_s, threshold, sim.jaccard_similarity, selected_columns)

# sorted blocking by year and publisher with initial_ngram = 2, 3, initial hash and hash blocking 
dblp_sn2 = blocks_to_df(dblp_s)
dblp_sn2 = block_dfs(dblp_sn2, ngram.initial_ngram, 2, selected_columns)
acm_sn2 = blocks_to_df(acm_s)
acm_sn2 = block_dfs(acm_sn2, ngram.initial_ngram, 2, selected_columns)
sorted_initial_n2_a = m.apply_similarity_sorted_dictionary(dblp_sn2, acm_sn2, threshold, sim.jaccard_similarity_ngrams, 'ngram_values')

dblp_sn3 = blocks_to_df(dblp_s)
dblp_sn3 = block_dfs(dblp_sn3, ngram.initial_ngram, 3, selected_columns)
acm_sn3 = blocks_to_df(acm_s)
acm_sn3 = block_dfs(acm_sn3, ngram.initial_ngram, 3, selected_columns)
sorted_initial_n3_a = m.apply_similarity_sorted_dictionary(dblp_sn3, acm_sn3, threshold, sim.jaccard_similarity_ngrams, 'ngram_values')

dblp_h = blocks_to_df(dblp_s)
dblp_h = block_dfs(dblp_h, hash.hash_blocking, selected_columns)
acm_h = blocks_to_df(acm_s)
acm_h = block_dfs(acm_h, hash.hash_blocking , selected_columns)
sorted_h_a = m.apply_similarity_sorted_dictionary(dblp_h, acm_h, threshold, sim.jaccard_similarity, 'hash_value')

dblp_h = blocks_to_df(dblp_s)
dblp_h = block_dfs(dblp_h, hash.initial_hash, selected_columns)
acm_h = blocks_to_df(acm_s)
acm_h = block_dfs(acm_h, hash.initial_hash , selected_columns)
sorted_initial_h_a = m.apply_similarity_sorted_dictionary(dblp_h, acm_h, threshold, sim.jaccard_similarity, 'hash_value')

selected_columns = ['paper_title']

# initial n-gram with n = 2,3 
dblp_n2 = ngram.initial_ngram(dblp, 2, selected_columns)
acm_n2 = ngram.initial_ngram(acm, 2, selected_columns)
initial_n2_p_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

dblp_n3 = ngram.initial_ngram(dblp, 3, selected_columns)
acm_n3 = ngram.initial_ngram(acm, 3, selected_columns)
initial_n3_p_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# n-gram blocking with n = 2, 3
dblp_n2 = ngram.n_gram_blocking(dblp, 2, selected_columns)
acm_n2 = ngram.n_gram_blocking(acm, 2, selected_columns)
n2_p_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, selected_columns)

dblp_n3 = ngram.n_gram_blocking(dblp, 3, selected_columns)
acm_n3 = ngram.n_gram_blocking(acm, 3, selected_columns)
n3_p_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# initial hash blocking
dblp_h = hash.initial_hash(dblp, selected_columns)
acm_h = hash.initial_hash(acm, selected_columns)
initial_h_p_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# hash blocking
dblp_h = hash.hash_blocking(dblp, selected_columns)
acm_h = hash.hash_blocking(acm, selected_columns)
h_p_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# length blocking
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_p = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.jaccard_similarity, 'lengths')

# sorted blocking by year and publisher
dblp_s = ss.block_by_year_and_publisher(dblp, year_block, labels)
acm_s = ss.block_by_year_and_publisher(acm, year_block, labels)
sorted_p = m.apply_similarity_sorted(dblp_s, acm_s, threshold, sim.jaccard_similarity, selected_columns)

# sorted blocking by year and publisher with initial_ngram = 2, 3, initial hash and hash blocking 
dblp_sn2 = blocks_to_df(dblp_s)
dblp_sn2 = block_dfs(dblp_sn2, ngram.initial_ngram, 2, selected_columns)
acm_sn2 = blocks_to_df(acm_s)
acm_sn2 = block_dfs(acm_sn2, ngram.initial_ngram, 2, selected_columns)
sorted_initial_n2_p = m.apply_similarity_sorted_dictionary(dblp_sn2, acm_sn2, threshold, sim.jaccard_similarity_ngrams, 'ngram_values')

dblp_sn3 = blocks_to_df(dblp_s)
dblp_sn3 = block_dfs(dblp_sn3, ngram.initial_ngram, 3, selected_columns)
acm_sn3 = blocks_to_df(acm_s)
acm_sn3 = block_dfs(acm_sn3, ngram.initial_ngram, 3, selected_columns)
sorted_initial_n3_p = m.apply_similarity_sorted_dictionary(dblp_sn3, acm_sn3, threshold, sim.jaccard_similarity_ngrams, 'ngram_values')

dblp_h = blocks_to_df(dblp_s)
dblp_h = block_dfs(dblp_h, hash.hash_blocking, selected_columns)
acm_h = blocks_to_df(acm_s)
acm_h = block_dfs(acm_h, hash.hash_blocking , selected_columns)
sorted_h_p = m.apply_similarity_sorted_dictionary(dblp_h, acm_h, threshold, sim.jaccard_similarity, 'hash_value')

dblp_h = blocks_to_df(dblp_s)
dblp_h = block_dfs(dblp_h, hash.initial_hash, selected_columns)
acm_h = blocks_to_df(acm_s)
acm_h = block_dfs(acm_h, hash.initial_hash , selected_columns)
sorted_initial_h_p = m.apply_similarity_sorted_dictionary(dblp_h, acm_h, threshold, sim.jaccard_similarity, 'hash_value')


selected_columns = ['author_names', 'paper_title']

# initial n-gram with n = 2,3 
dblp_n2 = ngram.initial_ngram(dblp, 2, selected_columns)
acm_n2 = ngram.initial_ngram(acm, 2, selected_columns)
initial_n2_ap_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

dblp_n3 = ngram.initial_ngram(dblp, 3, selected_columns)
acm_n3 = ngram.initial_ngram(acm, 3, selected_columns)
initial_n3_ap_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# n-gram blocking with n = 2, 3
dblp_n2 = ngram.n_gram_blocking(dblp, 2, selected_columns)
acm_n2 = ngram.n_gram_blocking(acm, 2, selected_columns)
n2_ap_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, selected_columns)

dblp_n3 = ngram.n_gram_blocking(dblp, 3, selected_columns)
acm_n3 = ngram.n_gram_blocking(acm, 3, selected_columns)
n3_ap_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# initial hash blocking
dblp_h = hash.initial_hash(dblp, selected_columns)
acm_h = hash.initial_hash(acm, selected_columns)
initial_h_ap_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# hash blocking
dblp_h = hash.hash_blocking(dblp, selected_columns)
acm_h = hash.hash_blocking(acm, selected_columns)
h_ap_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# length blocking
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_ap = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.jaccard_similarity, 'lengths')

# sorted blocking by year and publisher
dblp_s = ss.block_by_year_and_publisher(dblp, year_block, labels)
acm_s = ss.block_by_year_and_publisher(acm, year_block, labels)
sorted_ap = m.apply_similarity_sorted(dblp_s, acm_s, threshold, sim.jaccard_similarity, selected_columns)

# sorted blocking by year and publisher with initial_ngram = 2, 3, initial hash and hash blocking 
dblp_sn2 = blocks_to_df(dblp_s)
dblp_sn2 = block_dfs(dblp_sn2, ngram.initial_ngram, 2, selected_columns)
acm_sn2 = blocks_to_df(acm_s)
acm_sn2 = block_dfs(acm_sn2, ngram.initial_ngram, 2, selected_columns)
sorted_initial_n2_ap = m.apply_similarity_sorted_dictionary(dblp_sn2, acm_sn2, threshold, sim.jaccard_similarity_ngrams, 'ngram_values')

dblp_sn3 = blocks_to_df(dblp_s)
dblp_sn3 = block_dfs(dblp_sn3, ngram.initial_ngram, 3, selected_columns)
acm_sn3 = blocks_to_df(acm_s)
acm_sn3 = block_dfs(acm_sn3, ngram.initial_ngram, 3, selected_columns)
sorted_initial_n3_ap = m.apply_similarity_sorted_dictionary(dblp_sn3, acm_sn3, threshold, sim.jaccard_similarity_ngrams, 'ngram_values')

dblp_h = blocks_to_df(dblp_s)
dblp_h = block_dfs(dblp_h, hash.hash_blocking, selected_columns)
acm_h = blocks_to_df(acm_s)
acm_h = block_dfs(acm_h, hash.hash_blocking , selected_columns)
sorted_h_ap = m.apply_similarity_sorted_dictionary(dblp_h, acm_h, threshold, sim.jaccard_similarity, 'hash_value')

dblp_h = blocks_to_df(dblp_s)
dblp_h = block_dfs(dblp_h, hash.initial_hash, selected_columns)
acm_h = blocks_to_df(acm_s)
acm_h = block_dfs(acm_h, hash.initial_hash , selected_columns)
sorted_initial_h_ap = m.apply_similarity_sorted_dictionary(dblp_h, acm_h, threshold, sim.jaccard_similarity, 'hash_value')

selected_columns = ['author_names', 'paper_title', 'year']

# initial n-gram with n = 2,3 
dblp_n2 = ngram.initial_ngram(dblp, 2, selected_columns)
acm_n2 = ngram.initial_ngram(acm, 2, selected_columns)
initial_n2_apy_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

dblp_n3 = ngram.initial_ngram(dblp, 3, selected_columns)
acm_n3 = ngram.initial_ngram(acm, 3, selected_columns)
initial_n3_apy_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# n-gram blocking with n = 2, 3
dblp_n2 = ngram.n_gram_blocking(dblp, 2, selected_columns)
acm_n2 = ngram.n_gram_blocking(acm, 2, selected_columns)
n2_apy_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, selected_columns)

dblp_n3 = ngram.n_gram_blocking(dblp, 3, selected_columns)
acm_n3 = ngram.n_gram_blocking(acm, 3, selected_columns)
n3_apy_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# initial hash blocking
dblp_h = hash.initial_hash(dblp, selected_columns)
acm_h = hash.initial_hash(acm, selected_columns)
initial_h_apy_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# hash blocking
dblp_h = hash.hash_blocking(dblp, selected_columns)
acm_h = hash.hash_blocking(acm, selected_columns)
h_apy_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# length blocking
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_apy = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.jaccard_similarity, 'lengths')

selected_columns = ['author_names', 'paper_title', 'publication_venue']

# initial n-gram with n = 2,3 
dblp_n2 = ngram.initial_ngram(dblp, 2, selected_columns)
acm_n2 = ngram.initial_ngram(acm, 2, selected_columns)
initial_n2_appv_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

dblp_n3 = ngram.initial_ngram(dblp, 3, selected_columns)
acm_n3 = ngram.initial_ngram(acm, 3, selected_columns)
initial_n3_appv_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# n-gram blocking with n = 2, 3
dblp_n2 = ngram.n_gram_blocking(dblp, 2, selected_columns)
acm_n2 = ngram.n_gram_blocking(acm, 2, selected_columns)
n2_appv_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, selected_columns)

dblp_n3 = ngram.n_gram_blocking(dblp, 3, selected_columns)
acm_n3 = ngram.n_gram_blocking(acm, 3, selected_columns)
n3_appv_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# initial hash blocking
dblp_h = hash.initial_hash(dblp, selected_columns)
acm_h = hash.initial_hash(acm, selected_columns)
initial_h_appv_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# hash blocking
dblp_h = hash.hash_blocking(dblp, selected_columns)
acm_h = hash.hash_blocking(acm, selected_columns)
h_appv_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# length blocking
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_appv = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.jaccard_similarity, 'lengths')

selected_columns = ['author_names', 'paper_title', 'publication_venue', 'year']

# initial n-gram with n = 2,3 
dblp_n2 = ngram.initial_ngram(dblp, 2, selected_columns)
acm_n2 = ngram.initial_ngram(acm, 2, selected_columns)
initial_n2_appvy_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

dblp_n3 = ngram.initial_ngram(dblp, 3, selected_columns)
acm_n3 = ngram.initial_ngram(acm, 3, selected_columns)
initial_n3_appvy_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# n-gram blocking with n = 2, 3
dblp_n2 = ngram.n_gram_blocking(dblp, 2, selected_columns)
acm_n2 = ngram.n_gram_blocking(acm, 2, selected_columns)
n2_appvy_07_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, selected_columns)

dblp_n3 = ngram.n_gram_blocking(dblp, 3, selected_columns)
acm_n3 = ngram.n_gram_blocking(acm, 3, selected_columns)
n3_appvy_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# initial hash blocking
dblp_h = hash.initial_hash(dblp, selected_columns)
acm_h = hash.initial_hash(acm, selected_columns)
initial_h_appvy_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# hash blocking
dblp_h = hash.hash_blocking(dblp, selected_columns)
acm_h = hash.hash_blocking(acm, selected_columns)
h_appvy_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# length blocking
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_appvy = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.jaccard_similarity, 'lengths')




Processing time: 7.1769859790802 seconds. Number of similar pairs: 2495
Processing time: 4.740327835083008 seconds. Number of similar pairs: 90801
Processing time: 8.75877594947815 seconds. Number of similar pairs: 2118
Processing time: 9.542505025863647 seconds. Number of similar pairs: 2118
Processing time: 3.751330614089966 seconds. Number of similar pairs: 2480
Processing time: 3.907388210296631 seconds. Number of similar pairs: 2216
Processing time: 0.026314973831176758 seconds. Number of similar pairs: 50143
Processing time: 0.8830711841583252 seconds. Number of similar pairs: 19261
Processing time: 0.09289216995239258 seconds. Number of similar pairs: 221
Processing time: 0.03512310981750488 seconds. Number of similar pairs: 29
Processing time: 0.795835018157959 seconds. Number of similar pairs: 70890
Processing time: 0.767827033996582 seconds. Number of similar pairs: 69936
Processing time: 5.524066925048828 seconds. Number of similar pairs: 1056
Processing time: 5.541990280151

In [17]:
# evaluate all ngram variants

result_combined_ngram_07 = (
    evaluate_similarity(base_7_jac, initial_n2_a_07_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, initial_n3_a_07_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, n2_a_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, n3_a_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, initial_n2_p_07_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, initial_n3_p_07_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, n2_p_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, n3_p_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, initial_n2_ap_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, initial_n3_ap_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, n2_ap_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, n3_ap_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, initial_n2_apy_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, initial_n3_apy_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, n2_apy_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, n3_apy_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, initial_n2_appv_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, initial_n3_appv_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, n2_appv_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, n3_appv_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, initial_n2_appvy_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, initial_n3_appvy_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, n2_appvy_07_jac) + "\n" +
    evaluate_similarity(base_7_jac, n3_appvy_07_jac)
)

print(result_combined_ngram_07)

# initial_n2_p_07_jac and all ap variants
# 

{'precision': 0.3438877755511022, 'recall': 0.7951807228915663, 'f_measure': 0.4801343033016229}
{'precision': 0.009449235140582152, 'recall': 0.7951807228915663, 'f_measure': 0.01867653461036134}
{'precision': 0.37063267233238906, 'recall': 0.7275254865616312, 'f_measure': 0.4910853925555208}
{'precision': 0.37063267233238906, 'recall': 0.7275254865616312, 'f_measure': 0.4910853925555208}
{'precision': 0.9005681818181818, 'recall': 0.881371640407785, 'f_measure': 0.8908665105386417}
{'precision': 0.45372137404580154, 'recall': 0.881371640407785, 'f_measure': 0.5990551181102363}
{'precision': 0.9137254901960784, 'recall': 0.8637627432808156, 'f_measure': 0.88804192472606}
{'precision': 0.9137254901960784, 'recall': 0.8637627432808156, 'f_measure': 0.88804192472606}
{'precision': 0.9757033248081841, 'recall': 0.7071362372567191, 'f_measure': 0.8199892530897367}
{'precision': 0.9757033248081841, 'recall': 0.7071362372567191, 'f_measure': 0.8199892530897367}
{'precision': 0, 'recall': 0.0

In [18]:
result_combined_hash_07 = (
    evaluate_similarity(base_7_jac, initial_h_a_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, h_a_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, initial_h_p_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, h_p_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, initial_h_ap_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, h_ap_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, initial_h_apy_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, h_apy_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, initial_h_appv_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, h_appv_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, initial_h_appvy_jac)  +  "\n" +
    evaluate_similarity(base_7_jac, h_appvy_jac)
    
)

print(result_combined_hash_07)

# initial h_p_jac (match), h_p_jac and initial_h_ap_jac are the best ones

{'precision': 0.34596774193548385, 'recall': 0.7951807228915663, 'f_measure': 0.48215790952514753}
{'precision': 0.37063267233238906, 'recall': 0.7275254865616312, 'f_measure': 0.4910853925555208}
{'precision': 0.9039923954372624, 'recall': 0.881371640407785, 'f_measure': 0.8925387142186767}
{'precision': 0.9137254901960784, 'recall': 0.8637627432808156, 'f_measure': 0.88804192472606}
{'precision': 0.9757033248081841, 'recall': 0.7071362372567191, 'f_measure': 0.8199892530897367}
{'precision': 0.9785407725321889, 'recall': 0.633920296570899, 'f_measure': 0.7694038245219348}
{'precision': 0.9947712418300654, 'recall': 0.7052826691380908, 'f_measure': 0.8253796095444687}
{'precision': 1.0, 'recall': 0.6320667284522706, 'f_measure': 0.7745599091425327}
{'precision': 0.9769526248399488, 'recall': 0.7071362372567191, 'f_measure': 0.8204301075268817}
{'precision': 0.9799426934097422, 'recall': 0.633920296570899, 'f_measure': 0.7698368036015757}
{'precision': 0.9947712418300654, 'recall': 0.7

In [19]:
result_combined_length_07 = (
    evaluate_similarity(base_7_jac, length_a)  +  "\n" +
    evaluate_similarity(base_7_jac, length_p)  +  "\n" +
    evaluate_similarity(base_7_jac, length_ap)  +  "\n" +
    evaluate_similarity(base_7_jac, length_apy)  +  "\n" +
    evaluate_similarity(base_7_jac, length_appv)  +  "\n" +
    evaluate_similarity(base_7_jac, length_appvy)      
)

print(result_combined_length_07)

# length_appv and length_appvy are ok - good

{'precision': 0.016834258330090723, 'recall': 0.7618164967562558, 'f_measure': 0.03294061072373166}
{'precision': 0.015784401688100223, 'recall': 0.8804448563484708, 'f_measure': 0.03101281318860687}
{'precision': 0.4600253807106599, 'recall': 0.6719184430027804, 'f_measure': 0.5461393596986818}
{'precision': 0.4600253807106599, 'recall': 0.6719184430027804, 'f_measure': 0.5461393596986818}
{'precision': 0.606694560669456, 'recall': 0.6719184430027804, 'f_measure': 0.6376429199648197}
{'precision': 0.606694560669456, 'recall': 0.6719184430027804, 'f_measure': 0.6376429199648197}


In [20]:
result_combined_sorted_07 = (
    evaluate_similarity(base_7_jac, sorted_a) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_initial_n2_a) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_initial_n3_a) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_h_a) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_initial_h_a) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_p) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_initial_n2_p) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_initial_n3_p) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_h_p) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_initial_h_p) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_ap) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_initial_n2_ap) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_initial_n3_ap) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_h_ap) +  "\n" +
    evaluate_similarity(base_7_jac, sorted_initial_h_ap) 
)

print(result_combined_sorted_07)

# all bad

{'precision': 0.055831460086779236, 'recall': 0.989805375347544, 'f_measure': 0.10570071258907363}
{'precision': 0.006535947712418301, 'recall': 0.0009267840593141798, 'f_measure': 0.0016233766233766235}
{'precision': 0.0, 'recall': 0.0, 'f_measure': 0}
{'precision': 0.0, 'recall': 0.0, 'f_measure': 0}
{'precision': 0.0009208103130755065, 'recall': 0.0009267840593141798, 'f_measure': 0.0009237875288683603}
{'precision': 0.020760355439450612, 'recall': 0.9721964782205746, 'f_measure': 0.040652611998139826}
{'precision': 0.006493506493506494, 'recall': 0.0018535681186283596, 'f_measure': 0.0028839221341023795}
{'precision': 0.03389830508474576, 'recall': 0.0018535681186283596, 'f_measure': 0.00351493848857645}
{'precision': 0.0008264462809917355, 'recall': 0.0009267840593141798, 'f_measure': 0.0008737439930100481}
{'precision': 0.001658374792703151, 'recall': 0.0018535681186283596, 'f_measure': 0.00175054704595186}
{'precision': 0.07020413504318242, 'recall': 0.9944392956441149, 'f_measu

In [7]:
similar_pairs_to_csv(initial_h_p_jac,'../Matched/Matched Entities.csv')


Processing time: 5.074635744094849 seconds. Number of similar pairs: 1052


In [21]:

dblp_csv = '../CSV-files/dblp_stem.csv'
dblp = pd.read_csv(dblp_csv)

acm_csv = '../CSV-files/acm_stem.csv'
acm = pd.read_csv(acm_csv)

threshold = 0.85

year_block = [1995,1996,1997, 1998, 1999,2000,2001, 2002, 2003, 2004,2005]
labels = ["1995", "1996", "1997", "1998", "1999", "2000", "2001", "2002", "2003", "2004"]

hash_indices = ['hash_value']
ngram_indices = ['ngram_values']


selected_columns = ['author_names']

# initial n-gram with n = 2,3 
dblp_n2 = ngram.initial_ngram(dblp, 2, selected_columns)
acm_n2 = ngram.initial_ngram(acm, 2, selected_columns)
initial_n2_a_85_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

dblp_n3 = ngram.initial_ngram(dblp, 3, selected_columns)
acm_n3 = ngram.initial_ngram(acm, 3, selected_columns)
initial_n3_a_85_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# n-gram blocking with n = 2, 3
dblp_n2 = ngram.n_gram_blocking(dblp, 2, selected_columns)
acm_n2 = ngram.n_gram_blocking(acm, 2, selected_columns)
n2_a_85_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, selected_columns)

dblp_n3 = ngram.n_gram_blocking(dblp, 3, selected_columns)
acm_n3 = ngram.n_gram_blocking(acm, 3, selected_columns)
n3_a_85_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# initial hash blocking
dblp_h = hash.initial_hash(dblp, selected_columns)
acm_h = hash.initial_hash(acm, selected_columns)
initial_h_a_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# hash blocking
dblp_h = hash.hash_blocking(dblp, selected_columns)
acm_h = hash.hash_blocking(acm, selected_columns)
h_a_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# length blocking
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_a = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.jaccard_similarity, 'lengths')

# sorted blocking by year and publisher
dblp_s = ss.block_by_year_and_publisher(dblp, year_block, labels)
acm_s = ss.block_by_year_and_publisher(acm, year_block, labels)
sorted_a = m.apply_similarity_sorted(dblp_s, acm_s, threshold, sim.jaccard_similarity, selected_columns)

# sorted blocking by year and publisher with initial_ngram = 2, 3, initial hash and hash blocking 
dblp_sn2 = blocks_to_df(dblp_s)
dblp_sn2 = block_dfs(dblp_sn2, ngram.initial_ngram, 2, selected_columns)
acm_sn2 = blocks_to_df(acm_s)
acm_sn2 = block_dfs(acm_sn2, ngram.initial_ngram, 2, selected_columns)
sorted_initial_n2_a = m.apply_similarity_sorted_dictionary(dblp_sn2, acm_sn2, threshold, sim.jaccard_similarity_ngrams, 'ngram_values')

dblp_sn3 = blocks_to_df(dblp_s)
dblp_sn3 = block_dfs(dblp_sn3, ngram.initial_ngram, 3, selected_columns)
acm_sn3 = blocks_to_df(acm_s)
acm_sn3 = block_dfs(acm_sn3, ngram.initial_ngram, 3, selected_columns)
sorted_initial_n3_a = m.apply_similarity_sorted_dictionary(dblp_sn3, acm_sn3, threshold, sim.jaccard_similarity_ngrams, 'ngram_values')

dblp_h = blocks_to_df(dblp_s)
dblp_h = block_dfs(dblp_h, hash.hash_blocking, selected_columns)
acm_h = blocks_to_df(acm_s)
acm_h = block_dfs(acm_h, hash.hash_blocking , selected_columns)
sorted_h_a = m.apply_similarity_sorted_dictionary(dblp_h, acm_h, threshold, sim.jaccard_similarity, 'hash_value')

dblp_h = blocks_to_df(dblp_s)
dblp_h = block_dfs(dblp_h, hash.initial_hash, selected_columns)
acm_h = blocks_to_df(acm_s)
acm_h = block_dfs(acm_h, hash.initial_hash , selected_columns)
sorted_initial_h_a = m.apply_similarity_sorted_dictionary(dblp_h, acm_h, threshold, sim.jaccard_similarity, 'hash_value')

selected_columns = ['paper_title']

# initial n-gram with n = 2,3 
dblp_n2 = ngram.initial_ngram(dblp, 2, selected_columns)
acm_n2 = ngram.initial_ngram(acm, 2, selected_columns)
initial_n2_p_85_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

dblp_n3 = ngram.initial_ngram(dblp, 3, selected_columns)
acm_n3 = ngram.initial_ngram(acm, 3, selected_columns)
initial_n3_p_85_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# n-gram blocking with n = 2, 3
dblp_n2 = ngram.n_gram_blocking(dblp, 2, selected_columns)
acm_n2 = ngram.n_gram_blocking(acm, 2, selected_columns)
n2_p_85_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, selected_columns)

dblp_n3 = ngram.n_gram_blocking(dblp, 3, selected_columns)
acm_n3 = ngram.n_gram_blocking(acm, 3, selected_columns)
n3_p_85_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# initial hash blocking
dblp_h = hash.initial_hash(dblp, selected_columns)
acm_h = hash.initial_hash(acm, selected_columns)
initial_h_p_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# hash blocking
dblp_h = hash.hash_blocking(dblp, selected_columns)
acm_h = hash.hash_blocking(acm, selected_columns)
h_p_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# length blocking
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_p = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.jaccard_similarity, 'lengths')

# sorted blocking by year and publisher
dblp_s = ss.block_by_year_and_publisher(dblp, year_block, labels)
acm_s = ss.block_by_year_and_publisher(acm, year_block, labels)
sorted_p = m.apply_similarity_sorted(dblp_s, acm_s, threshold, sim.jaccard_similarity, selected_columns)

# sorted blocking by year and publisher with initial_ngram = 2, 3, initial hash and hash blocking 
dblp_sn2 = blocks_to_df(dblp_s)
dblp_sn2 = block_dfs(dblp_sn2, ngram.initial_ngram, 2, selected_columns)
acm_sn2 = blocks_to_df(acm_s)
acm_sn2 = block_dfs(acm_sn2, ngram.initial_ngram, 2, selected_columns)
sorted_initial_n2_p = m.apply_similarity_sorted_dictionary(dblp_sn2, acm_sn2, threshold, sim.jaccard_similarity_ngrams, 'ngram_values')

dblp_sn3 = blocks_to_df(dblp_s)
dblp_sn3 = block_dfs(dblp_sn3, ngram.initial_ngram, 3, selected_columns)
acm_sn3 = blocks_to_df(acm_s)
acm_sn3 = block_dfs(acm_sn3, ngram.initial_ngram, 3, selected_columns)
sorted_initial_n3_p = m.apply_similarity_sorted_dictionary(dblp_sn3, acm_sn3, threshold, sim.jaccard_similarity_ngrams, 'ngram_values')

dblp_h = blocks_to_df(dblp_s)
dblp_h = block_dfs(dblp_h, hash.hash_blocking, selected_columns)
acm_h = blocks_to_df(acm_s)
acm_h = block_dfs(acm_h, hash.hash_blocking , selected_columns)
sorted_h_p = m.apply_similarity_sorted_dictionary(dblp_h, acm_h, threshold, sim.jaccard_similarity, 'hash_value')

dblp_h = blocks_to_df(dblp_s)
dblp_h = block_dfs(dblp_h, hash.initial_hash, selected_columns)
acm_h = blocks_to_df(acm_s)
acm_h = block_dfs(acm_h, hash.initial_hash , selected_columns)
sorted_initial_h_p = m.apply_similarity_sorted_dictionary(dblp_h, acm_h, threshold, sim.jaccard_similarity, 'hash_value')


selected_columns = ['author_names', 'paper_title']

# initial n-gram with n = 2,3 
dblp_n2 = ngram.initial_ngram(dblp, 2, selected_columns)
acm_n2 = ngram.initial_ngram(acm, 2, selected_columns)
initial_n2_ap_85_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

dblp_n3 = ngram.initial_ngram(dblp, 3, selected_columns)
acm_n3 = ngram.initial_ngram(acm, 3, selected_columns)
initial_n3_ap_85_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# n-gram blocking with n = 2, 3
dblp_n2 = ngram.n_gram_blocking(dblp, 2, selected_columns)
acm_n2 = ngram.n_gram_blocking(acm, 2, selected_columns)
n2_ap_85_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, selected_columns)

dblp_n3 = ngram.n_gram_blocking(dblp, 3, selected_columns)
acm_n3 = ngram.n_gram_blocking(acm, 3, selected_columns)
n3_ap_07_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# initial hash blocking
dblp_h = hash.initial_hash(dblp, selected_columns)
acm_h = hash.initial_hash(acm, selected_columns)
initial_h_ap_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# hash blocking
dblp_h = hash.hash_blocking(dblp, selected_columns)
acm_h = hash.hash_blocking(acm, selected_columns)
h_ap_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# length blocking
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_ap = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.jaccard_similarity, 'lengths')

# sorted blocking by year and publisher
dblp_s = ss.block_by_year_and_publisher(dblp, year_block, labels)
acm_s = ss.block_by_year_and_publisher(acm, year_block, labels)
sorted_ap = m.apply_similarity_sorted(dblp_s, acm_s, threshold, sim.jaccard_similarity, selected_columns)

# sorted blocking by year and publisher with initial_ngram = 2, 3, initial hash and hash blocking 
dblp_sn2 = blocks_to_df(dblp_s)
dblp_sn2 = block_dfs(dblp_sn2, ngram.initial_ngram, 2, selected_columns)
acm_sn2 = blocks_to_df(acm_s)
acm_sn2 = block_dfs(acm_sn2, ngram.initial_ngram, 2, selected_columns)
sorted_initial_n2_ap = m.apply_similarity_sorted_dictionary(dblp_sn2, acm_sn2, threshold, sim.jaccard_similarity_ngrams, 'ngram_values')

dblp_sn3 = blocks_to_df(dblp_s)
dblp_sn3 = block_dfs(dblp_sn3, ngram.initial_ngram, 3, selected_columns)
acm_sn3 = blocks_to_df(acm_s)
acm_sn3 = block_dfs(acm_sn3, ngram.initial_ngram, 3, selected_columns)
sorted_initial_n3_ap = m.apply_similarity_sorted_dictionary(dblp_sn3, acm_sn3, threshold, sim.jaccard_similarity_ngrams, 'ngram_values')

dblp_h = blocks_to_df(dblp_s)
dblp_h = block_dfs(dblp_h, hash.hash_blocking, selected_columns)
acm_h = blocks_to_df(acm_s)
acm_h = block_dfs(acm_h, hash.hash_blocking , selected_columns)
sorted_h_ap = m.apply_similarity_sorted_dictionary(dblp_h, acm_h, threshold, sim.jaccard_similarity, 'hash_value')

dblp_h = blocks_to_df(dblp_s)
dblp_h = block_dfs(dblp_h, hash.initial_hash, selected_columns)
acm_h = blocks_to_df(acm_s)
acm_h = block_dfs(acm_h, hash.initial_hash , selected_columns)
sorted_initial_h_ap = m.apply_similarity_sorted_dictionary(dblp_h, acm_h, threshold, sim.jaccard_similarity, 'hash_value')

selected_columns = ['author_names', 'paper_title', 'year']

# initial n-gram with n = 2,3 
dblp_n2 = ngram.initial_ngram(dblp, 2, selected_columns)
acm_n2 = ngram.initial_ngram(acm, 2, selected_columns)
initial_n2_apy_85_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

dblp_n3 = ngram.initial_ngram(dblp, 3, selected_columns)
acm_n3 = ngram.initial_ngram(acm, 3, selected_columns)
initial_n3_apy_85_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# n-gram blocking with n = 2, 3
dblp_n2 = ngram.n_gram_blocking(dblp, 2, selected_columns)
acm_n2 = ngram.n_gram_blocking(acm, 2, selected_columns)
n2_apy_85_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, selected_columns)

dblp_n3 = ngram.n_gram_blocking(dblp, 3, selected_columns)
acm_n3 = ngram.n_gram_blocking(acm, 3, selected_columns)
n3_apy_85_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# initial hash blocking
dblp_h = hash.initial_hash(dblp, selected_columns)
acm_h = hash.initial_hash(acm, selected_columns)
initial_h_apy_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# hash blocking
dblp_h = hash.hash_blocking(dblp, selected_columns)
acm_h = hash.hash_blocking(acm, selected_columns)
h_apy_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# length blocking
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_apy = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.jaccard_similarity, 'lengths')

selected_columns = ['author_names', 'paper_title', 'publication_venue']

# initial n-gram with n = 2,3 
dblp_n2 = ngram.initial_ngram(dblp, 2, selected_columns)
acm_n2 = ngram.initial_ngram(acm, 2, selected_columns)
initial_n2_appv_85_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

dblp_n3 = ngram.initial_ngram(dblp, 3, selected_columns)
acm_n3 = ngram.initial_ngram(acm, 3, selected_columns)
initial_n3_appv_85_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# n-gram blocking with n = 2, 3
dblp_n2 = ngram.n_gram_blocking(dblp, 2, selected_columns)
acm_n2 = ngram.n_gram_blocking(acm, 2, selected_columns)
n2_appv_85_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, selected_columns)

dblp_n3 = ngram.n_gram_blocking(dblp, 3, selected_columns)
acm_n3 = ngram.n_gram_blocking(acm, 3, selected_columns)
n3_appv_85_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# initial hash blocking
dblp_h = hash.initial_hash(dblp, selected_columns)
acm_h = hash.initial_hash(acm, selected_columns)
initial_h_appv_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# hash blocking
dblp_h = hash.hash_blocking(dblp, selected_columns)
acm_h = hash.hash_blocking(acm, selected_columns)
h_appv_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# length blocking
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_appv = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.jaccard_similarity, 'lengths')

selected_columns = ['author_names', 'paper_title', 'publication_venue', 'year']

# initial n-gram with n = 2,3 
dblp_n2 = ngram.initial_ngram(dblp, 2, selected_columns)
acm_n2 = ngram.initial_ngram(acm, 2, selected_columns)
initial_n2_appvy_85_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

dblp_n3 = ngram.initial_ngram(dblp, 3, selected_columns)
acm_n3 = ngram.initial_ngram(acm, 3, selected_columns)
initial_n3_appvy_85_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, ngram_indices)

# n-gram blocking with n = 2, 3
dblp_n2 = ngram.n_gram_blocking(dblp, 2, selected_columns)
acm_n2 = ngram.n_gram_blocking(acm, 2, selected_columns)
n2_appvy_85_jac = m.apply_similarity_blocks(dblp_n2, acm_n2, threshold, sim.jaccard_similarity_ngrams, selected_columns)

dblp_n3 = ngram.n_gram_blocking(dblp, 3, selected_columns)
acm_n3 = ngram.n_gram_blocking(acm, 3, selected_columns)
n3_appvy_85_jac = m.apply_similarity_blocks(dblp_n3, acm_n3, threshold, sim.jaccard_similarity_ngrams, selected_columns)

# initial hash blocking
dblp_h = hash.initial_hash(dblp, selected_columns)
acm_h = hash.initial_hash(acm, selected_columns)
initial_h_appvy_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# hash blocking
dblp_h = hash.hash_blocking(dblp, selected_columns)
acm_h = hash.hash_blocking(acm, selected_columns)
h_appvy_jac = m.apply_similarity_blocks(dblp_h, acm_h, threshold, sim.jaccard_similarity, hash_indices)

# length blocking
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_appvy = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.jaccard_similarity, 'lengths')

Processing time: 7.127413988113403 seconds. Number of similar pairs: 2495
Processing time: 4.592610836029053 seconds. Number of similar pairs: 90801
Processing time: 11.911108016967773 seconds. Number of similar pairs: 2118
Processing time: 8.95640516281128 seconds. Number of similar pairs: 2118
Processing time: 3.7122960090637207 seconds. Number of similar pairs: 2480
Processing time: 3.9715867042541504 seconds. Number of similar pairs: 2216
Processing time: 0.02616119384765625 seconds. Number of similar pairs: 50143
Processing time: 0.882011890411377 seconds. Number of similar pairs: 1877
Processing time: 0.04324603080749512 seconds. Number of similar pairs: 221
Processing time: 0.035524845123291016 seconds. Number of similar pairs: 29
Processing time: 0.7886888980865479 seconds. Number of similar pairs: 70890
Processing time: 0.7748508453369141 seconds. Number of similar pairs: 69936
Processing time: 5.499645948410034 seconds. Number of similar pairs: 1056
Processing time: 5.3984420

In [22]:

result_combined_ngram_85 = (
    evaluate_similarity(base_85_jac, initial_n2_a_85_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, initial_n3_a_85_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, n2_a_07_jac) + "\n" +
    evaluate_similarity(base_85_jac, n3_a_07_jac) + "\n" +
    evaluate_similarity(base_85_jac, initial_n2_p_85_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, initial_n3_p_85_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, n2_p_07_jac) + "\n" +
    evaluate_similarity(base_85_jac, n3_p_07_jac) + "\n" +
    evaluate_similarity(base_85_jac, initial_n2_ap_85_jac) + "\n" +
    evaluate_similarity(base_85_jac, initial_n3_ap_85_jac) + "\n" +
    evaluate_similarity(base_85_jac, n2_ap_07_jac) + "\n" +
    evaluate_similarity(base_85_jac, n3_ap_07_jac) + "\n" +
    evaluate_similarity(base_85_jac, initial_n2_apy_85_jac) + "\n" +
    evaluate_similarity(base_85_jac, initial_n3_apy_85_jac) + "\n" +
    evaluate_similarity(base_85_jac, n2_apy_07_jac) + "\n" +
    evaluate_similarity(base_85_jac, n3_apy_07_jac) + "\n" +
    evaluate_similarity(base_85_jac, initial_n2_appv_85_jac) + "\n" +
    evaluate_similarity(base_85_jac, initial_n3_appv_85_jac) + "\n" +
    evaluate_similarity(base_85_jac, n2_appv_07_jac) + "\n" +
    evaluate_similarity(base_85_jac, n3_appv_07_jac) + "\n" +
    evaluate_similarity(base_85_jac, initial_n2_appvy_85_jac) + "\n" +
    evaluate_similarity(base_85_jac, initial_n3_appvy_85_jac) + "\n" +
    evaluate_similarity(base_85_jac, n2_appvy_07_jac) + "\n" +
    evaluate_similarity(base_85_jac, n3_appvy_07_jac)
)

print(result_combined_ngram_07)

# initial_n2_p_85_jac (match), n2_p_07_jac, initial_n2_apy_85_jac

{'precision': 0.3438877755511022, 'recall': 0.7951807228915663, 'f_measure': 0.4801343033016229}
{'precision': 0.009449235140582152, 'recall': 0.7951807228915663, 'f_measure': 0.01867653461036134}
{'precision': 0.37063267233238906, 'recall': 0.7275254865616312, 'f_measure': 0.4910853925555208}
{'precision': 0.37063267233238906, 'recall': 0.7275254865616312, 'f_measure': 0.4910853925555208}
{'precision': 0.9005681818181818, 'recall': 0.881371640407785, 'f_measure': 0.8908665105386417}
{'precision': 0.45372137404580154, 'recall': 0.881371640407785, 'f_measure': 0.5990551181102363}
{'precision': 0.9137254901960784, 'recall': 0.8637627432808156, 'f_measure': 0.88804192472606}
{'precision': 0.9137254901960784, 'recall': 0.8637627432808156, 'f_measure': 0.88804192472606}
{'precision': 0.9757033248081841, 'recall': 0.7071362372567191, 'f_measure': 0.8199892530897367}
{'precision': 0.9757033248081841, 'recall': 0.7071362372567191, 'f_measure': 0.8199892530897367}
{'precision': 0, 'recall': 0.0

In [23]:
result_combined_hash_85 = (
    evaluate_similarity(base_85_jac, initial_h_a_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, h_a_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, initial_h_p_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, h_p_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, initial_h_ap_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, h_ap_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, initial_h_apy_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, h_apy_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, initial_h_appv_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, h_appv_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, initial_h_appvy_jac)  +  "\n" +
    evaluate_similarity(base_85_jac, h_appvy_jac)
    
)

print(result_combined_hash_07)

# initial_h_p_jac, h_p_jac, initial_h_apy_jac

{'precision': 0.34596774193548385, 'recall': 0.7951807228915663, 'f_measure': 0.48215790952514753}
{'precision': 0.37063267233238906, 'recall': 0.7275254865616312, 'f_measure': 0.4910853925555208}
{'precision': 0.9039923954372624, 'recall': 0.881371640407785, 'f_measure': 0.8925387142186767}
{'precision': 0.9137254901960784, 'recall': 0.8637627432808156, 'f_measure': 0.88804192472606}
{'precision': 0.9757033248081841, 'recall': 0.7071362372567191, 'f_measure': 0.8199892530897367}
{'precision': 0.9785407725321889, 'recall': 0.633920296570899, 'f_measure': 0.7694038245219348}
{'precision': 0.9947712418300654, 'recall': 0.7052826691380908, 'f_measure': 0.8253796095444687}
{'precision': 1.0, 'recall': 0.6320667284522706, 'f_measure': 0.7745599091425327}
{'precision': 0.9769526248399488, 'recall': 0.7071362372567191, 'f_measure': 0.8204301075268817}
{'precision': 0.9799426934097422, 'recall': 0.633920296570899, 'f_measure': 0.7698368036015757}
{'precision': 0.9947712418300654, 'recall': 0.7

In [24]:
result_combined_length_85 = (
    evaluate_similarity(base_85_jac, length_a)  +  "\n" +
    evaluate_similarity(base_85_jac, length_p)  +  "\n" +
    evaluate_similarity(base_85_jac, length_ap)  +  "\n" +
    evaluate_similarity(base_85_jac, length_apy)  +  "\n" +
    evaluate_similarity(base_85_jac, length_appv)  +  "\n" +
    evaluate_similarity(base_85_jac, length_appvy)      
)

print(result_combined_length_85)

# length_app and length_appy okay - good

{'precision': 0.014294783837473632, 'recall': 0.9136125654450262, 'f_measure': 0.028149133950355894}
{'precision': 0.012527830392450071, 'recall': 0.9869109947643979, 'f_measure': 0.024741591468416733}
{'precision': 0.4384517766497462, 'recall': 0.9044502617801047, 'f_measure': 0.5905982905982906}
{'precision': 0.4384517766497462, 'recall': 0.9044502617801047, 'f_measure': 0.5905982905982906}
{'precision': 0.5782426778242677, 'recall': 0.9044502617801047, 'f_measure': 0.7054619703930576}
{'precision': 0.5782426778242677, 'recall': 0.9044502617801047, 'f_measure': 0.7054619703930576}


In [25]:
result_combined_sorted_85 = (
    evaluate_similarity(base_85_jac, sorted_a) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_initial_n2_a) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_initial_n3_a) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_h_a) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_initial_h_a) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_p) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_initial_n2_p) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_initial_n3_p) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_h_p) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_initial_h_p) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_ap) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_initial_n2_ap) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_initial_n3_ap) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_h_ap) +  "\n" +
    evaluate_similarity(base_85_jac, sorted_initial_h_ap) 
)

print(result_combined_sorted_85)

# only sorted_ap good

{'precision': 0.41106290672451196, 'recall': 0.9921465968586387, 'f_measure': 0.5812883435582822}
{'precision': 0.006535947712418301, 'recall': 0.0013089005235602095, 'f_measure': 0.0021810250817884407}
{'precision': 0.0, 'recall': 0.0, 'f_measure': 0}
{'precision': 0.0, 'recall': 0.0, 'f_measure': 0}
{'precision': 0.0009208103130755065, 'recall': 0.0013089005235602095, 'f_measure': 0.0010810810810810813}
{'precision': 0.16165890816758358, 'recall': 1.0, 'f_measure': 0.27832422586520944}
{'precision': 0.006493506493506494, 'recall': 0.002617801047120419, 'f_measure': 0.0037313432835820895}
{'precision': 0.03389830508474576, 'recall': 0.002617801047120419, 'f_measure': 0.004860267314702308}
{'precision': 0.0, 'recall': 0.0, 'f_measure': 0}
{'precision': 0.001658374792703151, 'recall': 0.002617801047120419, 'f_measure': 0.002030456852791878}
{'precision': 0.6773049645390071, 'recall': 1.0, 'f_measure': 0.8076109936575052}
{'precision': 0.004754358161648178, 'recall': 0.003926701570680628

In [34]:
similar_pairs_to_csv(initial_n2_p_85_jac,'../Matched/base_85_jac_match.csv')

In [31]:

dblp_csv = '../CSV-files/dblp_stem.csv'
dblp = pd.read_csv(dblp_csv)

acm_csv = '../CSV-files/acm_stem.csv'
acm = pd.read_csv(acm_csv)

threshold = 0.7

dblp['year'] = dblp['year'].astype(str)
acm['year'] = acm['year'].astype(str)


selected_columns = ['author_names']
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_a = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.exact_length_similarity, 'lengths')

selected_columns = ['paper_title']
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_p = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.exact_length_similarity, 'lengths')

selected_columns = ['author_names','paper_title']
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_ap = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.exact_length_similarity, 'lengths')

selected_columns = ['author_names','publication_venue']
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_apv = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.exact_length_similarity, 'lengths')

selected_columns = ['author_names','year']
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_ay = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.exact_length_similarity, 'lengths')

selected_columns = ['author_names','paper_title', 'year']
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_apy = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.exact_length_similarity, 'lengths')

selected_columns = ['author_names','publication_venue', 'year']
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_apvy = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.exact_length_similarity, 'lengths')

selected_columns = ['author_names','paper_title', 'publication_venue', 'year']
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_appvy = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.exact_length_similarity, 'lengths')


selected_columns = ['paper_title','publication_venue']
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_ppv = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.exact_length_similarity, 'lengths')

selected_columns = ['paper_title','publication_venue', 'year']
dblp_l = vl.length_blocking_multi_columns_named(dblp, selected_columns)
acm_l = vl.length_blocking_multi_columns_named(acm, selected_columns)
length_ppvy = m.apply_similarity_lengths(dblp_l, acm_l, threshold, sim.exact_length_similarity, 'lengths')

Processing time: 0.19475889205932617 seconds. Number of similar pairs: 50143
Processing time: 0.010220050811767578 seconds. Number of similar pairs: 60887
Processing time: 0.8428208827972412 seconds. Number of similar pairs: 1588
Processing time: 0.10789728164672852 seconds. Number of similar pairs: 287
Processing time: 0.028612136840820312 seconds. Number of similar pairs: 50143
Processing time: 0.8015661239624023 seconds. Number of similar pairs: 1588
Processing time: 0.10407495498657227 seconds. Number of similar pairs: 287
Processing time: 1.1466290950775146 seconds. Number of similar pairs: 2
Processing time: 0.06393885612487793 seconds. Number of similar pairs: 89
Processing time: 0.06463217735290527 seconds. Number of similar pairs: 89


In [33]:
result_combined_length = (
    evaluate_similarity(base_7_l, length_a) +  "\n" +
    evaluate_similarity(base_7_l, length_p) +  "\n" +
    evaluate_similarity(base_7_l, length_ap) +  "\n" +
    evaluate_similarity(base_7_l, length_apv) +  "\n" +
    evaluate_similarity(base_7_l, length_ay) +  "\n" +
    evaluate_similarity(base_7_l, length_appv) +  "\n" +
    evaluate_similarity(base_7_l, length_apy) +  "\n" +
    evaluate_similarity(base_7_l, length_appvy) +  "\n" +
    evaluate_similarity(base_7_l, length_ppv) +  "\n" +
    evaluate_similarity(base_7_l, length_ppvy) 
)

print(result_combined_length)

# only length_appv is okay

{'precision': 0.014008068975403961, 'recall': 0.9985401459854014, 'f_measure': 0.027628549501151192}
{'precision': 0.011381384375103845, 'recall': 1.0, 'f_measure': 0.022506612344137603}
{'precision': 0.434010152284264, 'recall': 0.9985401459854014, 'f_measure': 0.6050420168067228}
{'precision': 0.0, 'recall': 0.0, 'f_measure': 0}
{'precision': 0.014008068975403961, 'recall': 0.9985401459854014, 'f_measure': 0.027628549501151192}
{'precision': 0.5723849372384937, 'recall': 0.9985401459854014, 'f_measure': 0.727659574468085}
{'precision': 0.434010152284264, 'recall': 0.9985401459854014, 'f_measure': 0.6050420168067228}
{'precision': 0.0, 'recall': 0.0, 'f_measure': 0}
{'precision': 0.0, 'recall': 0.0, 'f_measure': 0}
{'precision': 0.0, 'recall': 0.0, 'f_measure': 0}


Clustering

In [13]:
import csv

# return pairs in Format [1232, 2323]
def read_matched_entities(file_path):
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        next(reader) 
        matched_entities = [row for row in reader]
    return matched_entities

# print cluster based on the specific 
def print_clusters(clusters):
    for i, cluster in enumerate(clusters):
        print(f'Cluster {i + 1}: {cluster}')

def cluster_to_csv(cluster_data, filename):
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for idx, item in enumerate(cluster_data, start=1):
            writer.writerow([f'Cluster {idx}: {item}'])

# clustering
file_path = '../Matched/Matched Entities.csv'
matched_entities = read_matched_entities(file_path)
cluster1 = c.build_clusters(matched_entities)
cluster2 = c.clustering_matches(matched_entities)
cluster_to_csv(cluster1, '../Matched/Clustered_Matched_Entities.csv' )

