# Results overview

This notebook presents an overview of the results and also checks that the output complies with the following restrictions:

- Only one combination of IDs should be in the data (no permutations).
- When the similarity is 1 for a specific name, all combinations of the unique IDs sharing that name should be present in the data.

In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used throughout this notebook.
# Executor sizing options kept for reference, currently disabled:
#   spark.dynamicAllocation.enabled = False
#   spark.executor.instances        = 4
#   spark.executor.cores            = 13
#   spark.executor.memory           = '14g'
spark = (
    SparkSession.builder
    .appName("NameMatching_Notebook")
    .config('spark.executorEnv.PYTHON_EGG_CACHE', '/tmp')
    .config('spark.driver.memory', '7g')
    .getOrCreate()
)

# Keep a handle on the SparkContext and set its log level.
sc = spark.sparkContext
sc.setLogLevel("INFO")

In [None]:
from pyspark.sql import functions as sf

In [None]:
# Parquet location of the original operators data (loaded below as `operators`).
operators_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/OPERATORS.parquet'
# Parquet location of the matches found (loaded below as `matches`).
matches_dir = 'adl://ulohubdldevne.azuredatalakestore.net/data/parquet/OPERATORS_MATCHED.parquet'

**Load matches found**

In [None]:
# Read the matches from parquet.
matches = spark.read.parquet(matches_dir)
# Mark the DataFrame for caching: `matches` is queried by most of the cells below.
matches.persist()

**Load original data**

In [None]:
# Read the original operators and count how many there are in each country.
operators = spark.read.parquet(operators_dir)
opersators_count = (
    operators
    .groupBy('COUNTRY_CODE')
    .count()
    .withColumnRenamed('count', 'nr_operators')
)

## Overview

**How many matches do we have per country?**

In [None]:
# Number of matched pairs found in each country.
matches_count = (
    matches
    .groupBy('COUNTRY_CODE')
    .count()
    .withColumnRenamed('count', 'nr_matches')
)

In [None]:
# Per-country summary: operators, matches and the share of all possible pairs
# that were actually matched.
#
# n operators can form n * (n - 1) / 2 unordered pairs. The matches are expected
# to hold each pair only once (no permutations), so that figure - not n ** 2 -
# is the number of combinations the ratio is measured against.
# Countries with fewer than two operators cannot form a pair; their ratio is
# left null instead of dividing by zero.
matches_stats = (
    matches_count
    .join(opersators_count, on='COUNTRY_CODE', how='outer')
    .select('COUNTRY_CODE', 'nr_operators', 'nr_matches')
    .withColumn(
        'ratio_nr_matches_vs_nr_combinations',
        sf.when(
            sf.col('nr_operators') > 1,
            sf.col('nr_matches')
            / (sf.col('nr_operators') * (sf.col('nr_operators') - 1) / 2)
        )
    )
)

In [None]:
# Show the summary with the largest countries (by number of operators) first.
(matches_stats
 .orderBy(sf.col('nr_operators').desc())
 .show(truncate=False))

**Maximum nr. of matched elements for a single `SOURCE_ID`, per country**

In [None]:
# Count the matches attached to every SOURCE_ID, then keep the largest count
# seen in each country and list the ten highest.
(matches
 .groupBy('COUNTRY_CODE', 'SOURCE_ID')
 .count()
 .groupBy('COUNTRY_CODE')
 .agg(sf.max('count').alias('count'))
 .orderBy(sf.col('count').desc())
 .show(10))

**Similarity average per country**

In [None]:
# Mean similarity of the matches in each country, lowest average first.
(matches
 .groupBy('COUNTRY_CODE')
 .agg(sf.avg('SIMILARITY').alias('AVG_SIMILARITY'))
 .orderBy(sf.col('AVG_SIMILARITY').asc())
 .show())

**What ratio of the total matches per country are perfect matches?**

In [None]:
# Number of matches per country whose similarity is at least 1.
perfect_matches_count = (
    matches
    .where(sf.col('similarity') >= 1.0)
    .groupBy('COUNTRY_CODE')
    .count()
    .withColumnRenamed('count', 'nr_perfect_matches')
)

In [None]:
# Share of perfect matches among all matches of a country, highest share first.
# The inner join keeps only countries that have at least one perfect match.
(matches_count
 .join(perfect_matches_count, 'COUNTRY_CODE', 'inner')
 .withColumn('ratio', sf.col('nr_perfect_matches') / sf.col('nr_matches'))
 .orderBy(sf.col('ratio').desc())
 .show(20))

## Restrictions

In [None]:
from itertools import combinations
from pyspark.sql.types import BooleanType, IntegerType

**Check that only combinations are in the data (no permutations)**

In [None]:
# A pair counts as repeated when its mirror image is present as well: the row
# (SOURCE_ID=a, TARGET_ID=b) is kept if some row (SOURCE_ID=b, TARGET_ID=a)
# exists for the same country. This has to be a join: comparing the two IDs
# within a single row can only ever flag rows whose IDs are equal, never a
# permutation stored in another row. Self-pairs (a, a) are their own mirror
# image, so they are reported too.
repeated_combinations = matches.join(
    (matches
     .select('COUNTRY_CODE',
             sf.col('SOURCE_ID').alias('TARGET_ID'),
             sf.col('TARGET_ID').alias('SOURCE_ID'))
     .distinct()),
    on=['COUNTRY_CODE', 'SOURCE_ID', 'TARGET_ID'],
    how='leftsemi')  # semi-join: keep the columns and row multiplicity of `matches`

nr_repeated_combinations = repeated_combinations.count()

if nr_repeated_combinations:
    print("Found", nr_repeated_combinations, 'repeated combinations. This can also be because more than one entry in the data has the same ID, which shouldn not be the case')
    repeated_combinations.select('COUNTRY_CODE', 'SOURCE_ID', 'TARGET_ID').show(10, truncate=False)

**Check that the IDs in column `id_j` are unique**

Here we check that an id (`id_j`) belongs to a single group.

In [None]:
# True when no TARGET_ID is shared by two rows, i.e. de-duplicating on
# TARGET_ID removes nothing.
(matches.dropDuplicates(['TARGET_ID']).count()
 == matches.count())

**Check that group identifiers `id_i` do not belong to any group**

Here we check that the IDs identifying a group (`id_i`) do not appear as a member id (are not in column `id_j`)

In [None]:
# IDs that appear both as a group identifier (SOURCE_ID) and as a group member
# (TARGET_ID). The overlap is computed directly with a set intersection, so the
# outcome does not depend on how often an ID is repeated in either column.
# Null IDs are dropped so that missing values on both sides are not reported
# as an overlap.
group_ids_as_members = (matches
                        .selectExpr('SOURCE_ID as ID')
                        .intersect(matches.selectExpr('TARGET_ID as ID'))
                        .dropna())

if group_ids_as_members.count():
    print("Found some group ids (SOURCE_ID) as members of a group (TARGET_ID)")
    group_ids_as_members.show(truncate=False)
else:
    print('No group ids (SOURCE_ID) found as a member of a group (TARGET_ID)')

**Check that for exact matches all the combinations appear**

NOTE: This test applies only to ungrouped data.

In [None]:
# Exact matches: rows whose source and target names are identical.
perfect_matches = matches.filter(
    sf.col('SOURCE_NAME') == sf.col('TARGET_NAME')
)

In [None]:
def check_combinations(combs):
    """Flag a list of ID pairs that does not contain every possible combination.

    Args:
        combs: list of strings, each holding two IDs joined by the literal
            text ``<separator>``.

    Returns:
        1 when the number of pairs in ``combs`` differs from the number of
        2-element combinations of the unique IDs found in it, 0 otherwise.
    """
    unique_ids = {_id for pair in combs for _id in pair.split('<separator>')}
    # "n choose 2" in closed form: no need to build every pair just to count them.
    nr_possible_combinations = len(unique_ids) * (len(unique_ids) - 1) // 2
    return int(nr_possible_combinations != len(combs))

def nr_unique_ids(combs):
    """Count the distinct IDs that occur in a list of ID pairs.

    Args:
        combs: list of strings, each holding two IDs joined by the literal
            text ``<separator>``.

    Returns:
        The number of different IDs found across all pairs.
    """
    return len({_id for pair in combs for _id in pair.split('<separator>')})

def nr_combinations(combs):
    """Return how many entries (ID pairs) the list ``combs`` holds."""
    pair_count = len(combs)
    return pair_count

# Wrap the three plain-Python helpers as Spark UDFs returning an integer column.
udf_check_combinations, udf_nr_combinations, udf_nr_unique_ids = [
    sf.udf(helper, IntegerType())
    for helper in (check_combinations, nr_combinations, nr_unique_ids)
]

In [None]:
# For every (country, name) group of exact matches: gather its ID pairs as
# 'SOURCE_ID<separator>TARGET_ID' strings, then derive the completeness flag,
# the number of pairs and the number of distinct IDs. Failing groups come first.
grouped_perfect_matches = (
    perfect_matches
    .groupBy('COUNTRY_CODE', 'SOURCE_NAME')
    .agg(
        sf.collect_list(
            sf.concat('SOURCE_ID', sf.lit('<separator>'), 'TARGET_ID')
        ).alias('IDs')
    )
    .withColumn('fail_test', udf_check_combinations('IDs'))
    .withColumn('nr_combinations', udf_nr_combinations('IDs'))
    .withColumn('unique_ids', udf_nr_unique_ids('IDs'))
    .orderBy(sf.col('fail_test').desc())
)

In [None]:
# Groups whose pairs do not cover every combination of their IDs.
failed_test = grouped_perfect_matches.filter(sf.col('fail_test') == 1)
failed_test.show(n=5)

In [None]:
# Failing US groups whose name is shorter than 300 characters.
# `sf.length` measures the name directly; counting the pieces of an empty-pattern
# split gives a result that depends on how the Spark version treats that split.
(failed_test
 .where((sf.length('SOURCE_NAME') < 300) &
        (sf.col('COUNTRY_CODE') == 'US'))
 .select('SOURCE_NAME', 'nr_combinations', 'unique_ids')
 .show(truncate=False))

In [None]:
# Drill into one US name: list the ID pairs where both sides carry exactly
# this name and the similarity is at least 0.8.
(matches
 .filter((sf.col('COUNTRY_CODE') == 'US') &
         (sf.col('SOURCE_NAME') == '9ten restaurant newyork 7th ave910 10019') &
         (sf.col('TARGET_NAME') == '9ten restaurant newyork 7th ave910 10019') &
         (sf.col('SIMILARITY') >= 0.8))
 .select('SOURCE_ID', 'TARGET_ID', 'SOURCE_NAME')
 .show(truncate=False))

In [None]:
# All matches recorded for one specific source name.
(matches
 .filter(sf.col('SOURCE_NAME') == 'business name city 21 12345')
 .show())

In [None]:
# Overview of every exact-match group with its check result and counts.
(grouped_perfect_matches
 .select(['SOURCE_NAME', 'fail_test', 'nr_combinations', 'unique_ids'])
 .show(truncate=False))

**Number of failed checks per country**

In [None]:
# Total number of failing groups in each country, worst country first.
(grouped_perfect_matches
 .groupBy('COUNTRY_CODE')
 .agg(sf.sum('fail_test').alias('fail_test'))
 .orderBy(sf.col('fail_test').desc())
 .show())