# Spark Commands for the match algorithm

Step 1: We need to import our libraries


In [1]:
import requests
import datetime
import pandas as pd
from pyspark.sql import SparkSession 

Step 2: We need to start a Spark session

In [3]:
# Build (or reuse) the notebook's Spark session against the master at
# spark://spark-master:7077, giving each executor 512 MB of memory.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)


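Step 3: Define the main algorithm. It uploads both CSV files to their clusters, downloads the processed copies, joins them in Spark, and reports timings, precision, and recall.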

In [4]:
def _fetch_processed_csv(base_url, csv_path, hashed_column, noise, target_path):
    """Upload ``csv_path`` to a cluster service and save the CSV it sends back.

    The file is POSTed to ``{base_url}//take_data/`` and the processed result
    is then fetched from ``{base_url}//take_data/{hashed_column}/{noise}`` and
    written to ``target_path``.

    Raises:
        OSError: if ``csv_path`` cannot be read or ``target_path`` written.
        requests.HTTPError: if either request returns an error status, so an
            error page is never written to disk and parsed as CSV.
    """
    # Keep the upload handle in a context manager so it is closed once the
    # request has been sent.
    with open(csv_path, 'rb') as upload:
        response = requests.post(f"{base_url}//take_data/", files={'file': upload})
    response.raise_for_status()

    response = requests.get(f"{base_url}//take_data/{hashed_column}/{noise}")
    response.raise_for_status()
    with open(target_path, 'wb') as csv_file:
        csv_file.write(response.content)


def main(hashed_column, noise, csv_a, csv_b):
    """Run one matching experiment between cluster A and cluster B.

    Steps: upload both CSVs to their cluster services and download the
    processed copies, load them into Spark, inner-join them, write the joined
    rows to ``/opt/workspace/joined_result`` and compute match statistics.

    Args:
        hashed_column: value placed in the download URL path; presumably the
            name of the column the services hash -- confirm against the
            service API.
        noise: value placed in the download URL path after ``hashed_column``;
            presumably the amount of noise the services add -- confirm against
            the service API. Echoed back in the result row.
        csv_a: path of the CSV file uploaded to cluster A.
        csv_b: path of the CSV file uploaded to cluster B.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``noise``,
        ``download_time``, ``joined_time``, ``precision``, ``recall``,
        ``total_matches``, ``TP`` and ``FP``. ``precision`` is
        ``TP / total_matches`` and ``recall`` is ``TP`` divided by the number
        of rows downloaded from cluster A; either is ``0.0`` when its
        denominator is zero.

    Raises:
        OSError: if an input CSV cannot be read or a download cannot be saved.
        requests.HTTPError: if a cluster service answers with an error status.
    """
    a_download = "/opt/workspace/a_download.csv"
    b_download = "/opt/workspace/b_download.csv"

    # --- Phase 1: round-trip both files through their cluster services ---
    start = datetime.datetime.now()
    _fetch_processed_csv("http://cluster-a:9200", csv_a, hashed_column, noise, a_download)
    _fetch_processed_csv("http://cluster-b:9300", csv_b, hashed_column, noise, b_download)
    download_time = datetime.datetime.now() - start

    df_1 = spark.read.csv(path=a_download, sep=",", header=True)
    df_2 = spark.read.csv(path=b_download, sep=",", header=True)

    df_1.createOrReplaceTempView("a_cluster_data")
    df_2.createOrReplaceTempView("b_cluster_data")

    # --- Phase 2: join the two data sets and persist the result ---
    start = datetime.datetime.now()

    # NOTE(review): the join condition does not compare last_name, while the
    # true-positive filter below does -- confirm this is intentional.
    matched_data = spark.sql("SELECT  a.NCID as a_f1, a.first_name as a_f2, a.last_name as a_f3, a.midl_name as a_f4, a.street_name as a_f5, a.res_city_desc as a_f6, \
                                      b.NCID as b_f1, b.first_name as b_f2, b.last_name as b_f3, b.midl_name as b_f4, b.street_name as b_f5, b.res_city_desc as b_f6  \
                              FROM a_cluster_data as a \
                              INNER JOIN b_cluster_data as b \
                              ON a.NCID == b.NCID AND a.first_name == b.first_name AND a.midl_name == b.midl_name AND a.street_name == b.street_name AND a.res_city_desc == b.res_city_desc")

    # NOTE(review): this view holds the *matched* rows despite its name and is
    # not read in this function; kept because other cells may rely on it.
    matched_data.createOrReplaceTempView("not_matched_data")

    matched_data.repartition(1).write.mode('overwrite').csv("/opt/workspace/joined_result", header=True)

    joined_time = datetime.datetime.now() - start

    # --- Phase 3: score the matches ---
    matched_data.createOrReplaceTempView("matched_data")

    # True positives: joined rows whose six fields agree on both sides.
    true_positives = spark.sql("  SELECT *\
                      FROM matched_data \
                      WHERE matched_data.a_f1 == matched_data.b_f1 AND matched_data.a_f2 == matched_data.b_f2 AND matched_data.a_f3 == matched_data.b_f3 AND matched_data.a_f4 == matched_data.b_f4 AND matched_data.a_f5 == matched_data.b_f5 AND matched_data.a_f6 == matched_data.b_f6")

    total_documents = df_1.count()
    total_matches = matched_data.count()

    TP = true_positives.count()
    FP = total_matches - TP

    # TP + FP == total_matches; guard both ratios so an empty join or an
    # empty input yields 0.0 instead of a ZeroDivisionError.
    precision = TP / total_matches if total_matches else 0.0
    recall = TP / total_documents if total_documents else 0.0

    return pd.DataFrame(
        [[noise, download_time, joined_time, precision, recall, total_matches, TP, FP]],
        columns=['noise', 'download_time', 'joined_time', 'precision', 'recall', 'total_matches', 'TP', 'FP'],
    )
    

In [6]:
# Run the matching experiment for every data-set size and noise level, then
# write one summary CSV per data-set size.
hashed = 'NCID'
result_columns = ['noise', 'download_time', 'joined_time', 'precision', 'recall', 'total_matches', 'TP', 'FP']
for times in [400]:
    csv_a = f'{times}K_A.csv'
    csv_b = f'{times}K_B.csv'

    # Collect the one-row frames and concatenate once, instead of growing a
    # DataFrame inside the loop starting from an empty all-object frame.
    runs = []
    # range(0, -200, -200) yields only 0, i.e. a single run with noise 0.
    for x in range(0, -200, -200):
        runs.append(main(hashed, x, csv_a, csv_b))

    if runs:
        result = pd.concat(runs, axis=0, ignore_index=True)
    else:
        # No runs: still produce a header-only file, as before.
        result = pd.DataFrame(None, columns=result_columns)

    result.to_csv(f'123_{times}.csv', encoding='utf-8', header=True, index=False)
    