# Spark Commands for the match algorithm

Step 1: We need to import our libraries


In [1]:
import pyspark
import requests
import datetime
import concurrent.futures
import glob
import pandas as pd
from pyspark.sql import SparkSession 
import json 

Step 2: We need to start a spark session

In [2]:
spark = SparkSession.\
        builder.\
        appName("pyspark-notebook").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "512m").\
        getOrCreate()


Step 3: This is the main algorithm

In [3]:
def main(hashed_column, noise, csv_a, csv_b):
    
    start = datetime.datetime.now()

    myfiles =  {'file': open(csv_a,'rb')}  
    requests.post(f"http://cluster-a:9200//take_data/", files=myfiles)
    request = requests.get(f'http://cluster-a:9200//take_data/{hashed_column}/{noise}')
    url_content = request.content
    with open("/opt/workspace/a_download.csv", 'wb') as csv_file:
        csv_file.write(url_content)

    myfiles =  {'file': open(csv_b,'rb')}  
    requests.post(f"http://cluster-b:9300//take_data/", files=myfiles)
    request = requests.get(f'http://cluster-b:9300//take_data/{hashed_column}/{noise}')
    url_content = request.content
    with open("/opt/workspace/b_download.csv", 'wb') as csv_file:
        csv_file.write(url_content)


    download_time = datetime.datetime.now() - start

    df_1 = spark.read.csv(path="/opt/workspace/a_download.csv", sep=",", header=True)
    df_2 = spark.read.csv(path="/opt/workspace/b_download.csv", sep=",", header=True)

    df_1.createOrReplaceTempView("a_cluster_data")
    df_2.createOrReplaceTempView("b_cluster_data")

    start = datetime.datetime.now()
    
    
    matched_data = spark.sql("  SELECT a.NCID as a_id, a.first_name as a_name, a.last_name as a_surname, b.NCID as b_id, b.first_name as b_name, b.last_name as b_surname   \
                                FROM a_cluster_data as a \
                                INNER JOIN b_cluster_data  as b\
                                ON a.first_name == b.first_name AND a.last_name ==  b.last_name ")  
    
    matched_data.createOrReplaceTempView("not_matched_data")

                                 
    #matched_data.repartition(1).write.mode('overwrite').csv(f"/opt/workspace/joined_result", header=True)

    joined_time = datetime.datetime.now() - start

    matched_data.createOrReplaceTempView("matched_data")
    
    #The number of the true positives
    TP = spark.sql("  SELECT *\
                      FROM matched_data \
                      WHERE matched_data.a_id == matched_data.b_id and matched_data.a_name == matched_data.b_name and matched_data.a_surname == matched_data.b_surname")
    
    total_documents = df_1.count()
    total_matches = matched_data.count()
    
    TP = TP.count()
    FP = total_matches - TP
    
    precision = TP / ( TP + FP ) 
    
    recall    = TP / total_documents
    
    return pd.DataFrame([[noise, download_time, joined_time , precision, recall ,total_matches, TP, FP ]], columns=['noise', 'download_time', 'joined_time', 'precision', 'recall' ,'total_matches', 'TP', 'FP'])
    

In [None]:
hashed = 'NCID'
for times in [100,200,400]:
    csv_a = f'book_chapter_table_25p_{times}k_A.csv'
    csv_b = f'book_chapter_table_25p_{times}k_1_B.csv'
    result = pd.DataFrame(None, columns=['noise', 'download_time', 'joined_time', 'precision', 'recall', 'total_matches', 'TP' , 'FP' ])
    for x in range (0,1000,10):
        result = pd.concat([result, main(hashed, x, csv_a, csv_b)], axis=0)

    result.to_csv(f'final_data_{times}.csv', encoding='utf-8', header=True, index=False)
    print(result)

   noise          download_time            joined_time  precision    recall  \
0      0 0 days 00:00:14.629907 0 days 00:00:00.051191   0.137448  0.210380   
0     10 0 days 00:00:11.012946 0 days 00:00:00.027296   0.137450  0.191291   
0     20 0 days 00:00:11.514408 0 days 00:00:00.035838   0.137525  0.175492   
0     30 0 days 00:00:12.820243 0 days 00:00:00.045089   0.137637  0.162185   
0     40 0 days 00:00:10.273616 0 days 00:00:00.031899   0.137689  0.150700   
..   ...                    ...                    ...        ...       ...   
0    950 0 days 00:01:40.218945 0 days 00:00:00.071240   0.278972  0.049461   
0    960 0 days 00:01:12.501140 0 days 00:00:00.022133   0.282023  0.049820   
0    970 0 days 00:01:26.401098 0 days 00:00:00.170046   0.283330  0.049682   
0    980 0 days 00:01:18.333027 0 days 00:00:00.519670   0.286270  0.049857   
0    990 0 days 00:01:30.780653 0 days 00:00:00.036341   0.288286  0.050011   

   total_matches     TP      FP  
0         153062 