In [1]:
!pip install pyspark
!pip install python-Levenshtein
!pip install fuzzywuzzy

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 45.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=a3b90a229b8f456deb847aba6964df62ada822038a4d16804de88fed4820f689
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1
Collecting python-Levenshtein
  Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)
[K     |████████████████████████████████| 50 kB 3.0 MB/s 
Building wheels for collected packages: python-Levenshtein
  Building whee

In [2]:
import pyspark.sql.functions as F

from pyspark.sql import SparkSession
from fuzzywuzzy import fuzz
from pyspark.sql.functions import row_number,lit, when, col
from pyspark.sql.window import Window
from pyspark.sql import Window as window

In [3]:
spark = SparkSession.builder \
    .appName("SparkByExamples.com") \
    .getOrCreate()
df = spark.read.options( header=True).csv('./M_Data_v2_All.csv')

In [4]:
df.show()

+----------+--------------------+---------+--------------------+-------------------+--------------+--------------+
|First Name|             Surname|Full Name|               Email|             Mobile|Categorisation|Email_Validity|
+----------+--------------------+---------+--------------------+-------------------+--------------+--------------+
|      NULL|03rd Project Desi...|     NULL|3rdprojectdesign@...|         7595452048|          Live|             1|
|      NULL|1 Kingsdown Parad...|     NULL|mharris@andrewson...|               null|          Live|             1|
|      NULL|1 The Paragon (Ma...|     NULL|lydmick@talktalk.net|               null|          Live|             1|
|      NULL|12 Marlborough St...|     NULL|jeremy.laird@gmai...|       07817 783521|          Live|             1|
|      NULL|14 Sefton Park Ro...|     NULL|scott_blankley@ho...|       07817 117602|        Lapsed|             1|
|      NULL|15 Linton Road Ha...|     NULL|1thackerayroad@ti...|       07970 652

In [5]:
w = Window().orderBy(lit('row_num'))
df = df.withColumn("row_num", row_number().over(w))

In [6]:
joined = df.crossJoin(df)

In [7]:
def rename_duplicate_columns(dataframe):
    columns = dataframe.columns
    duplicate_column_indices = list(set([columns.index(col) for col in columns if columns.count(col) == 2]))
    for index in duplicate_column_indices:
        columns[index] = columns[index]+'_2'
    dataframe = dataframe.toDF(*columns)
    return dataframe
joined = rename_duplicate_columns(joined)

In [8]:
def calculate_fuzz_ratio(
   First_Name, Surname, Full_Name, Email, Mobile, Categorisation, Email_Validity, First_Name_2, Surname_2, Full_Name_2, Email_2, Mobile_2, Categorisation_2, Email_Validity_2
):
  str1 = [First_Name, Surname, Full_Name, Email, Mobile, Categorisation, Email_Validity]
  str2 = [First_Name_2, Surname_2, Full_Name_2, Email_2, Mobile_2, Categorisation_2, Email_Validity_2]
  return fuzz.ratio(str1, str2)

In [9]:
sch = ['group_no', 'row_no', 'match_%']
joined = joined.rdd.map(lambda x: (x.row_num, x.row_num_2, 
                                calculate_fuzz_ratio(x['First Name'], x.Surname, x['Full Name'], x.Email, x.Mobile, x.Categorisation, x.Email_Validity, x['First Name_2'], x.Surname_2, x['Full Name_2'], x.Email_2, x.Mobile_2, x.Categorisation_2, x.Email_Validity_2))).toDF(sch)

In [10]:
joined.show()

+--------+------+-------+
|group_no|row_no|match_%|
+--------+------+-------+
|       1|     1|    100|
|       2|     1|     59|
|       3|     1|     57|
|       4|     1|     64|
|       5|     1|     59|
|       6|     1|     62|
|       7|     1|     56|
|       8|     1|     56|
|       9|     1|     63|
|      10|     1|     65|
|      11|     1|     57|
|      12|     1|     57|
|      13|     1|     58|
|      14|     1|     56|
|      15|     1|     62|
|      16|     1|     56|
|      17|     1|     57|
|      18|     1|     65|
|      19|     1|     58|
|      20|     1|     59|
+--------+------+-------+
only showing top 20 rows



In [11]:
joined = joined.where(joined['match_%'] >= 80)

## picking the best group number for each row of data

In [12]:
w = window.partitionBy('group_no')
joined = joined.select('group_no', 'row_no', 'match_%', F.count('group_no').over(w).alias('count')).sort('group_no', 'match_%')

In [13]:
joined.show()

+--------+------+-------+-----+
|group_no|row_no|match_%|count|
+--------+------+-------+-----+
|       1|     1|    100|    1|
|       2|     8|     81|   11|
|       2|     7|     83|   11|
|       2|    13|     83|   11|
|       2|    20|     83|   11|
|       2|    30|     85|   11|
|       2|    17|     86|   11|
|       2|    28|     86|   11|
|       2|    16|     87|   11|
|       2|    21|     90|   11|
|       2|    38|     91|   11|
|       2|     2|    100|   11|
|       3|     3|    100|    1|
|       4|     4|    100|    1|
|       5|     5|    100|    1|
|       6|     6|    100|    1|
|       7|    16|     80|   11|
|       7|     8|     81|   11|
|       7|    13|     81|   11|
|       7|    20|     81|   11|
+--------+------+-------+-----+
only showing top 20 rows



In [14]:
windowDept = Window.partitionBy("row_no").orderBy(col("count").desc())
joined = joined.withColumn("row",row_number().over(windowDept)) \
  .filter(col("row") == 1).drop("row")

In [15]:
joined.show()

+--------+------+-------+-----+
|group_no|row_no|match_%|count|
+--------+------+-------+-----+
|       1|     1|    100|    1|
|       2|     2|    100|   11|
|       3|     3|    100|    1|
|       4|     4|    100|    1|
|       5|     5|    100|    1|
|       6|     6|    100|    1|
|       2|     7|     83|   11|
|       2|     8|     81|   11|
|       9|     9|    100|    2|
|       9|    10|     97|    2|
|      11|    11|    100|    1|
|      12|    12|    100|    1|
|       2|    13|     83|   11|
|      14|    14|    100|    1|
|      15|    15|    100|    1|
|       2|    16|     87|   11|
|       2|    17|     86|   11|
|      18|    18|    100|    1|
|      19|    19|    100|    1|
|       2|    20|     83|   11|
+--------+------+-------+-----+
only showing top 20 rows



(optional) removed the match percentage of rows which are only matched with their self.

In [16]:
# joined = joined.withColumn('match_%', when(joined['count'] == 1, None).otherwise(joined['match_%']))
# joined.show()

# Final step

In [17]:
joined = joined.orderBy(col('group_no'), col('row_no'))

In [18]:
df = joined.join(df, joined.row_no == df.row_num, "inner")

In [19]:
df.show()

+--------+------+-------+-----+----------+--------------------+---------+--------------------+-------------------+--------------+--------------+-------+
|group_no|row_no|match_%|count|First Name|             Surname|Full Name|               Email|             Mobile|Categorisation|Email_Validity|row_num|
+--------+------+-------+-----+----------+--------------------+---------+--------------------+-------------------+--------------+--------------+-------+
|       1|     1|    100|    1|      NULL|03rd Project Desi...|     NULL|3rdprojectdesign@...|         7595452048|          Live|             1|      1|
|       2|     2|    100|   11|      NULL|1 Kingsdown Parad...|     NULL|mharris@andrewson...|               null|          Live|             1|      2|
|       3|     3|    100|    1|      NULL|1 The Paragon (Ma...|     NULL|lydmick@talktalk.net|               null|          Live|             1|      3|
|       4|     4|    100|    1|      NULL|12 Marlborough St...|     NULL|jeremy.la

In [20]:
df = df.select(F.row_number().over(Window.partitionBy(df['group_no']).orderBy(df['row_no'])).alias("row_num"),"match_%", "First Name", "Surname", "Full Name", "Email", "Mobile", "Categorisation", "Email_Validity")

In [21]:
df.show()

+-------+-------+----------+--------------------+---------+--------------------+-------------------+--------------+--------------+
|row_num|match_%|First Name|             Surname|Full Name|               Email|             Mobile|Categorisation|Email_Validity|
+-------+-------+----------+--------------------+---------+--------------------+-------------------+--------------+--------------+
|      1|    100|      NULL|03rd Project Desi...|     NULL|3rdprojectdesign@...|         7595452048|          Live|             1|
|      1|    100|      NULL|1 Kingsdown Parad...|     NULL|mharris@andrewson...|               null|          Live|             1|
|      2|     83|      NULL|1-6 Highbanks Res...|     NULL|mharris@andrewson...|               null|          Live|             1|
|      3|     81|      NULL|1-6 Northway Cour...|     NULL|mharris@andrewson...|               null|          Live|             1|
|      4|     83|      NULL|2 Russell Street ...|     NULL|mharris@andrewson...|   