In [1]:
import pyspark
from pprint import pprint
sc = pyspark.SparkContext(appName='aas')
sc

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('aas').getOrCreate()
spark

In [3]:
rawblocks = sc.textFile('./linkage_csv')
rawblocks

./linkage_csv MapPartitionsRDD[1] at textFile at <unknown>:0

In [4]:
rawblocks.first()

'"id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match"'

In [5]:
head = rawblocks.take(10)
pprint(head)

['"id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match"',
 '37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE',
 '39086,47614,1,?,1,?,1,1,1,1,1,TRUE',
 '70031,70237,1,?,1,?,1,1,1,1,1,TRUE',
 '84795,97439,1,?,1,?,1,1,1,1,1,TRUE',
 '36950,42116,1,?,1,1,1,1,1,1,1,TRUE',
 '42413,48491,1,?,1,?,1,1,1,1,1,TRUE',
 '25965,64753,1,?,1,?,1,1,1,1,1,TRUE',
 '49451,90407,1,?,1,?,1,1,1,1,0,TRUE',
 '39932,40902,1,?,1,?,1,1,1,1,1,TRUE']


In [6]:
rawblocks.take(5)

['"id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match"',
 '37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE',
 '39086,47614,1,?,1,?,1,1,1,1,1,TRUE',
 '70031,70237,1,?,1,?,1,1,1,1,1,TRUE',
 '84795,97439,1,?,1,?,1,1,1,1,1,TRUE']

In [7]:
head = sc.parallelize(head)

In [8]:
head.foreach(print)

In [9]:
def isHeader(line):
    if line.find('id_1')==-1: return False
    else: return True
    
head.filter(isHeader).foreach(print)

In [10]:
head.filter(lambda x: not isHeader(x)).foreach(print)

In [11]:
parsed = spark.read.option('header','true').option('nullValue','?').option('inferSchema','true').csv('./linkage_csv/block_1.csv')

In [12]:
parsed.printSchema()

root
 |-- id_1: integer (nullable = true)
 |-- id_2: integer (nullable = true)
 |-- cmp_fname_c1: double (nullable = true)
 |-- cmp_fname_c2: double (nullable = true)
 |-- cmp_lname_c1: double (nullable = true)
 |-- cmp_lname_c2: double (nullable = true)
 |-- cmp_sex: integer (nullable = true)
 |-- cmp_bd: integer (nullable = true)
 |-- cmp_bm: integer (nullable = true)
 |-- cmp_by: integer (nullable = true)
 |-- cmp_plz: integer (nullable = true)
 |-- is_match: boolean (nullable = true)



In [13]:
parsed.count()

574913

In [14]:
parsed.cache()

DataFrame[id_1: int, id_2: int, cmp_fname_c1: double, cmp_fname_c2: double, cmp_lname_c1: double, cmp_lname_c2: double, cmp_sex: int, cmp_bd: int, cmp_bm: int, cmp_by: int, cmp_plz: int, is_match: boolean]

In [15]:
parsed.take(10)

[Row(id_1=37291, id_2=53113, cmp_fname_c1=0.833333333333333, cmp_fname_c2=None, cmp_lname_c1=1.0, cmp_lname_c2=None, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=0, is_match=True),
 Row(id_1=39086, id_2=47614, cmp_fname_c1=1.0, cmp_fname_c2=None, cmp_lname_c1=1.0, cmp_lname_c2=None, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=1, is_match=True),
 Row(id_1=70031, id_2=70237, cmp_fname_c1=1.0, cmp_fname_c2=None, cmp_lname_c1=1.0, cmp_lname_c2=None, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=1, is_match=True),
 Row(id_1=84795, id_2=97439, cmp_fname_c1=1.0, cmp_fname_c2=None, cmp_lname_c1=1.0, cmp_lname_c2=None, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=1, is_match=True),
 Row(id_1=36950, id_2=42116, cmp_fname_c1=1.0, cmp_fname_c2=None, cmp_lname_c1=1.0, cmp_lname_c2=1.0, cmp_sex=1, cmp_bd=1, cmp_bm=1, cmp_by=1, cmp_plz=1, is_match=True),
 Row(id_1=42413, id_2=48491, cmp_fname_c1=1.0, cmp_fname_c2=None, cmp_lname_c1=1.0, cmp_lname_c2=None, cmp_sex=1, cmp_bd=1, cmp_bm=1

In [16]:
parsed.rdd.map(lambda x: x.is_match).countByValue()

defaultdict(int, {True: 2093, False: 572820})

In [17]:
parsed.groupBy('is_match').count().orderBy('count',ascending=False).show()

+--------+------+
|is_match| count|
+--------+------+
|   false|572820|
|    true|  2093|
+--------+------+



In [18]:
parsed.agg({'cmp_sex':'avg','cmp_sex':'stddev'})

DataFrame[stddev(cmp_sex): double]

In [19]:
parsed.agg({'cmp_sex':'avg','cmp_sex':'stddev'}).show()

+-------------------+
|    stddev(cmp_sex)|
+-------------------+
|0.20710152240504381|
+-------------------+



In [20]:
parsed.agg({'cmp_bd':'avg','cmp_sex':'stddev'}).show()

+-------------------+-------------------+
|    stddev(cmp_sex)|        avg(cmp_bd)|
+-------------------+-------------------+
|0.20710152240504381|0.22475563232907309|
+-------------------+-------------------+



In [21]:
parsed.createOrReplaceTempView('linkage')

In [22]:
spark.sql('select is_match, count(*) cnt from linkage group by is_match order by cnt desc').show()

+--------+------+
|is_match|   cnt|
+--------+------+
|   false|572820|
|    true|  2093|
+--------+------+



In [23]:
summary = parsed.describe()
summary.show()

+-------+------------------+-----------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+--------------------+
|summary|              id_1|             id_2|       cmp_fname_c1|      cmp_fname_c2|       cmp_lname_c1|       cmp_lname_c2|            cmp_sex|             cmp_bd|            cmp_bm|             cmp_by|             cmp_plz|
+-------+------------------+-----------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------------------+--------------------+
|  count|            574913|           574913|             574811|             10325|             574913|                239|             574913|             574851|            574851|             574851|              573618|
|   mean|33271.962171667714| 66564.6636865056| 0.7127592938251666|0.8977586763518972|0.315572457

In [24]:
summary.select('summary','cmp_fname_c1','cmp_fname_c2').show()

+-------+-------------------+------------------+
|summary|       cmp_fname_c1|      cmp_fname_c2|
+-------+-------------------+------------------+
|  count|             574811|             10325|
|   mean| 0.7127592938251666|0.8977586763518972|
| stddev|0.38892864524635457| 0.274257752043053|
|    min|                0.0|               0.0|
|    max|                1.0|               1.0|
+-------+-------------------+------------------+



In [25]:
matches = parsed.where('is_match=true')
matchSummary = matches.describe()

In [26]:
matchSummary.select('summary','cmp_fname_c1','cmp_fname_c2').show()

+-------+-------------------+-------------------+
|summary|       cmp_fname_c1|       cmp_fname_c2|
+-------+-------------------+-------------------+
|  count|               2091|                128|
|   mean| 0.9970329792424486| 0.9955357142857143|
| stddev|0.03979189523588238|0.05050762722761048|
|    min|                0.0|  0.428571428571429|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [27]:
misses = parsed.filter(parsed.is_match==False)
misses.describe().select('summary','cmp_fname_c1','cmp_fname_c2').show()

+-------+------------------+-------------------+
|summary|      cmp_fname_c1|       cmp_fname_c2|
+-------+------------------+-------------------+
|  count|            572720|              10197|
|   mean|0.7117214109570877| 0.8965313093953877|
| stddev|0.3892503865780529|0.27569600395266136|
|    min|               0.0|                0.0|
|    max|               1.0|                1.0|
+-------+------------------+-------------------+

