<a href="https://colab.research.google.com/github/jembi/mpi-toolkit-notebook/blob/cr-review-dataset/fastLink-notebook/Linking/FastLinkRecordLinking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing rpy2 
rpy2 runs an embedded R and provides access to it from Python through R’s own C API, via either:

- a high-level interface that makes R functions and objects behave just like Python functions and provides seamless conversion to numpy and pandas data structures, or

- a low-level interface that stays closer to the C API.

In [None]:
!pip install rpy2

## Import python packages

In [None]:
import pandas as pd
import logging
from rpy2.robjects import globalenv
from rpy2.robjects.vectors import StrVector
import rpy2.robjects as r_objects
import rpy2.robjects.packages as r_packages
r = r_objects.r

from ipywidgets import Dropdown
from ipywidgets import FloatSlider

# Notebook-wide logging: INFO and above, each record prefixed with a
# timestamp and its level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
)
logging.info('Started')

## Import R packages

In [None]:
# Pick a CRAN mirror, then install only the R packages that are missing.
utils = r_packages.importr('utils')
utils.chooseCRANmirror(ind=1)
pack_names = ('fastLink', 'tictoc', 'strex', 'data.table', 'csv', 'stringr')
names_to_install = [x for x in pack_names if not r_packages.isinstalled(x)]
if names_to_install:
  utils.install_packages(StrVector(names_to_install))

# Load the packages unconditionally. These calls used to live inside the
# `if` above, so on a machine where everything was already installed nothing
# was loaded and the later R snippets (str_detect, fastLink, data.table)
# could not find their functions.
base = r_packages.importr('base')
stats = r_packages.importr('stats')
fastLink = r_packages.importr('fastLink')
strex = r_packages.importr('strex')
data_table = r_packages.importr('data.table')
stringr = r_packages.importr('stringr')

## Upload a file from your computer

In [None]:
# Colab-only module: this import fails when the notebook runs outside Google Colab.
from google.colab import files
# Ask the user for a file to upload; the result of the call is kept in `uploaded`.
# NOTE(review): presumably a mapping keyed by uploaded file name — confirm against the Colab docs.
uploaded = files.upload()


## Reading a csv file and splitting it into 2 dataframes

In [5]:

# Read the CSV that was just uploaded instead of a hard-coded file name.
# Falls back to the sample file name when the upload cell was skipped
# (`uploaded` undefined) or when no *.csv file was among the uploads.
DEFAULT_CSV_PATH = "data-50-25.csv"
try:
  uploaded_names = list(uploaded)
except NameError:
  uploaded_names = []
csv_path = next(
    (name for name in uploaded_names if name.lower().endswith(".csv")),
    DEFAULT_CSV_PATH,
)
logging.info('Reading %s', csv_path)

# Load the file into the R global environment as `csv`; empty strings become NA.
globalenv['csv'] = r['read.csv'](csv_path, header=True, stringsAsFactors=False)
r('csv[csv==""] <- NA')
#r('csv$group    <- as.integer(substr(csv$ID, 5, 12))')
col_names_r = r('colnames(csv)')
col_names = list(col_names_r)
# Split the records into the two data frames to link, based on the marker
# embedded in the ID column ("-aaa-" -> dfA, "-bbb-" -> dfB).
r('dfA          <- csv[str_detect(csv$ID, "-aaa-"), ]')
r('dfB          <- csv[str_detect(csv$ID, "-bbb-"), ]')
s = r('structure(list(csv = csv, dfA = dfA, dfB = dfB))')

# Widgets whose values are read by the "Get user input" cell.
menu_1 = Dropdown(description="Choose your unique identifier", options=col_names)
# Label spelling fixed ("Levenshtein"); downstream code only compares against
# the "Jaro-Winkler" label, so the change does not affect the selection logic.
menu_2 = Dropdown(description="Choose your string distance algorithm", options=["Jaro-Winkler", "Levenshtein"])
slider_1 = FloatSlider(description="cut.a:", value=0.94, min=0, max=1, step=0.01)
slider_2 = FloatSlider(description="cut.p:", value=0.88, min=0, max=1, step=0.01)
display(menu_1, menu_2, slider_1, slider_2)


Dropdown(description='Choose your unique identifier', options=('ID', 'hivCaseReportNumber', 'name', 'fathersNa…

Dropdown(description='Choose your string distance algorithm', options=('Jaro-Winkler', 'Levensthein'), value='…

FloatSlider(value=0.94, description='cut.a:', max=1.0, step=0.01)

FloatSlider(value=0.88, description='cut.p:', max=1.0, step=0.01)

## Get user input
The Jaro–Winkler distance is a string metric measuring an edit distance between two sequences.
The lower the Jaro–Winkler distance for two strings is, the more similar the strings are. The score is normalized such that 0 means an exact match and 1 means there is no similarity. 

The Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (insertions, deletions or substitutions) required to change one word into the other.


In [None]:
# Translate the widget selections into the values handed to the linking step.
key = menu_1.value
# Any label other than "Jaro-Winkler" falls through to the "lv" method code.
string_distance = "jw" if menu_2.value == "Jaro-Winkler" else "lv"
cut_a, cut_p = slider_1.value, slider_2.value

# Echo the effective settings, one per line.
for chosen in (key, string_distance, cut_a, cut_p):
  print(chosen)

## fastLink: Fast Probabilistic Record Linkage with Missing Data
Implements a Fellegi-Sunter probabilistic record linkage model that allows for missing data and the inclusion of auxiliary information. This includes functionalities to conduct a merge of two datasets under the Fellegi-Sunter model using the Expectation-Maximization algorithm. In addition, tools for preparing, adjusting, and summarizing data merges are included.

## Link the 2 dataframes using fastLink

In [None]:
  logging.info("FastLink : initialized")

  # Fetch the two R data frames created by the CSV-loading cell.
  df_a = r('dfA')
  df_b = r('dfB')

  def fl_link(df_a, df_b, key, string_distance, cut_a, cut_p, n_cores=8):
      """Link two R data frames with fastLink and collapse the matches per record.

      The R source below is filled in with the arguments, evaluated (which
      assigns the function to `my_fl_link` on the R side) and then called
      with `df_a` and `df_b`.

      Args:
          df_a: R data frame passed as `dfA`.
          df_b: R data frame passed as `dfB`.
          key: Only interpolated into a commented-out R line, so it currently
              has no effect. NOTE(review): the linkage variables are always
              every column except the first one - confirm that is intended.
          string_distance: Value for fastLink's `stringdist.method`.
          cut_a: Value for fastLink's `cut.a`.
          cut_p: Value for fastLink's `cut.p`.
          n_cores: Value for fastLink's `n.cores`. Defaults to 8, the value
              that used to be hard-coded. `None` is sent as R `NULL`
              (presumably leaving the choice to fastLink - confirm in its docs).

      Returns:
          An R list with `fl_out` (the raw fastLink result) and `inds_ab`, a
          data.table with one row per matched `inds.a` value (column `V1`)
          and the comma-joined, sorted `inds.b` values it matched (column `V2`).
      """
      r_n_cores = 'NULL' if n_cores is None else int(n_cores)
      get_links = r('''
          my_fl_link <- function(dfA, dfB) {{
              pasteT <- function(x) {{
                  x <- sort(x)
                  x <- paste(x, collapse = ",")
                  x
              }}
              
              varnames <- colnames(dfA[,-1])
              #varnames <- varnames[-which(varnames %in% c('{0}'))]
              print(varnames)
              fl_out <- fastLink(dfA = dfA, dfB = dfB, varnames = varnames,
                                  stringdist.match = varnames, stringdist.method = '{1}', cut.a = {2}, cut.p = {3},
                                  dedupe.matches = FALSE, linprog.dedupe = FALSE,
                                  cond.indep = TRUE,
                                  n.cores = {4},
                                  verbose = TRUE)
              inds_ab <- data.table(cbind(fl_out$matches$inds.a, fl_out$matches$inds.b))
              inds_ab[, `:=`(V3, pasteT(V2)), by = V1]
              inds_ab <- inds_ab[,.(V1, V3)]
              inds_ab <- inds_ab[!duplicated(inds_ab)]
              setnames(inds_ab, 'V3', 'V2')
              structure(list(fl_out = fl_out, inds_ab = inds_ab))
          }}'''.format(key, string_distance, cut_a, cut_p, r_n_cores))
      return get_links(df_a, df_b)

## Display fastLink Logs to the console

In [8]:
def _log_gamma_rows(process, component, varnames, abs_index):
  """Log and collect the per-field rows of one EM gamma component.

  Args:
    process: Name of the R global holding the linking result.
    component: Name of the list under `fl_out$EM` ('p.gamma.k.m' or 'p.gamma.k.u').
    varnames: Field names, indexed in parallel with the component list.
    abs_index: Which of the two reported values (0 or 1) is shown as an absolute value.

  Returns:
    A list of `[format, field, value0, value1, value0 + value1]` rows, where
    the sum always uses the raw (signed) values.
  """
  # Evaluate the R expression once instead of once per value read.
  gamma = r('{0}$fl_out$EM${1}'.format(process, component))
  fmt = component + ' ----- %-20s : %3.10f  %3.10f  %3.10f'
  rows = []
  # NOTE(review): the range skips the first and the last entry of the list;
  # kept as in the original - confirm this is intentional and not an off-by-one.
  for i in range(1, len(gamma) - 1):
    first, second = gamma[i][0], gamma[i][1]
    shown = [first, second]
    shown[abs_index] = abs(shown[abs_index])
    row = [fmt, varnames[i], shown[0], shown[1], first + second]
    rows.append(row)
    logging.info(*row)
  return rows


def analytics(process):
  """Log a summary of a fastLink result stored in the R global environment.

  Args:
    process: Name of the R global (e.g. 'links') that holds the list with
      `fl_out` and `inds_ab`.

  Returns:
    A tuple `(varnames, em_p_gamma_k_m, em_p_gamma_k_u)`: the EM field names
    and the logged rows for `p.gamma.k.m` and `p.gamma.k.u`.
  """
  name = '{0}'.format(process)
  result = globalenv[name]
  fl_out = result.rx2('fl_out')
  em = fl_out.rx2('EM')
  varnames = tuple(em.rx2('varnames'))

  # Structure of the result. The first call used to pass a tuple as the log
  # message, which logged the tuple's repr instead of a formatted line.
  logging.info('%s: %s', name, tuple(result.names))
  logging.info('fl_out: %s', tuple(fl_out.names))
  # Previously labelled 'fl_out' although it lists the names of `matches`.
  logging.info('matches: %s', tuple(fl_out.rx2('matches').names))
  logging.info('EM: %s', tuple(em.names))
  logging.info('patterns.w: %s', tuple(em.rx2('patterns.w').names[1]))
  logging.info('varnames: %s', varnames)
  logging.info('patterns: %s', tuple(fl_out.rx2('patterns').names))
  logging.info('inds_ab: %s', tuple(result.rx2('inds_ab').names))
  # The posterior probability of a pair matching.
  logging.info('p.m: %f', r('{0}$fl_out$EM$p.m'.format(process))[0])
  # The posterior probability of a pair not matching.
  logging.info('p.u: %f', r('{0}$fl_out$EM$p.u'.format(process))[0])
  # The posterior of the matching probability for a specific matching field.
  logging.info("EM.p.gamma.k.m")
  em_p_gamma_k_m = _log_gamma_rows(process, 'p.gamma.k.m', varnames, abs_index=1)
  # The posterior of the non-matching probability for a specific matching field.
  logging.info("EM.p.gamma.k.u")
  em_p_gamma_k_u = _log_gamma_rows(process, 'p.gamma.k.u', varnames, abs_index=0)
  logging.info('iter.converge: %d', r('{0}$fl_out$EM$iter.converge'.format(process))[0])
  return varnames, em_p_gamma_k_m, em_p_gamma_k_u

In [None]:
# Run the linkage, keep the result in R as `links`, and log its summary.
globalenv['links'] = fl_link(df_a, df_b, key, string_distance, cut_a, cut_p)
log_info = analytics('links')
varnames, em_p_gamma_k_m, em_p_gamma_k_u = log_info

# V1: row index into dfA; V2: comma-joined row indices into dfB matched to it.
v1 = tuple(map(int, r('links$inds_ab$V1')))
v2 = tuple(r('links$inds_ab$V2'))
false_positives = 0
true_positives = 0

fields = ('ID',) + varnames
columns = list(('key',) + fields)
# Show the (still empty) output layout before the rows are collected.
print(pd.DataFrame(columns=columns))


def _row_values(r_row, names):
  """Return the first value of each named column of a one-row R data frame."""
  return tuple(r_row.rx2(name)[0] for name in names)


# Collect plain tuples and build each DataFrame once; growing a frame row by
# row through `.loc` re-allocates it on every insertion.
left_rows = []
right_rows = []
for i, (master_index, dupe_csv) in enumerate(zip(v1, v2)):
  master = r('dfA[{},]'.format(master_index))
  left_rows.append((i,) + _row_values(master, fields))
  # `key` ties every dfB candidate back to the dfA record it was matched to.
  for dupe_index in map(int, dupe_csv.split(',')):
    dup = r('dfB[{},]'.format(dupe_index))
    right_rows.append((i,) + _row_values(dup, fields))

left = pd.DataFrame(left_rows, columns=columns)
right = pd.DataFrame(right_rows, columns=columns)
    


## Display result to the screen

In [None]:
# Print both sides of the linkage, each prefixed with its label.
for label, frame in (("left side", left), ("right side", right)):
  print(f"{label} {frame}")