# Joining and selecting the matches

# General information
This notebook joins all the matches from the `matches_tf_idf` and `matches_rapidfuzz` files, recodes several variables, and saves the matches found in subsets by algorithm and DENUE version. It also merges the original names (before cleaning) onto each match. 

# Input files
1. **rapidfuzz_file_prefix:** `'/scratch/public/jpvasquez/MNCs_informality/Final_data/output/2-2-'` These files contain the matches found with the Rapidfuzz algorithm between firm names of DENUE and ORBIS.
2. **tf_idf_file_prefix:** `'/scratch/public/jpvasquez/MNCs_informality/Final_data/output/2-1-'` These files contain the matches found with the TF-IDF algorithm between firm names of DENUE and ORBIS.
3. **denue_names:** `'/scratch/public/jpvasquez/MNCs_informality/Intermediate_data/output/denue_names.csv'` This file contains a dataset where each row represents a firm, with all of its original names attached to a DENUE key. This file is for DENUE's original version. 
4. **denue_alternative_names:** `'/scratch/public/jpvasquez/MNCs_informality/Intermediate_data/output/denue_alternative_names.csv'` This file contains a dataset where each row represents a firm, with all of its original names attached to a DENUE key. This file is for DENUE's alternative version. 

In [1]:
# Prefix of the TF-IDF match files; every CSV starting with it is loaded.
tf_idf_file_prefix = '/scratch/public/jpvasquez/MNCs_informality/Final_data/output/2-1-'
# Prefix of the Rapidfuzz match files; every CSV starting with it is loaded.
rapidfuzz_file_prefix = '/scratch/public/jpvasquez/MNCs_informality/Final_data/output/2-2-'
# Original (pre-cleaning) firm names, merged onto the matches through 'llave_denue':
# one file for DENUE's original version and one for its alternative version.
denue_names_file = '/scratch/public/jpvasquez/MNCs_informality/Intermediate_data/output/denue_names.csv'
denue_alternative_names_file = '/scratch/public/jpvasquez/MNCs_informality/Intermediate_data/output/denue_alternative_names.csv'

# Output files
1. **#-final_matches_prefix:** `'/scratch/public/jpvasquez/MNCs_informality/Final_data/output/3-1-#-final_matches_*'` These files contain the final matches for each algorithm and DENUE's version (`#` goes from 1 to 4). An additional file, `3-1-5-final_matches.csv`, contains all the final matches together. 

In [2]:
# Stem shared by the name of every file this notebook writes.
output_prefix = 'final_matches'
# Folder that receives the output files. The trailing slash is required
# because output paths are built by plain string concatenation.
directory = '/scratch/public/jpvasquez/MNCs_informality/Final_data/output/'

# Packages
These are the packages needed to run this code. If the machine you're running this on is missing any of these packages, run this code: 

`!pip install package_name`

- **Pandas** is the package which handles importing, wrangling, cleaning and doing everything with the data. 
- **Glob** gets all the files from a directory with a prefix. 

In [3]:
import glob
import pandas as pd

# Importing the data

In [4]:
def _read_matches(prefix):
    """Read every CSV whose path starts with *prefix* into a single DataFrame.

    Files are read in sorted path order so that the row order of the result
    does not depend on the order in which the file system lists them.

    Raises FileNotFoundError when no file matches the prefix.
    """
    paths = sorted(glob.glob(prefix + '*.csv'))
    if not paths:
        # pd.concat on an empty list fails without saying which input is missing.
        raise FileNotFoundError('No match files found with prefix: ' + prefix)
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk, so a column cannot end up with mixed types.
    # NOTE(review): the truncated warnings saved in this cell's output look like
    # pandas DtypeWarning (mixed types) -- confirm they are gone after re-running.
    frames = [pd.read_csv(path, low_memory=False) for path in paths]
    return pd.concat(frames, ignore_index=True)


# Matches found by each algorithm, one row per match.
joint_matches_rapidfuzz = _read_matches(rapidfuzz_file_prefix)
joint_matches_tf_idf = _read_matches(tf_idf_file_prefix)
# Original firm names for each DENUE version, merged onto the matches when saving.
denue_names = pd.read_csv(denue_names_file, low_memory=False)
denue_alternative_names = pd.read_csv(denue_alternative_names_file, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


# Preparing the data

In [5]:
# Label every TF-IDF match with the algorithm that produced it.
joint_matches_tf_idf['algorithm'] = 'tf-idf'
# Discard the 'key' column from the Rapidfuzz matches
# (presumably so both frames share the same columns -- TODO confirm).
joint_matches_rapidfuzz = joint_matches_rapidfuzz.drop('key', axis=1)

## Concatenating the matches

In [6]:
# Stack the TF-IDF matches first and the Rapidfuzz matches after them,
# renumbering the rows from 0.
frames_to_stack = (joint_matches_tf_idf, joint_matches_rapidfuzz)
joint_matches = pd.concat(frames_to_stack, ignore_index=True)

## Encoding variables to numerical ones and recoding

In [7]:
# Restriction level of each geographic selection (1 = most restrictive).
# Every selection also exists with an '_alternative' suffix, which marks the
# same restriction applied to DENUE's alternative version.
selection_levels = {
    'both_entity_municipality': 1,
    'orbis_entity_municipality_denue_entity_big_companies': 2,
    'orbis_entity_municipality_denue': 3,
    'no_geo': 4,
}

# Build the three lookup tables from the single table above so they cannot
# drift apart: restriction level, numeric version and version label.
geozone_codes, version_codes, version_labels = {}, {}, {}
for selection, level in selection_levels.items():
    alternative = selection + '_alternative'
    geozone_codes[selection] = geozone_codes[alternative] = level
    version_codes[selection], version_codes[alternative] = 1, 2
    version_labels[selection], version_labels[alternative] = 'denue', 'denue_alternative'

# Selections missing from the tables are mapped to NaN by Series.map.
joint_matches['geozone_n'] = joint_matches['selection'].map(geozone_codes)
joint_matches = joint_matches.rename(columns={'selection': 'geozone'})
joint_matches['version_n'] = joint_matches['geozone'].map(version_codes)
joint_matches['version'] = joint_matches['geozone'].map(version_labels)
# The key must be 'tf-idf', the label assigned to the TF-IDF matches; it was
# previously misspelled 'td-idf', which left algorithm_n as NaN for those rows.
joint_matches['algorithm_n'] = joint_matches['algorithm'].map({'tf-idf': 1,
                                                              'rapidfuzz': 2})

## Keeping the most restrictive match for each firm
When running the algorithms, if a match is found controlling by both entity and municipality, there will be three duplicates of it in our database, because it will also be found in the big-companies-by-entity selection and in the all-DENUE selection. So, we keep only the most restrictive match for each firm in ORBIS and its matching pair in DENUE. 

In [8]:
# Columns that identify one match: rows agreeing on all of them are duplicates.
duplicate_keys = [
    'algorithm_n',
    'version_n',
    'bvdidnumber',
    'companyname',
    'entidad_x',
    'municipio_x',
    'llave_denue',
    'n_workers',
    'entidad_y',
    'municipio_y',
    'firm',
    'elegible_2',
    'accuracy',
]

# Sorting by geozone_n first means keep='first' retains, within each group of
# duplicates, the row with the lowest geozone_n; rows are then renumbered from 0.
joint_matches = joint_matches.sort_values(by=['geozone_n']).drop_duplicates(
    subset=duplicate_keys,
    keep='first',
    ignore_index=True,
)

# Saving the matches

## Splitting and saving the matches by algorithm and version

In [9]:
# Write one CSV per (algorithm, version) pair, numbered 3-1-1 to 3-1-4:
#   1 = tf-idf / denue            2 = other algorithm / denue
#   3 = tf-idf / other version    4 = other algorithm / other version
for (algorithm, version), matches in joint_matches.groupby(['algorithm', 'version']):
    uses_original_denue = version == 'denue'
    found_by_tf_idf = algorithm == 'tf-idf'

    # Attach the original names from the table that belongs to this DENUE version.
    names = denue_names if uses_original_denue else denue_alternative_names
    matches = matches.merge(names, on='llave_denue', how='left')

    if uses_original_denue:
        file_number = '1-' if found_by_tf_idf else '2-'
    else:
        file_number = '3-' if found_by_tf_idf else '4-'

    # '3-1-' is the prefix common to every file written by this loop.
    target_path = ''.join([directory, '3-1-', file_number, output_prefix,
                           '_', algorithm, '_', version, '.csv'])
    matches.to_csv(target_path, index=False)

## Save all the final matches

In [10]:
# The combined file with every final match takes number 5 in the 3-1 series.
index_prefix = '3-1-5-'
all_matches_path = ''.join([directory, index_prefix, output_prefix, '.csv'])
joint_matches.to_csv(all_matches_path, index=False)