# Stage 3 - Entity Mathcing 

In [1]:
from collections import Counter
import os

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd

import py_entitymatching as em

## Data

The type of entity we want to match is company. The two tables are Fortune 500 lists crawled from [Fortune](http://fortune.com/) and NASDAQ company information downloaded from [NASDAQ](http://www.nasdaq.com/). See detailed information about our data below.

### Fortune 500 data
(Description goes here...)

In [2]:
forbes_filename = './dataset/structured_data/forbes_all.csv'
forbes_df = pd.read_csv(open(forbes_filename, encoding = "ISO-8859-1"))
print("# tuples:", len(forbes_df))

# tuples: 3110


Here is some sample tuples.

In [3]:
forbes_df[:5]

Unnamed: 0,ID,Company,Country,Industry,Sales (M),Profits (M),Assets (M),Market Value (M),Employee
0,1,3i Group,United Kingdom,Investment Services,485,925,7500,6700,-
1,2,3M,United States,Conglomerates,30300,4800,32700,102200,-
2,3,4moms,United States,Consumer Durables,$48,-,-,-,175
3,4,77 Bank,Japan,Regional Banks,853,165,69100,1400,-
4,5,84 Lumber,United States,Retailing,2500,-,-,-,4700


### NASDAQ data
(Description goes here...)

In [4]:
nasdaq_filename = './dataset/structured_data/nasdaq.csv'
nasdaq_df = pd.read_csv(nasdaq_filename)
print("# tuples:", len(nasdaq_df))

# tuples: 6709


Here is some sample tuples.

In [5]:
nasdaq_df[:5]

Unnamed: 0,ID,Symbol,Name,LastSale,MarketCap,IPOyear,Sector,industry,Summary Quote
0,1,PIH,"1347 Property Insurance Holdings, Inc.",7.25,$43.2M,2014.0,Finance,Property-Casualty Insurers,http://www.nasdaq.com/symbol/pih
1,2,FLWS,"1-800 FLOWERS.COM, Inc.",10.15,$665.53M,1999.0,Consumer Services,Other Specialty Stores,http://www.nasdaq.com/symbol/flws
2,3,FCCY,1st Constitution Bancorp (NJ),18.6,$148.68M,,Finance,Savings Institutions,http://www.nasdaq.com/symbol/fccy
3,4,SRCE,1st Source Corporation,47.06,$1.22B,,Finance,Major Banks,http://www.nasdaq.com/symbol/srce
4,5,VNET,"21Vianet Group, Inc.",7.24,$823.07M,2011.0,Technology,"Computer Software: Programming, Data Processing",http://www.nasdaq.com/symbol/vnet


We are to make use of the fields of name, industry, sector and market value.

In [6]:
all_fields = ['Name', 'Industry', 'MarketValue']

In [7]:
forbes = forbes_df.rename(columns={'Company':'Name', 'Market Value (M)':'MarketValue'})

In [8]:
nasdaq = nasdaq_df.rename(columns={'industry':'Industry', 'MarketCap':'MarketValue'})

## Blocking

In [9]:
A = forbes
em.set_key(A, "ID")

True

In [10]:
B = nasdaq
em.set_key(B, "ID")

True

In [11]:
ob = em.OverlapBlocker()

In [12]:
ob.stop_words.extend(w.lower() for w in [
    'property', 'holdings', 'inc', 'bancorp', 'Bancorporation'
    'Corporation', 'Group', 'Company',
    'Insurance', 'Bank', 'Pharmaceuticals', 'Pharma', 
    'Systems', 'Technologies', 'Express',
    'Mining', 'Energy', 'Electric', 'Gas', 'Oil', 'Power'])
ob.stop_words = list(set(ob.stop_words))

In [13]:
## jz: not yet sure how to combine multiple occurrence of same companies, e.g. "Zions Bancorp" vs "Zions Bancorp."
AA = ob.block_tables(A, A, 'Name', 'Name', word_level=True, overlap_size=1, rem_stop_words=True,
                    l_output_attrs=all_fields, 
                    r_output_attrs=all_fields,
                    show_progress=False)
print(len(AA))
AA.head()

49499


Unnamed: 0,_id,ltable_ID,rtable_ID,ltable_Name,ltable_Industry,ltable_MarketValue,rtable_Name,rtable_Industry,rtable_MarketValue
0,0,1,1,3i Group,Investment Services,6700,3i Group,Investment Services,6700
1,1,2,2,3M,Conglomerates,102200,3M,Conglomerates,102200
2,2,3,3,4moms,Consumer Durables,-,4moms,Consumer Durables,-
3,3,4,4,77 Bank,Regional Banks,1400,77 Bank,Regional Banks,1400
4,4,5,5,84 Lumber,Retailing,-,84 Lumber,Retailing,-


In [14]:
# Specify the tokenization to be 'word' level and set overlap_size to be 3.
C1 = ob.block_tables(A, B, 'Name', 'Name', word_level=True, overlap_size=1, rem_stop_words=True,
                    l_output_attrs=all_fields, 
                    r_output_attrs=all_fields,
                    show_progress=False)
print(len(C1))
# Display first 5 tuple pairs in the candidate set.
C1.head()

105490


Unnamed: 0,_id,ltable_ID,rtable_ID,ltable_Name,ltable_Industry,ltable_MarketValue,rtable_Name,rtable_Industry,rtable_MarketValue
0,0,1361,4,Huayi Brothers Media Corporation,-,5661,1st Source Corporation,Major Banks,$1.22B
1,1,1489,4,J M Smith Corporation,Health Care Equipment & Svcs,-,1st Source Corporation,Major Banks,$1.22B
2,2,211,4,Arctic Slope Regional Corporation,Multicompany,-,1st Source Corporation,Major Banks,$1.22B
3,3,772,4,Corporation Bank,Regional Banks,629,1st Source Corporation,Major Banks,$1.22B
4,4,261,4,Avant Credit Corporation,Banking,-,1st Source Corporation,Major Banks,$1.22B


In [15]:
# try out blocker over a single pair
# ob.block_tuples(A.ix[60], B.ix[0], l_overlap_attr='Name', r_overlap_attr='Name',
#                rem_stop_words=True, word_level=True, overlap_size=1)

## comments on Magellan

### bugs

- A typo (missing ",") at `In[6]` in the [overlap blocker Jupyter notebook](https://nbviewer.jupyter.org/github/anhaidgroup/py_entitymatching/blob/rel_0.1.x/notebooks/guides/step_wise_em_guides/Performing%20Blocking%20Using%20Built-In%20Blockers%20%28Overlap%20Blocker%29.ipynb) in the [stepwise guide](http://anhaidgroup.github.io/py_entitymatching/v0.1.x/singlepage.html#stepwise-guides).