# Build the dataset
Build a dataset of (`<product description>`, `<product id>`) pairs.

## Define filepaths

In [1]:
# Path fragments; later cells combine them with os.path.join to locate inputs and outputs.
root_dir = '../..'  # leading component of every path built in this notebook
data_dir = 'data/corpus'  # joined after root_dir; holds the ground-truth files and the saved corpus
alaska_dir = 'alaska_camera'  # joined after data_dir; holds one sub-directory of product JSON files per source

## Load the dataset

### Load the Alaska camera dataset for entity resolution (ER)

In [2]:
import os
import pandas as pd

In [3]:
filename = 'camera_entity_resolution_gt.xlsx'

In [4]:
# Read the entity-resolution ground truth. As the preview below shows, the sheet
# exposes a single combined "entity_id,spec_id" column rather than two columns.
alaska_camera_df = pd.read_excel(os.path.join(root_dir, data_dir, filename))
alaska_camera_df.head()

Unnamed: 0,"entity_id,spec_id"
0,"ENTITY#1,buy.net//4236"
1,"ENTITY#1,www.ebay.com//46670"
2,"ENTITY#10,www.pcconnection.com//12363"
3,"ENTITY#10,buy.net//6531"
4,"ENTITY#10,www.shopmania.in//698"


In [5]:
alaska_camera_df.shape

(3865, 1)

In [6]:
# Split the combined "entity_id,spec_id" strings once, on the first comma only:
# column 0 of the result is the entity id, column 1 the spec id. Limiting the
# split to a single cut keeps a spec_id whole even if it contains a comma itself.
alaska_camera_df[['entity_id', 'spec_id']] = (
    alaska_camera_df['entity_id,spec_id'].str.split(',', n=1, expand=True)
)

In [7]:
# Remove the combined "entity_id,spec_id" column and preview the result.
# NOTE(review): not idempotent; running this cell a second time raises KeyError
# because the column no longer exists.
alaska_camera_df = alaska_camera_df.drop(columns=['entity_id,spec_id'])
alaska_camera_df.head()

Unnamed: 0,entity_id,spec_id
0,ENTITY#1,buy.net//4236
1,ENTITY#1,www.ebay.com//46670
2,ENTITY#10,www.pcconnection.com//12363
3,ENTITY#10,buy.net//6531
4,ENTITY#10,www.shopmania.in//698


### Load the product description for each `spec_id`

In [8]:
import json

In [9]:
def parse_json(source, product_id):
    """Load the JSON file describing one product page.

    The file is read from
    ``<root_dir>/<data_dir>/<alaska_dir>/<source>/<product_id>.json``,
    where the three directory names are notebook-level globals.

    Args:
        source: Name of the sub-directory that holds the file (the part of a
            ``spec_id`` before ``//``).
        product_id: File name without the ``.json`` extension (the part of a
            ``spec_id`` after ``//``).

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If no such file exists.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    filepath = os.path.join(root_dir, data_dir, alaska_dir, source, product_id + '.json')
    # Decode explicitly as UTF-8 instead of the platform default encoding, so
    # non-ASCII page titles are read the same way on every machine.
    with open(filepath, encoding='utf-8') as fd:
        return json.load(fd)

In [10]:
# For every spec_id ("<source>//<product id>"), load the product JSON and keep
# its '<page title>' entry, lower-cased.
alaska_camera_df['page_title'] = alaska_camera_df['spec_id'].map(
    lambda spec: parse_json(*spec.split('//'))['<page title>'].lower()
)

In [11]:
alaska_camera_df.head()

Unnamed: 0,entity_id,spec_id,page_title
0,ENTITY#1,buy.net//4236,polaroid is426 16 megapixel compact camera - r...
1,ENTITY#1,www.ebay.com//46670,camera polaroid is426 4x zoom | ebay
2,ENTITY#10,www.pcconnection.com//12363,buy sony a7 interchangeable lens camera with l...
3,ENTITY#10,buy.net//6531,sony alpha a7 black digital camera kit w/ 28-7...
4,ENTITY#10,www.shopmania.in//698,sony alpha 7 kit 28-70mm digital camera prices...


---

## Select clusters by their size

### Get each cluster's name

In [12]:
gt_filename = 'alaska_camera_gt.json'
gt_filepath = os.path.join(root_dir, data_dir, gt_filename)

In [13]:
import json

In [14]:
# Load the ground-truth mapping; it is indexed by entity_id further down to
# attach a name to each selected cluster. Decode explicitly as UTF-8 rather
# than relying on the platform default encoding.
with open(gt_filepath, 'r', encoding='utf-8') as fd:
    gt_dict = json.load(fd)

### Select top N clusters

In [15]:
grouped_alaska_df = alaska_camera_df.groupby('entity_id')

In [16]:
print(f'There are {grouped_alaska_df.size().shape[0]} clusters')

There are 103 clusters


In [17]:
NUM_CLUSTERS = 20

In [18]:
# Rank clusters by number of rows, keep the NUM_CLUSTERS largest, and turn the
# result into a frame with columns entity_id / size.
top_n_clusters = (
    grouped_alaska_df.size()
    .sort_values(ascending=False)
    .head(NUM_CLUSTERS)
    .reset_index(name='size')
)
# Attach the ground-truth name of every selected cluster.
top_n_clusters["entity_name"] = [gt_dict[entity] for entity in top_n_clusters["entity_id"]]
top_n_clusters

Unnamed: 0,entity_id,size,entity_name
0,ENTITY#44,184,nikon d3200
1,ENTITY#23,178,canon eos 7d
2,ENTITY#18,168,canon eos 60d
3,ENTITY#36,155,nikon d3100
4,ENTITY#41,144,nikon d5200
5,ENTITY#21,137,nikon d5100
6,ENTITY#75,130,nikon d7000
7,ENTITY#96,125,canon eos 70d
8,ENTITY#6,117,nikon d5300
9,ENTITY#101,112,canon eos 5d mark iii


### Build a Dataframe using the selected product descriptions

In [19]:
f'The selected clusters contain {top_n_clusters["size"].sum()} product descriptions'

'The selected clusters contain 2171 product descriptions'

In [20]:
top_n_clusters_ids = top_n_clusters["entity_id"].tolist()

In [21]:
# Collect the rows of each selected cluster, in the same order as
# top_n_clusters_ids, then stack them under a fresh 0..n-1 index.
df_list = [grouped_alaska_df.get_group(cluster_id) for cluster_id in top_n_clusters_ids]

# spec_id is no longer needed once the page titles have been extracted.
reduced_alaska_df = pd.concat(df_list, ignore_index=True).drop(columns=['spec_id'])

In [22]:
reduced_alaska_df.head()

Unnamed: 0,entity_id,page_title
0,ENTITY#44,nikon d3200 dslr camera with 18 55mm and 55 20...
1,ENTITY#44,nikon d3200 + 18-55/3.5-5.6 + 55-200/4.0-5.6 ...
2,ENTITY#44,nikon d3200 digital dslr camera w 18 55mm 55 2...
3,ENTITY#44,nikon d3200 digital dslr camera 24 1 w 18 55mm...
4,ENTITY#44,"nikon d3200 (body only) price in india, bangal..."


In [23]:
reduced_alaska_df.shape

(2171, 2)

## Merge clusters in order to get "noisy" ones

Merging: 
- nikon d3200 (ENTITY#44) with olympus omd em5 (ENTITY#7) and canon eos 5d mark ii (ENTITY#102) (big + small + medium cluster)
- canon eos 7d (ENTITY#23) with nikon 1 j3 (ENTITY#8) (big + small cluster)
- canon eos 60d (ENTITY#18) with nikon d300 (ENTITY#84) (big + small cluster)
- nikon d5200 (ENTITY#41) with nikon d5100 (ENTITY#21) and nikon d7000 (ENTITY#75) (same size)
- nikon d610 (ENTITY#76) with nikon d3300 (ENTITY#19) and nikon 1 j1 (ENTITY#58) (same size)

In [24]:
# Source cluster -> cluster it is merged into.
noisy_mappings = {
    "ENTITY#7": "ENTITY#44",
    "ENTITY#102": "ENTITY#44",
    "ENTITY#8": "ENTITY#23",
    "ENTITY#84": "ENTITY#18",
    "ENTITY#21": "ENTITY#41",
    "ENTITY#75": "ENTITY#41",
    "ENTITY#58": "ENTITY#76",
    "ENTITY#19": "ENTITY#76",
}

# Relabel the rows of merged clusters; ids without a mapping are kept as they are.
reduced_alaska_df["entity_id"] = reduced_alaska_df["entity_id"].map(
    lambda entity: noisy_mappings.get(entity, entity)
)

In [25]:
reduced_alaska_df["entity_id"].unique()

array(['ENTITY#44', 'ENTITY#23', 'ENTITY#18', 'ENTITY#36', 'ENTITY#41',
       'ENTITY#96', 'ENTITY#6', 'ENTITY#101', 'ENTITY#16', 'ENTITY#57',
       'ENTITY#76', 'ENTITY#37'], dtype=object)

In [26]:
len(reduced_alaska_df["entity_id"].unique())

12

## Convert the data to the `TrainingCorpus` format

In [27]:
reduced_alaska_dict = {}

### Define the `docs` field

In [28]:
# Document ids are the row labels of reduced_alaska_df, i.e. the fresh integer
# index produced by pd.concat(..., ignore_index=True).
reduced_alaska_dict['docs'] = reduced_alaska_df.index.tolist()

### Define the `texts` field

In [29]:
reduced_alaska_dict['texts'] = reduced_alaska_df['page_title'].tolist()

### Define the `tokens` field

In [30]:
src_dir = 'src'

In [31]:
import sys

In [32]:
sys.path.append(os.path.join(root_dir, src_dir))

In [33]:
from training import TrainingCorpus

In [34]:
# One TrainingCorpus.tokenize result per page title, in row order.
reduced_alaska_dict['tokens'] = [
    TrainingCorpus.tokenize(title.lower())
    for title in reduced_alaska_df['page_title']
]

### Define the `labels` field

In [35]:
# Distinct entity ids, ordered by the integer that follows the '#'.
reduced_alaska_dict['labels'] = sorted(
    reduced_alaska_df['entity_id'].drop_duplicates(),
    key=lambda entity: int(entity.split('#')[1]),
)

### Define the `target` field

In [36]:
# Map every row label to a one-element list holding that row's entity_id.
# NOTE(review): json.dump writes dictionary keys as strings, so after saving and
# reloading these keys are strings while `docs` holds integers. Confirm that
# TrainingCorpus.load expects this.
reduced_alaska_dict['target'] = reduced_alaska_df['entity_id'].map(lambda x: [x]).to_dict()

---

## Save to JSON file

In [37]:
dataset_filename = 'alaska_corpus_noisy.json'
dataset_filepath = os.path.join(root_dir, data_dir, dataset_filename)

In [38]:
with open(dataset_filepath, 'w') as fd:
    json.dump(reduced_alaska_dict, fd)

---

## Open the dataset as an instance of the `TrainingCorpus` class

In [39]:
alaska_corpus = TrainingCorpus()
alaska_corpus.load(dataset_filepath)

In [40]:
alaska_corpus.get_text(0)

'nikon d3200 dslr camera with 18 55mm and 55 200mm lenses black | ebay'

In [41]:
alaska_corpus.get_tokens(0)

['nikon',
 'd3200',
 'dslr',
 'camera',
 '18',
 '55mm',
 '55',
 '200mm',
 'lenses',
 'black',
 'ebay']

---

## Compute noun chunks

In [42]:
chunks_filename = 'alaska_chunks_noisy.json'
chunks_filepath = os.path.join(root_dir, data_dir, chunks_filename)

In [43]:
chunks_filepath

'../../data/corpus/alaska_chunks_noisy.json'

In [44]:
alaska_corpus.detect_chunks()

100%|██████████| 2171/2171 [00:19<00:00, 109.46it/s]


In [45]:
alaska_corpus.save_chunks(chunks_filepath)

---

## Load chunks

In [46]:
alaska_corpus.load_chunks(chunks_filepath)

In [47]:
list(alaska_corpus.noun_chunks.items())[:3]

[('nikon_d3200_dslr_camera', 5), ('18_55mm', 55), ('55_200mm_lenses', 2)]

In [48]:
alaska_corpus.get_chunk_document(0, threshold=0)

['nikon_d3200_dslr_camera', '18_55mm', '55_200mm_lenses', 'black_ebay']

In [49]:
alaska_corpus.get_text(0)

'nikon d3200 dslr camera with 18 55mm and 55 200mm lenses black | ebay'

In [50]:
alaska_corpus.get_tokens(0)

['nikon',
 'd3200',
 'dslr',
 'camera',
 '18',
 '55mm',
 '55',
 '200mm',
 'lenses',
 'black',
 'ebay']

---