# Build the dataset
Build a dataset of (`<product description>`, `<product id>`) pairs.

---

## Define filepaths

In [1]:
# Relative path up to the repository root; resolved against the working directory.
root_dir = '../..'
# Corpus folder, joined onto root_dir wherever files are read or written.
data_dir = 'data/corpus'
# Sub-folder of the corpus holding one folder per source with the product JSON files.
alaska_dir = 'alaska_camera'

---

## Load the dataset

### Load the Alaska camera dataset for entity resolution (ER)

In [2]:
import os
import pandas as pd

In [3]:
filename = 'camera_entity_resolution_gt.xlsx'

In [4]:
# Entity-resolution ground truth: one row per (entity, product spec) pair.
ground_truth_path = os.path.join(root_dir, data_dir, filename)
alaska_camera_df = pd.read_excel(ground_truth_path)
alaska_camera_df.head()

Unnamed: 0,"entity_id,spec_id"
0,"ENTITY#1,buy.net//4236"
1,"ENTITY#1,www.ebay.com//46670"
2,"ENTITY#10,www.pcconnection.com//12363"
3,"ENTITY#10,buy.net//6531"
4,"ENTITY#10,www.shopmania.in//698"


In [5]:
alaska_camera_df.shape

(3865, 1)

In [6]:
# The single raw column packs both identifiers as "<entity_id>,<spec_id>":
# split every value once, then fan the two halves out into their own columns.
id_pairs = alaska_camera_df['entity_id,spec_id'].map(lambda packed: packed.split(','))
alaska_camera_df['entity_id'] = id_pairs.map(lambda parts: parts[0])
alaska_camera_df['spec_id'] = id_pairs.map(lambda parts: parts[1])

In [7]:
# The packed column has been split into entity_id / spec_id, so it is dropped here.
# NOTE(review): not re-runnable as-is — a second run raises KeyError because the
# column is already gone from the rebound frame; reload the sheet first.
alaska_camera_df = alaska_camera_df.drop(columns=['entity_id,spec_id'])
alaska_camera_df.head()

Unnamed: 0,entity_id,spec_id
0,ENTITY#1,buy.net//4236
1,ENTITY#1,www.ebay.com//46670
2,ENTITY#10,www.pcconnection.com//12363
3,ENTITY#10,buy.net//6531
4,ENTITY#10,www.shopmania.in//698


### Load the product description for each `spec_id`

In [8]:
import json

In [9]:
def parse_json(source, product_id):
    """Load the JSON specification file of a single product.

    The file is read from
    ``<root_dir>/<data_dir>/<alaska_dir>/<source>/<product_id>.json``.

    Parameters
    ----------
    source : str
        Name of the source folder (the part of a ``spec_id`` before ``//``).
    product_id : str
        File name without the ``.json`` extension (the part after ``//``).

    Returns
    -------
    The parsed JSON content; the notebook reads its ``'<page title>'`` key.

    Raises
    ------
    FileNotFoundError
        If no file exists for the given source / product id.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    filepath = os.path.join(root_dir, data_dir, alaska_dir, source, product_id + '.json')
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would garble or reject non-ASCII titles on some systems.
    with open(filepath, encoding='utf-8') as fd:
        return json.load(fd)

In [60]:
# NOTE(review): exploration cell executed out of order (In [60]). Its saved output
# already shows the page_title column, which is only added in a later cell, so a
# fresh top-to-bottom run will display this row without that column.
alaska_camera_df.loc[alaska_camera_df['spec_id'] == 'www.ebay.com//24855']

Unnamed: 0,entity_id,spec_id,page_title
3830,ENTITY#96,www.ebay.com//24855,canon eos 7d digital slr camera 4 lens 18 55 s...


In [58]:
# NOTE(review): scratch path from an out-of-order exploration cell (In [58]).
# It points at 'alaska_camera_annotated', not at alaska_dir ('alaska_camera'),
# which is the folder parse_json reads from — confirm which one is intended.
filepath_prova = os.path.join(root_dir, data_dir, 'alaska_camera_annotated', 'www.ebay.com', '24855.json')
filepath_prova

'../../data/corpus/alaska_camera_annotated/www.ebay.com/24855.json'

In [59]:
# NOTE(review): shell escape with the path typed out by hand; it repeats the value
# of filepath_prova from the cell above and only works where `cat` is available.
!cat '../../data/corpus/alaska_camera_annotated/www.ebay.com/24855.json'

{
    "<page title>": "Canon EOS 7D Digital SLR Camera 4 Lens 18 55 STM Sigma 70 300 32GB Top Kit | eBay",
    "__extracted_model": "canon eos 7d",
    "brand": "Canon",
    "color": "Black",
    "condition": "New: A brand-new, unused, unopened, undamaged item in its original packaging (where packaging is\napplicable). Packaging should be the same as what is found in a retail store, unless the item is handmade or was packaged by the manufacturer in non-retail packaging, such as an unprinted box or plastic bag. See the seller's listing for full details.\nSee all condition definitions- opens in a new window or tab\n... Read moreabout the condition",
    "ean": "4960999654621",
    "fld filter": "helps balance light in outdoor situations or\nwhere fluorescent lights are used",
    "manufacturer warranty": "No",
    "polarizing filter": "eliminates reflections from\nnon-metallic surfaces, and increases contrast and color saturation",
    "type": "Digital SLR",
    "uv filter": "

In [10]:
# A spec_id has the form "<source>//<product id>"; the two halves are exactly the
# arguments parse_json expects. Read each product file and keep only its title.
raw_titles = alaska_camera_df['spec_id'].map(
    lambda spec: parse_json(*spec.split('//'))['<page title>']
)
# Titles are stored lower-cased.
alaska_camera_df['page_title'] = raw_titles.map(lambda title: title.lower())

In [11]:
alaska_camera_df.head()

Unnamed: 0,entity_id,spec_id,page_title
0,ENTITY#1,buy.net//4236,polaroid is426 16 megapixel compact camera - r...
1,ENTITY#1,www.ebay.com//46670,camera polaroid is426 4x zoom | ebay
2,ENTITY#10,www.pcconnection.com//12363,buy sony a7 interchangeable lens camera with l...
3,ENTITY#10,buy.net//6531,sony alpha a7 black digital camera kit w/ 28-7...
4,ENTITY#10,www.shopmania.in//698,sony alpha 7 kit 28-70mm digital camera prices...


---

## Select clusters by their size

In [12]:
grouped_alaska_df = alaska_camera_df.groupby('entity_id')

In [13]:
# One group per distinct entity_id, i.e. one group per cluster.
print(f'There are {grouped_alaska_df.ngroups} clusters')

There are 103 clusters


In [14]:
NUM_CLUSTERS = 20

In [15]:
# Rank clusters by number of member rows (largest first) and keep the
# NUM_CLUSTERS biggest ones.
# NOTE(review): equal-sized clusters have no explicit tie-break here, so their
# relative order — and which one survives a tie at the cut-off — presumably
# depends on the sort implementation; verify if exact reproducibility matters.
top_n_clusters = grouped_alaska_df.size().sort_values(ascending=False).head(NUM_CLUSTERS)
top_n_clusters

entity_id
ENTITY#44     184
ENTITY#23     178
ENTITY#18     168
ENTITY#36     155
ENTITY#41     144
ENTITY#21     137
ENTITY#75     130
ENTITY#96     125
ENTITY#6      117
ENTITY#101    112
ENTITY#102     95
ENTITY#16      91
ENTITY#57      80
ENTITY#76      79
ENTITY#19      79
ENTITY#58      78
ENTITY#37      57
ENTITY#84      55
ENTITY#8       54
ENTITY#7       53
dtype: int64

In [None]:
# NOTE(review): unexecuted scratch cell (In [None]) holding a bare title string;
# it assigns nothing, so no other cell depends on it.
'canon eos 7d digital slr camera 4 lens 18 55 stm sigma 70 300 32gb top kit | ebay'

In [57]:
# NOTE(review): exploration cell executed out of order (In [57]). It lists every
# page title of one cluster in full, which makes for a very long output.
grouped_alaska_df.get_group('ENTITY#23').page_title.tolist()

['canon eos 7d 18 0 mp digital slr camera with ef s 18 135mm f 3 5 5 6 lens 013803117493 | ebay',
 'canon eos 7d - price comparison & reviews - digital cameras - australia',
 'canon eos 7d 18 0 mp digital slr camera black kit w is 28 135mm lens | ebay',
 'canon eos 7d  new zealand prices - priceme',
 'bundle canon eos 7d 18 0 mp digital slr camera black kit w is 28 135mm lens | ebay',
 'canon 7d 18 0 mp digital slr camera body only excellent condition no marks 013803117493 | ebay',
 'canon eos 7d 18mp dslr camera on sale for $734.39',
 'canon eos 7d digital slr camera with 2 canon lenses 18 55 is 75 300 32gb kit | ebay',
 'canon eos 7d digital slr camera ef 28 135 f 3 5 5 6 is usm kit with original box | ebay',
 'canon eos 7d 18 0 mp digital slr camera black body only 013803117493 | ebay',
 'canon eos 7d (body only) price in india, bangalore, hyderabad, delhi, chennai, mumbai, pune, kolkatta',
 'canon eos 7d + 17-55/2.8 is  new zealand prices - priceme',
 'canon eos 7d 18 0 mp digital 

In [16]:
f'The selected clusters contain {top_n_clusters.values.sum()} product descriptions'

'The selected clusters contain 2171 product descriptions'

In [17]:
top_n_clusters_ids = top_n_clusters.index.tolist()

In [18]:
# Collect the rows of every selected cluster, in the ranking order of
# top_n_clusters_ids (largest cluster first).
df_list = [grouped_alaska_df.get_group(cluster_id) for cluster_id in top_n_clusters_ids]

# Stack the clusters into one frame renumbered from 0, then drop spec_id so that
# only entity_id and page_title remain.
reduced_alaska_df = pd.concat(df_list, ignore_index=True).drop(columns=['spec_id'])

In [19]:
reduced_alaska_df.head()

Unnamed: 0,entity_id,page_title
0,ENTITY#44,nikon d3200 dslr camera with 18 55mm and 55 20...
1,ENTITY#44,nikon d3200 + 18-55/3.5-5.6 + 55-200/4.0-5.6 ...
2,ENTITY#44,nikon d3200 digital dslr camera w 18 55mm 55 2...
3,ENTITY#44,nikon d3200 digital dslr camera 24 1 w 18 55mm...
4,ENTITY#44,"nikon d3200 (body only) price in india, bangal..."


In [20]:
reduced_alaska_df.shape

(2171, 2)

## Convert the data to the `TrainingCorpus` format

In [21]:
reduced_alaska_dict = {}

### Define the `docs` field

In [22]:
reduced_alaska_dict['docs'] = reduced_alaska_df.index.tolist()

### Define the `texts` field

In [23]:
reduced_alaska_dict['texts'] = reduced_alaska_df['page_title'].tolist()

### Define the `tokens` field

In [24]:
src_dir = 'src'

In [25]:
import sys

In [26]:
sys.path.append(os.path.join(root_dir, src_dir))

In [27]:
from training import TrainingCorpus

In [28]:
# One token list per document, in the same row order as the 'texts' field.
reduced_alaska_dict['tokens'] = [
    TrainingCorpus.tokenize(title.lower())
    for title in reduced_alaska_df['page_title']
]

### Define the `labels` field

In [29]:
def _entity_number(entity_id):
    """Return the integer after '#' in an id such as 'ENTITY#44'."""
    return int(entity_id.split('#')[1])


# Distinct cluster ids ordered numerically, so that ENTITY#6 sorts before
# ENTITY#16 (plain string ordering would not do that).
entity_labels = reduced_alaska_df['entity_id'].unique().tolist()
entity_labels.sort(key=_entity_number)
reduced_alaska_dict['labels'] = entity_labels

### Define the `target` field

In [30]:
# Map every document (row index) to a one-element list holding its cluster id.
reduced_alaska_dict['target'] = {
    doc_id: [entity_id]
    for doc_id, entity_id in reduced_alaska_df['entity_id'].items()
}

---

## Save to JSON file

In [31]:
dataset_filename = 'alaska_corpus.json'
dataset_filepath = os.path.join(root_dir, data_dir, dataset_filename)

In [32]:
# Serialise before touching the file: opening with 'w' truncates an existing
# corpus immediately, so a failure during serialisation would otherwise leave an
# empty or half-written file behind.
serialized_corpus = json.dumps(reduced_alaska_dict)
# Explicit encoding so the written bytes do not depend on the platform locale.
with open(dataset_filepath, 'w', encoding='utf-8') as fd:
    fd.write(serialized_corpus)

---

## Open the dataset as an instance of the `TrainingCorpus` class

In [33]:
alaska_corpus = TrainingCorpus()
alaska_corpus.load(dataset_filepath)

In [34]:
alaska_corpus.get_text(0)

'nikon d3200 dslr camera with 18 55mm and 55 200mm lenses black | ebay'

In [35]:
alaska_corpus.get_tokens(0)

['nikon',
 'd3200',
 'dslr',
 'camera',
 '18',
 '55mm',
 '55',
 '200mm',
 'lenses',
 'black',
 'ebay']

---

## Compute noun chunks

In [36]:
chunks_filename = 'alaska_chunks.json'
chunks_filepath = os.path.join(root_dir, data_dir, chunks_filename)

In [37]:
chunks_filepath

'../../data/corpus/alaska_chunks.json'

In [38]:
alaska_corpus.detect_chunks()

100%|██████████| 2171/2171 [00:25<00:00, 83.93it/s]


In [39]:
alaska_corpus.save_chunks(chunks_filepath)

---

## Load chunks

In [40]:
alaska_corpus.load_chunks(chunks_filepath)

In [42]:
list(alaska_corpus.noun_chunks.items())[:3]

[('nikon_d3200_dslr_camera', 5), ('18_55mm', 55), ('55_200mm_lenses', 2)]

In [43]:
alaska_corpus.get_chunk_document(0, threshold=0)

['nikon_d3200_dslr_camera', '18_55mm', '55_200mm_lenses', 'black_ebay']

In [44]:
alaska_corpus.get_text(0)

'nikon d3200 dslr camera with 18 55mm and 55 200mm lenses black | ebay'

In [45]:
alaska_corpus.get_tokens(0)

['nikon',
 'd3200',
 'dslr',
 'camera',
 '18',
 '55mm',
 '55',
 '200mm',
 'lenses',
 'black',
 'ebay']

---