# Build the dataset
Build a dataset of (`<product description>`, `<product id>`) pairs.

---

## Define filepaths

In [1]:
# Base directory; every other path in this notebook is joined onto it.
root_dir = '../..'
# Directory (relative to root_dir) that holds the ground-truth file and
# receives the generated JSON dataset.
data_dir = 'data/corpus'
# Sub-directory of data_dir containing one folder per source, each with
# "<product id>.json" files (see parse_json).
alaska_dir = 'alaska_camera'

---

## Load the dataset

### Load the Alaska camera dataset for entity resolution (ER)

In [2]:
import os
import pandas as pd

In [3]:
filename = 'camera_entity_resolution_gt.xlsx'

In [4]:
# Load the ground-truth workbook into a DataFrame and preview its first rows.
alaska_camera_df = pd.read_excel(os.path.join(root_dir, data_dir, filename))
alaska_camera_df.head()

Unnamed: 0,"entity_id,spec_id"
0,"ENTITY#1,buy.net//4236"
1,"ENTITY#1,www.ebay.com//46670"
2,"ENTITY#10,www.pcconnection.com//12363"
3,"ENTITY#10,buy.net//6531"
4,"ENTITY#10,www.shopmania.in//698"


In [5]:
alaska_camera_df.shape

(3865, 1)

In [6]:
# Each raw value has the form "<entity_id>,<spec_id>". Split it once, on the
# first comma only, and let pandas expand the two parts straight into the new
# columns. Limiting the split to one comma keeps a spec_id intact even if it
# contains a comma itself.
alaska_camera_df[['entity_id', 'spec_id']] = (
    alaska_camera_df['entity_id,spec_id'].str.split(',', n=1, expand=True)
)

In [7]:
alaska_camera_df = alaska_camera_df.drop(columns=['entity_id,spec_id'])
alaska_camera_df.head()

Unnamed: 0,entity_id,spec_id
0,ENTITY#1,buy.net//4236
1,ENTITY#1,www.ebay.com//46670
2,ENTITY#10,www.pcconnection.com//12363
3,ENTITY#10,buy.net//6531
4,ENTITY#10,www.shopmania.in//698


### Load the product description for each `spec_id`

In [8]:
import json

In [9]:
def parse_json(source, product_id):
    """Load the JSON file that describes a single product.

    The file is read from
    ``<root_dir>/<data_dir>/<alaska_dir>/<source>/<product_id>.json``.

    Args:
        source: Name of the folder (inside ``alaska_dir``) holding the file.
        product_id: File name of the product without the ``.json`` suffix.

    Returns:
        The deserialised content of the JSON file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    filepath = os.path.join(
        root_dir, data_dir, alaska_dir, source, product_id + '.json'
    )
    # Decode explicitly as UTF-8 (the standard JSON encoding) rather than
    # relying on the platform's locale encoding, which differs across systems.
    with open(filepath, encoding='utf-8') as fd:
        return json.load(fd)

In [10]:
# A spec_id has the form "<source>//<product id>"; splitting it on '//' yields
# the two positional arguments of parse_json. Only the '<page title>' entry of
# the parsed file is kept.
alaska_camera_df['page_title'] = alaska_camera_df['spec_id'].map(lambda x: parse_json(*x.split('//'))['<page title>'])

In [11]:
alaska_camera_df.head()

Unnamed: 0,entity_id,spec_id,page_title
0,ENTITY#1,buy.net//4236,Polaroid Is426 16 Megapixel Compact Camera - R...
1,ENTITY#1,www.ebay.com//46670,Camera Polaroid IS426 4X Zoom | eBay
2,ENTITY#10,www.pcconnection.com//12363,Buy Sony a7 Interchangeable Lens Camera with L...
3,ENTITY#10,buy.net//6531,Sony Alpha A7 Black Digital Camera Kit W/ 28-7...
4,ENTITY#10,www.shopmania.in//698,Sony Alpha 7 kit 28-70mm digital camera prices...


---

## Preprocess the `page_title` column

### Convert all words to lowercase, remove stopwords and punctuation

In [12]:
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [13]:
stopwords_eng = list(stopwords.words('english'))

In [14]:
# Every ASCII punctuation character, followed by a few extra quote and dash
# tokens that are not single ASCII punctuation characters.
punctuations = [*string.punctuation, "''", '--', '“', '”', '``']

In [15]:
def preprocess_text(text):
    """Return *text* lowercased, with stopwords and punctuation tokens removed.

    The text is lowercased, tokenized with ``word_tokenize``, filtered against
    ``stopwords_eng`` and ``punctuations``, and the surviving tokens are
    joined back together with single spaces.
    """
    # Collect everything to discard in a set so each token is checked with a
    # hash lookup.
    excluded = set(stopwords_eng).union(punctuations)
    tokens = word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in excluded)

In [16]:
# Clean every page title in place. Series.map calls the given function once
# per value, so preprocess_text is passed directly without a lambda wrapper.
alaska_camera_df['page_title'] = alaska_camera_df['page_title'].map(preprocess_text)
alaska_camera_df.head()

Unnamed: 0,entity_id,spec_id,page_title
0,ENTITY#1,buy.net//4236,polaroid is426 16 megapixel compact camera red...
1,ENTITY#1,www.ebay.com//46670,camera polaroid is426 4x zoom ebay
2,ENTITY#10,www.pcconnection.com//12363,buy sony a7 interchangeable lens camera lens c...
3,ENTITY#10,buy.net//6531,sony alpha a7 black digital camera kit w/ 28-7...
4,ENTITY#10,www.shopmania.in//698,sony alpha 7 kit 28-70mm digital camera prices...


---

## Select clusters by their size

In [17]:
grouped_alaska_df = alaska_camera_df.groupby('entity_id')

In [18]:
print(f'There are {grouped_alaska_df.size().shape[0]} clusters')

There are 103 clusters


In [19]:
NUM_CLUSTERS = 10

In [20]:
top_n_clusters = grouped_alaska_df.size().sort_values(ascending=False).head(NUM_CLUSTERS)
top_n_clusters

entity_id
ENTITY#44     184
ENTITY#23     178
ENTITY#18     168
ENTITY#36     155
ENTITY#41     144
ENTITY#21     137
ENTITY#75     130
ENTITY#96     125
ENTITY#6      117
ENTITY#101    112
dtype: int64

In [21]:
f'The selected clusters contain {top_n_clusters.values.sum()} product descriptions'

'The selected clusters contain 1450 product descriptions'

In [22]:
top_n_clusters_ids = top_n_clusters.index.tolist()

In [23]:
# Collect the rows of every selected cluster, in the order of
# top_n_clusters_ids (largest cluster first).
df_list = [grouped_alaska_df.get_group(cluster_id) for cluster_id in top_n_clusters_ids]

# Stack the groups under a fresh 0..n-1 index and discard the spec_id column.
reduced_alaska_df = pd.concat(df_list, ignore_index=True).drop(columns=['spec_id'])

In [24]:
reduced_alaska_df.head()

Unnamed: 0,entity_id,page_title
0,ENTITY#44,nikon d3200 dslr camera 18 55mm 55 200mm lense...
1,ENTITY#44,nikon d3200 18-55/3.5-5.6 55-200/4.0-5.6 new z...
2,ENTITY#44,nikon d3200 digital dslr camera w 18 55mm 55 2...
3,ENTITY#44,nikon d3200 digital dslr camera 24 1 w 18 55mm...
4,ENTITY#44,nikon d3200 body price india bangalore hyderab...


In [25]:
reduced_alaska_df.shape

(1450, 2)

## Convert the data to the `TrainingCorpus` format

In [30]:
reduced_alaska_dict = {}

### Define the `docs` field

In [31]:
reduced_alaska_dict['docs'] = reduced_alaska_df.index.tolist()

### Define the `texts` field

In [32]:
reduced_alaska_dict['texts'] = reduced_alaska_df['page_title'].tolist()

### Define the `tokens` field

In [33]:
reduced_alaska_dict['tokens'] = reduced_alaska_df['page_title'].map(lambda x: x.split()).tolist()

### Define the `labels` field

In [34]:
# Distinct entity ids, ordered by the integer that follows '#', so that e.g.
# ENTITY#6 comes before ENTITY#18 (a plain string sort would put it after).
reduced_alaska_dict['labels'] = sorted(reduced_alaska_df['entity_id'].unique().tolist(), 
                                       key=lambda x: int(x.split('#')[1]))

### Define the `target` field

In [35]:
# Map each row index to a one-element list holding that row's entity id.
# NOTE(review): the keys are the DataFrame's integer index, but json.dump
# writes object keys as strings, so in the saved file they become "0", "1", ...
# while the 'docs' list keeps integers — confirm that TrainingCorpus.load
# handles this difference.
reduced_alaska_dict['target'] = reduced_alaska_df['entity_id'].map(lambda x: [x]).to_dict()

---

## Save to JSON file

In [36]:
dataset_filename = 'reduced_alaska.json'
dataset_filepath = os.path.join(root_dir, data_dir, dataset_filename)

In [37]:
dataset_filepath

'../../data/corpus/reduced_alaska.json'

In [38]:
with open(dataset_filepath, 'w') as fd:
    json.dump(reduced_alaska_dict, fd)

---

## Open the dataset as an instance of the `TrainingCorpus` class

In [39]:
src_dir = 'src'

In [40]:
import sys

In [41]:
sys.path.append(os.path.join(root_dir, src_dir))

In [42]:
from dataset.training import TrainingCorpus

In [43]:
alaska_corpus = TrainingCorpus()
alaska_corpus.load(dataset_filepath)