In this notebook, we will perform Exploratory Data Analysis.

In [1]:
import json
import os

import pandas as pd


ROOT_PATH = os.path.dirname(os.path.abspath(os.getcwd()))

In [2]:
def inspect_df(df: pd.DataFrame, n : int=5) -> pd.DataFrame:
    """Helper method to easily inspect DataFrames."""
    
    print(f'shape: {df.shape}')

    return df.head(n)

In [3]:
def json_loader(dirpath: str) -> list:
    """Discover all .json files and gather their respective data, given a `dirpath`.
    """
    data = []
    for subdir in os.listdir(dirpath):
        
        temp = os.path.join(dirpath, subdir)

        for datafile in os.listdir(temp):

            with open(os.path.join(temp, datafile), 'r') as f:

                spec = json.loads(f.read())
            
            # keep global identifier, format it as in the labelled dataset
            spec['id'] = subdir + '//' + datafile.split('.json')[0]
            
            data.append(spec)

    return data

In [4]:
data = json_loader(dirpath=os.path.join(ROOT_PATH, 'data/2013_camera_specs'))

In [5]:
specs = pd.DataFrame(data)

In [6]:
inspect_df(specs)

shape: (29787, 4662)


Unnamed: 0,<page title>,camera type,depth,digital zoom,effective megapixels,flash,focal length,height,image sensor,image stabilization,...,motion image progressive pal area,focus mode manual focus,my camera,sound files,af system points,ultrasonic motor usm,remote controller switch,image erase,image erase protection,gps log
0,Canon PowerShot SX200 IS 12.1 Megapixel Compac...,Compact Camera,1.5 in,4x,12.1 Megapixel,Auto Flash|Flash OFF|Flash ON|Red-eye Reduction,5 mm to 60 mm,2.4 in,CCD,Optical,...,,,,,,,,,,
1,Sony DSC-RX100/B Black Digital Camera (20.2 MP...,Point & Shoot Digital Camera,1.44 in,14 X,20200000 pixels,Built-in Flash|Accessory Shoe,,2.38 in,CMOS,Optical,...,,,,,,,,,,
2,Canon EOS-1D X Black SLR Digital Camera - Body...,Digital SLR Camera,3.3 in,,18.1 Megapixel,Auto Flash|Flash ON|Flash OFF|X-sync|Rear Curt...,,6.4 in,CMOS,No,...,,,,,,,,,,
3,Olympus Pen E-P5 White Digital Camera (16.1 MP...,Point & Shoot Digital Camera,1.46 in,,16100000 pixels,Built-in Flash|Accessory Shoe,,2.71 in,CMOS,Sensor Shift,...,,,,,,,,,,
4,Canon PowerShot G1 X Mark II Black Digital Cam...,Point & Shoot Digital Camera,2.61 in,4 X,12800000 pixels,Accessory Shoe|Built-in Flash,12.5 mm to 62.5 mm,2.91 in,CMOS,Optical,...,,,,,,,,,,


In [7]:
specs.set_index('id', inplace=True)

In [8]:
spec_matchings = pd.read_csv(os.path.join(ROOT_PATH, 'data/sigmod_medium_labelled_dataset.csv'))

In [9]:
inspect_df(spec_matchings)

shape: (46665, 3)


Unnamed: 0,left_spec_id,right_spec_id,label
0,www.garricks.com.au//31,www.ebay.com//53278,1
1,www.ebay.com//58782,www.ebay.com//24817,0
2,www.ebay.com//58782,www.ebay.com//43019,0
3,www.ebay.com//42055,www.ebay.com//54403,0
4,www.ebay.com//44280,buy.net//6145,0


In [10]:
specs_info = specs.describe()
specs_info = specs_info.transpose()

inspect_df(specs_info)

shape: (4661, 4)


Unnamed: 0,count,unique,top,freq
<page title>,29787,26748,Canon EOS Rebel T3i 600D 18 0 MP Digital SLR C...,26
camera type,2080,92,Point-and-Shoot,252
depth,1176,353,20 mm,28
digital zoom,3216,286,4x,1004
effective megapixels,344,91,16 Megapixel,40


In [11]:
specs_info['support'] = specs_info['count'] / len(specs.index)
specs_info = specs_info.sort_values(by='support', ascending=False)

In [12]:
specs_info.head(10)

Unnamed: 0,count,unique,top,freq,support
<page title>,29787,26748,Canon EOS Rebel T3i 600D 18 0 MP Digital SLR C...,26,1.0
brand,15739,348,Canon,3858,0.528385
model,14748,3993,7D,142,0.495115
megapixels,13827,477,16.0 MP,868,0.464196
type,13660,437,Point & Shoot,6638,0.458589
screen size,12350,365,"3""",5027,0.41461
optical zoom,11656,578,3x,2857,0.391312
mpn,10536,3452,5169B003,93,0.353711
condition,9951,13,Used: An item that has been used previously. T...,5730,0.334072
upc,7800,3723,013803117493,68,0.261859


In [13]:
top10 = list(specs_info.head(10).index)

These are the 10 camera specs (attributes) with the highest support.

In [14]:
specs[top10]

Unnamed: 0_level_0,<page title>,brand,model,megapixels,type,screen size,optical zoom,mpn,condition,upc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
buy.net//6015,Canon PowerShot SX200 IS 12.1 Megapixel Compac...,,,,,,12 X,,,
buy.net//6383,Sony DSC-RX100/B Black Digital Camera (20.2 MP...,,,,,,3.6 X,,,
buy.net//6698,Canon EOS-1D X Black SLR Digital Camera - Body...,,,,,,,,,
buy.net//6123,Olympus Pen E-P5 White Digital Camera (16.1 MP...,,,,,,,,,
buy.net//6212,Canon PowerShot G1 X Mark II Black Digital Cam...,,,,,,5 X,,,
...,...,...,...,...,...,...,...,...,...,...
www.canon-europe.com//78,Canon PowerShot D30 - PowerShot and IXUS digit...,,,,TTL,,,,,
www.canon-europe.com//63,Canon PowerShot A550 - PowerShot and IXUS digi...,,,,,,,,,
www.canon-europe.com//156,Canon Digital IXUS 900 Ti - PowerShot and IXUS...,,,,,,,,,
www.canon-europe.com//133,Canon IXUS 105 - PowerShot and IXUS digital co...,,,,TTL,,,,,


In [15]:
def create_dataset(data: pd.DataFrame, labels: pd.DataFrame, features: list):
    """Helper method that creates a dataset.
    """
    dataset = pd.merge(labels, data[features], how='inner', left_on='left_spec_id', right_on='id')
    
    dataset = pd.merge(dataset, data[features], how='inner', left_on='right_spec_id', right_on='id')
    
    return dataset

In [16]:
X = create_dataset(data=specs, labels=spec_matchings, features=top10[0])

In [17]:
X

Unnamed: 0,left_spec_id,right_spec_id,label,<page title>_x,<page title>_y
0,www.garricks.com.au//31,www.ebay.com//53278,1,Nikon D3200 Black w/ 18-55mm VR Lens,Nikon D3200 24 2 MP Digital SLR Camera Black K...
1,www.ebay.com//58782,www.ebay.com//53278,0,Nikon D80 10 2 MP Digital SLR Camera Black Kit...,Nikon D3200 24 2 MP Digital SLR Camera Black K...
2,www.ebay.com//42055,www.ebay.com//53278,0,Nikon D80 DSLR Camera Body Acessories 2 Batter...,Nikon D3200 24 2 MP Digital SLR Camera Black K...
3,www.ebay.com//42074,www.ebay.com//53278,0,Canon EOS 20D 8 2 MP Digital DSLR Camera Body ...,Nikon D3200 24 2 MP Digital SLR Camera Black K...
4,www.ebay.com//42039,www.ebay.com//53278,1,Nikon D3200 Digital SLR Camera w AF s DX VR 55...,Nikon D3200 24 2 MP Digital SLR Camera Black K...
...,...,...,...,...,...
46660,www.shopmania.in//1458,www.shopmania.in//932,0,"Sony Alpha 7R body digital camera prices, Shop...","Nikon COOLPIX L30 digital camera prices, Shopp..."
46661,www.mypriceindia.com//52,www.shopmania.in//932,0,"Sony ILCE 7R (Body Only) Price In India, Banga...","Nikon COOLPIX L30 digital camera prices, Shopp..."
46662,www.shopmania.in//1458,www.mypriceindia.com//142,0,"Sony Alpha 7R body digital camera prices, Shop...","Nikon Coolpix L30 Price In India, Bangalore, H..."
46663,www.mypriceindia.com//52,www.mypriceindia.com//142,0,"Sony ILCE 7R (Body Only) Price In India, Banga...","Nikon Coolpix L30 Price In India, Bangalore, H..."
