# Make Data Set

##### Hongbo Zhang

The metadata of each record can be found in json file `json/taobao_result.json` and `json/tmall_result.json`.

The brand ids are in `id2cate_brand.json`. It contains: item id, brand name, category id list. It records the brand name and categories of each record with item id in `tmall_result.json` and `taobao_result.json`.

The category ids are in `merge_c`. It contains: category id, and category name

## Getting Familiar with the Metadata

The metadata in the JSON files is UTF-8 encoded and is not displayed properly by default.

Therefore, we first have to decode the JSON files to make all of the content readable,
and then get to know the structure of the metadata.

***Problems***

+ In the 4th line in `tmall_result.json`, there is no `img`

In [1]:
#encoding:utf-8

# setting Chinese encoding

import sys
# Python 2 only: make UTF-8 the interpreter-wide default codec, so that calls
# such as str(record["title"]) on Chinese unicode text do not raise
# UnicodeEncodeError. Python 3 has neither `reload` as a builtin nor
# `sys.setdefaultencoding`, and already uses UTF-8 as its default encoding,
# so the whole hack is skipped there.
if sys.version_info[0] < 3:
    # site.py removes sys.setdefaultencoding at start-up; reload(sys) brings it
    # back. The reload may also rebind the standard streams, so they are saved
    # first and restored right after.
    stdi, stdo, stde = sys.stdin, sys.stdout, sys.stderr
    reload(sys)
    sys.stdin, sys.stdout, sys.stderr = stdi, stdo, stde
    sys.setdefaultencoding('utf-8')

### Display Metadata

In [2]:
import json

# metadata file passed to `readJson` below (parsed line by line, one JSON record per line)
metaPath = "json/tmall_result.json"

def readJson(metaPath):
    '''
    Read `tmall_result.json` or `taobao_result.json`.

    The file is not a single JSON document: every line holds one JSON record,
    normally followed by a comma and a line break.

    metaPath: path of the metadata file
    return: a list in which each element is one decoded json record
    '''
    ret = []
    # `with` closes the file even when a line fails to parse
    with open(metaPath, 'r') as metafile:
        for line in metafile:
            # Remove the line ending and the trailing record separator without
            # assuming that both are present: the last line may lack the comma
            # or the newline, and the file may use CRLF line endings.
            text = line.strip().rstrip(',').rstrip()
            if not text:
                # blank line, nothing to decode
                continue
            ret.append(json.loads(text))
    return ret
    

def printRawJson(metadata, num):
    '''
    Print the first `num` records of the raw metadata to stdout.

    metadata: output of `readJson`; every record must have "_id" and "title",
              while "img" and "attrs" are optional lists of strings
    num: how many records to display
    '''
    for idx, record in enumerate(metadata):
        if idx >= num:
            # everything requested has been printed; skip the rest of the list
            break
        print("record " + str(idx) + ":")
        print("\tid   : " + str(record["_id"]))
        print("\ttitle: " + str(record["title"]))
        print("\timage: ")
        # `in` instead of dict.has_key(): has_key no longer exists in Python 3
        if "img" in record:
            for i in record["img"]:
                print("\t\t" + i)
        print("\tattrs:")
        if "attrs" in record:
            for i in record["attrs"]:
                print("\t\t" + i)
        print("")
        
    

# display the first 10 records of the metadata file to see its structure
printRawJson(readJson(metaPath), 10)

record 0:
	id   : 573743779841
	title: 小米 米家智能石英手表新款男女情侣潮流简约时尚防水学生手表
	image: 
		3040e4c040054a6887ed5d8f80ea2191.jpg
		2bd39903148b483e94d7a52d74f300d4.jpg
		91b3b528cad449ccb2d186910e11ed89.jpg
		b1882928aebf4bfcb76067b8502801d3.jpg
		234479a861d947eea564cbf96d670d0d.jpg
	attrs:
		保修: 全国联保
		成色: 全新
		手表镜面材质: 镀膜玻璃
		是否商场同款: 否
		机芯产地: 中国
		品牌: MIJIA/米家
		型号: 米家智能石英手表
		机芯类型: 石英机芯
		手表种类: 中性
		风格: 时尚
		表带材质: 真皮
		形状: 圆形
		显示方式: 指针式
		上市时间: 2018年夏季
		颜色分类: 灰色 白色 黑色
		防水深度: 3ATM
		表扣款式: 针扣
		表底类型: 普通
		表冠类型: 普通
		表盘厚度: 11mm
		表盘直径: 40mm
		品牌产地: 国内
		流行元素: 大表盘
		表壳材质: 精钢

record 1:
	id   : 564453267459
	title: Lenovo/联想 笔记本电源适配器65W 细圆口
	image: 
		6b1113a4f24f4bd7bf8021bdc3fea144.jpg
		86556cb2ca7c47d78aff1a251c8f4f0e.jpg
		50d5fc0f41434b0b92423baf8a3f520e.jpg
		fca1c42e2d314add9c6f86b0b3ded4f0.jpg
		6c624d1a19f74afd8cd3fb73e537a900.jpg
	attrs:
		证书编号：2016010907834796
		证书状态：有效
		产品名称：电源适配器
		3C规格型号：ADLX65CDGC2A;  输入：100-240VAC,50-60Hz,1.5A; 输出：20Vd...
		产品名称：Lenovo/联想 65W小细圆口
		生产企业: 台达电子工业

In [3]:
# statistics: how many records have no image

### Substitute id with Category or Brand

In [19]:
import copy

idBrandCateFile = "json/id2cate_brand_tmall.json" # one JSON record per line with the keys "_id", "brand" and "category" (list of category ids)
cateFile = "json/merge_c_tmall"  # one "<category name>,<category id>" pair per line (the order in which substituteIdOneRecord unpacks it)

def substituteIdOneRecord(record, idBrandCateList, cateList):
    '''
    Attach the brand name and the category names to one metadata record.

    record: one record of the json file (one element of the output of `readJson`)
    idBrandCateList: raw lines of the id-brand-category file; each line is a
                     JSON object with "_id", "brand" and "category" (a list of
                     category ids), optionally followed by a comma
    cateList: raw lines of the category file, each "<category name>,<category id>"
    return: a deep copy of `record` with two extra keys:
            'brand'    - the brand name, "" when the item id is not listed
            'category' - list of "<category id>|<category name>" strings
    raise: KeyError when a listed category id is missing from `cateList`
    '''
    itemid = record["_id"]
    # category id -> category name
    cate = dict()
    for line in cateList:
        line = line.strip()
        if not line:
            # ignore blank lines instead of failing on the unpacking below
            continue
        # the id is the last field; splitting from the right keeps a name
        # that itself contains a comma in one piece
        name, cid = line.rsplit(",", 1)
        cate[cid] = name
    # look the item id up in the id-brand-category lines
    brand = ""
    categoryList = []
    for line in idBrandCateList:
        # tolerate a missing trailing comma / newline and CRLF line endings
        text = line.strip().rstrip(",").rstrip()
        if not text:
            continue
        tmp = json.loads(text)
        # no early exit: if an id is listed more than once the last entry wins
        if tmp["_id"] == itemid:
            brand = tmp["brand"]
            categoryList = tmp["category"]
    # new record: original record + brand + [ "category id|category name" ]
    ret = copy.deepcopy(record)
    ret['brand'] = brand
    ret['category'] = [i + "|" + cate[i] for i in categoryList]
    return ret

def substituteIdFull(metadata, idBrandCateFile, cateFile): 
    '''
    Attach brand and category names to every record of `metadata`.

    metadata: the output of `readJson`
    idBrandCateFile: path of the id-brand-category file
    cateFile: path of the category file
    return: a list with one substituted record per input record
            (see `substituteIdOneRecord`)
    '''
    # Rewinding the files with seek(0) for every record is too slow, so both
    # files are read into memory in one shot. `with` closes each file even
    # when the substitution below raises.
    with open(idBrandCateFile, 'r') as idBrandCate:
        idBrandCateList = idBrandCate.readlines()
    with open(cateFile, 'r') as cate:
        cateList = cate.readlines()
    # main part
    return [substituteIdOneRecord(record, idBrandCateList, cateList)
            for record in metadata]
    
def printFullJson(metadata, num):
    '''
    Print the first `num` substituted records to stdout.

    metadata: the output of `substituteIdFull`; every record must have "_id",
              "brand" and "title", while "img", "category" and "attrs" are
              optional lists of strings
    num: how many records to display
    '''
    for idx, record in enumerate(metadata):
        if idx >= num:
            # everything requested has been printed; skip the rest of the list
            break
        print("record " + str(idx) + ":")
        print("\tid   : " + str(record["_id"]))
        print("\tbrand: " + record["brand"])
        print("\ttitle: " + str(record["title"]))
        print("\timage: ")
        # `in` instead of dict.has_key(): has_key no longer exists in Python 3
        if "img" in record:
            for i in record["img"]:
                print("\t\t" + i)
        print("\tcateg: ")
        if "category" in record:
            for i in record["category"]:
                print("\t\t" + i)
        print("\tattrs:")
        if "attrs" in record:
            for i in record["attrs"]:
                print("\t\t" + i)
        print("----------------------------------------------")
        

# build the substituted metadata once (the checks and the training-set cell below reuse `fullmetadata`)
fullmetadata = substituteIdFull(readJson(metaPath), idBrandCateFile, cateFile)
# show the first 10 substituted records
printFullJson(fullmetadata, 10)

record 0:
	id   : 573743779841
	brand: xiaomi
	title: 小米 米家智能石英手表新款男女情侣潮流简约时尚防水学生手表
	image: 
		3040e4c040054a6887ed5d8f80ea2191.jpg
		2bd39903148b483e94d7a52d74f300d4.jpg
		91b3b528cad449ccb2d186910e11ed89.jpg
		b1882928aebf4bfcb76067b8502801d3.jpg
		234479a861d947eea564cbf96d670d0d.jpg
	categ: 
		807818185|箱包鞋服及生活周边:服饰
	attrs:
		保修: 全国联保
		成色: 全新
		手表镜面材质: 镀膜玻璃
		是否商场同款: 否
		机芯产地: 中国
		品牌: MIJIA/米家
		型号: 米家智能石英手表
		机芯类型: 石英机芯
		手表种类: 中性
		风格: 时尚
		表带材质: 真皮
		形状: 圆形
		显示方式: 指针式
		上市时间: 2018年夏季
		颜色分类: 灰色 白色 黑色
		防水深度: 3ATM
		表扣款式: 针扣
		表底类型: 普通
		表冠类型: 普通
		表盘厚度: 11mm
		表盘直径: 40mm
		品牌产地: 国内
		流行元素: 大表盘
		表壳材质: 精钢
----------------------------------------------
record 1:
	id   : 564453267459
	brand: lenovo
	title: Lenovo/联想 笔记本电源适配器65W 细圆口
	image: 
		6b1113a4f24f4bd7bf8021bdc3fea144.jpg
		86556cb2ca7c47d78aff1a251c8f4f0e.jpg
		50d5fc0f41434b0b92423baf8a3f520e.jpg
		fca1c42e2d314add9c6f86b0b3ded4f0.jpg
		6c624d1a19f74afd8cd3fb73e537a900.jpg
	categ: 
		1371242209|智能选件与服务:电脑配件
	attrs:
		证书

## Statistics of Data

### Training Data

Make sure there is no obvious mistake.

+ img is missing:  
    ***17 records in tmall_result.json have no img***
    ```
    the 4th line has no img
    the 7th line has no img
    the 16th line has no img
    the 94th line has no img
    the 247th line has no img
    the 405th line has no img
    the 466th line has no img
    the 473th line has no img
    the 487th line has no img
    the 524th line has no img
    the 614th line has no img
    the 682th line has no img
    the 726th line has no img
    the 742th line has no img
    the 763th line has no img
    the 772th line has no img
    the 775th line has no img
    ```
+ ~~category id has duplication~~
+ ~~attr is missing~~
+ brand is missing:  
    ***4 records in tmall_result.json have no brand; their corresponding `_id` does not exist in id2cate_brand.json***
    ```
    the 250th line has no brand   _id: 563817603032
    the 293th line has no brand   _id: 566468948688
    the 472th line has no brand   _id: 564891647135
    the 640th line has no brand   _id: 557563549275
    ```
+ images are missing: there are 3818 images in tmall_pic.tar. However, after making the training set, there are only 3764 pictures with brand info and 20 pictures without brand info. Therefore, 34 pictures are missing! (These 34 pictures do not appear under "img" in the json file.)
    ```
    echo $[$(ll  ... | wc -l)-1]
    ```
+ So there are 3764 images in the training set in total.

In [27]:
def checkMissingImg(metadata):
    '''
    Print the 1-based position of every record that has no "img" key.

    metadata: output of `readJson`
    '''
    for idx, record in enumerate(metadata):
        # `in` instead of dict.has_key(): has_key no longer exists in Python 3
        if 'img' not in record:
            print("the " + str(idx+1) + "th line has no img")

# list the records of the metadata file that have no "img" entry
checkMissingImg(readJson(metaPath))

the 4th line has no img
the 7th line has no img
the 16th line has no img
the 94th line has no img
the 247th line has no img
the 405th line has no img
the 466th line has no img
the 473th line has no img
the 487th line has no img
the 524th line has no img
the 614th line has no img
the 682th line has no img
the 726th line has no img
the 742th line has no img
the 763th line has no img
the 772th line has no img
the 775th line has no img


In [31]:
def checkMissingBrand(metadata):
    '''
    Print the 1-based position of every record whose brand is the empty string.

    metadata: output of `substituteIdFull` (every record has a 'brand' key)
    '''
    lineno = 0
    for item in metadata:
        lineno += 1
        if item['brand'] != "":
            # this record has a brand, nothing to report
            continue
        print("the %dth line has no brand" % lineno)
    
# `fullmetadata` was already built in the substitution cell; uncomment the next line to rebuild it
# fullmetadata = substituteIdFull(readJson(metaPath), idBrandCateFile, cateFile)
checkMissingBrand(fullmetadata)

the 250th line has no brand
the 293th line has no brand
the 472th line has no brand
the 640th line has no brand


## Make Training Data Set

In [35]:
import os
import shutil

# both paths end with "/" because makeDataSet builds file names by plain string concatenation
picPath = "pic/"      # folder the images are copied from
storePath = "train/"  # root folder of the data set; one sub-folder per brand is created inside

def makeDataSet(metadata, picPath, storePath):
    '''
    Copy every image listed in `metadata` into a folder named after its brand.

    metadata: the output of `substituteIdFull`
    picPath: the path of the images
    storePath: the storage path of the train/test dataset
    Records whose brand is "" go to the "nobrand" folder, which is always
    created. Records without an "img" key contribute no file.
    raise: whatever `shutil.copyfile` raises when a listed image is missing
           from `picPath`
    '''
    def folderName(record):
        # an empty brand must not be used as a folder name: it would address
        # `storePath` itself
        if record['brand'] == "":
            return "nobrand"
        return record['brand']

    # 1 find all brands
    brand = set(folderName(record) for record in metadata)
    brand.add("nobrand")
    # 2 make one folder for each brand.
    #   there is no category sub-folder, since the category is not in high quality and has multiple categories
    for b in brand:
        folder = os.path.join(storePath, b)
        # os.makedirs raises when the folder already exists (e.g. on a re-run)
        if not os.path.isdir(folder):
            os.makedirs(folder)
    # 3 copy images to each folder
    for record in metadata:
        # `in` instead of dict.has_key(): has_key no longer exists in Python 3
        if 'img' not in record:
            continue
        b = folderName(record)
        for i in record['img']:
            # os.path.join also works when the paths have no trailing "/"
            src = os.path.join(picPath, i)
            dst = os.path.join(storePath, b, i)
            shutil.copyfile(src, dst)
        

# start from an empty output folder; shutil.rmtree avoids building a shell
# command from the path, and ignore_errors keeps going when the folder does
# not exist yet (as `rm -r` followed by the next statement did)
shutil.rmtree(storePath, ignore_errors=True)
# fullmetadata = substituteIdFull(readJson(metaPath), idBrandCateFile, cateFile)
makeDataSet(fullmetadata, picPath, storePath)

## Make Testing Data Set

In [None]:
# test meta data
testMeta = readJson(metaPath)

# test full meta data
testFullMeta = substituteIdFull(testMeta, idBrandCateFile, cateFile)

# make testing set
makeDataSet(testFullMeta, "taobaoPic/", "test/")

# checking data