In [1]:
import os
import random
import pickle
import json
import jsonlines
from datetime import datetime
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from huggingface_hub import HfApi, snapshot_download
from PIL import Image
from tqdm.notebook import tqdm
import ast

tqdm.pandas()

In [2]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)


seed = 42
seed_everything(42)

In [3]:
def dump_pickle(data, path):
    with open(path, "wb") as file:
        pickle.dump(data, file)


def load_pickle(path):
    with open(path, "rb") as file:
        data = pickle.load(file)
    return data


def dump_json(data, path):
    with open(path, "w") as file:
        json.dump(data, file)


def load_json(path):
    with open(path, "r") as file:
        data = json.load(file)
    return data


def save_pt(data, path):
    with open(path, "wb") as file:
        torch.save(data, file)

In [4]:
def mk_dir(file_path):
    if not os.path.exists(file_path):
        os.makedirs(file_path)


def get_timestamp(date_format: str = "%d%H%M%S") -> str:
    timestamp = datetime.now()
    return timestamp.strftime(date_format)


# data_dir = "./data/amazon/fashion/raw"
# mk_dir(data_dir)

### Fashion


#### load and preprocess

In [5]:
meta = []

with jsonlines.open('./data/amazon/fashion/meta_Amazon_Fashion.jsonl') as f:
    for line in f.iter():
    	meta.append(line)

In [6]:
inter = []

with jsonlines.open('./data/amazon/fashion/Amazon_Fashion.jsonl') as f:
    for line in f.iter():
    	inter.append(line)

In [7]:
meta_df = pd.DataFrame(meta)
meta_df.shape

(826108, 14)

In [8]:
inter_df = pd.DataFrame(inter)
inter_df.shape

(2500939, 10)

In [9]:
meta_df = meta_df[["title", "description", "images", "details", "parent_asin"]]
meta_df = meta_df.dropna(axis=0, how='any')
meta_df.shape

(826108, 5)

In [10]:
inter_df = inter_df[["user_id", "asin", "parent_asin", "timestamp"]]
inter_df = inter_df.dropna(axis=0, how='any')
inter_df.shape

(2500939, 4)

In [11]:
no_img_cnt = meta_df["images"].apply(lambda x: (len(x)==0 or x=={}))
no_desc_cnt = meta_df["description"].apply(lambda x: (len(x)==0))
no_title_cnt = meta_df["title"].apply(lambda x: (len(x.split())<2))

In [12]:
sum(no_img_cnt)

1

In [13]:
sum(no_desc_cnt)

766819

In [14]:
sum(no_title_cnt)

656

In [15]:
no_img_title_index = no_img_cnt[no_img_cnt].index.append(no_title_cnt[no_title_cnt].index)
ava_meta_data = meta_df[~meta_df.index.isin(no_img_title_index)]

In [18]:
import requests 

no_img = {}
yes_img = {}

from concurrent.futures import ThreadPoolExecutor
import requests

def check_image(url, asin):
    try:
        response = requests.head(url, allow_redirects=True)
        return asin, url, 'image' in response.headers.get('Content-Type', '')
    except requests.RequestException:
        return asin, url, False

urls = ava_meta_data["images"].apply(lambda x: x[0]["large"])
asins = ava_meta_data["parent_asin"]

url_asin_pairs = list(zip(urls, asins))


with ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(check_image, urls, asins), total=len(url_asin_pairs), desc="Processing Images"))

for asin, url, is_image in tqdm(results):
    if is_image:
        yes_img[asin] = url
    else:
        no_img[asin] = url
        print(asin)

Processing Images:   0%|          | 0/825451 [00:00<?, ?it/s]

  0%|          | 0/825451 [00:00<?, ?it/s]

B095Y3Y2SJ
B08SJ77F9C
B0893XF6NM
B01MRU8MF2
B08THKFDR2
B08H247Y2D
B07SK7JB6Q
B08FX7Z5GQ
B000NVAWEU
B08L75WM9C
B08M3WB11Y
B07NY94G1P
B07H82LYQS
B071X6G1TX
B078ZC29GG
B07BMBY3JT
B000O9T4JU
B00G5Y6JAU
B091LHLXFG
B08LTF3Q6G
B0192BWYZQ
B083Z36R8H
B07JD5LYBN
B00MA94F8W
B07D1QHJS7
B000OZFZ38
B07RWHHFR2
B01MXRODJL
B09BB79N5M
B07DBWBS31
B00GLSHBZM
B07PR9KSZM
B014B3IQ7K
B011EV0G1G
B096FXV7WM
B07CWBJ66L
B0833CHWR4
B087F4NV9T
B01JKWH376
B0796TS7YG
B000L3PF1U
B07TKLMWSS
B01KZ3U7VI
B0736HBW3R
B07SJZQJM7
B074SPBC5R
B07GC7Y7H8
B07ZGJ2LK9
B08BZ6PRJX
B08CKRHNPY
B074ZLD96K
B0887ZMMYS
B082D1GQ4B
B0001NE7LY
B00AMPSSKC
B019CEPL58
B01M2C8HX3
B071RK8MTH
B08C9RVGBQ
B08837J569
B07YTXVXX4
B089M5P11D
B07PB1W738
B0924NGWX5
B06XD2L8F3
B076HSZ6GB
B08JCH4RFC
B01MCXR2NF
B01I8VMMKS
B093PW5TFP
B08K4CP3FJ
B09M2SL182
B08F9NR5Y8
B08V8Q17BQ
B072R3W8HF
B012HL40NM
B075V2P5ND
B08XJXNF9C
B019QOZE60
B083RR31W7
B009QH3MWQ
B07JLZCB4H
B0989J8WBL
B07G91PS3B
B08V1FLYG2
B09BJ438ZD
B07G24J37C
B093ST3XNS
B00O1FV2IO
B07X3LRVQX
B01EL8UXT4

In [26]:
import requests 

no_img_2 = {}
yes_img_2 = {}

from concurrent.futures import ThreadPoolExecutor
import requests

def check_image(url, asin):
    try:
        response = requests.head(url, allow_redirects=True)
        return asin, url, 'image' in response.headers.get('Content-Type', '')
    except requests.RequestException:
        return asin, url, False

urls = no_img.values()
asins = no_img.keys()

url_asin_pairs = list(zip(urls, asins))


with ThreadPoolExecutor() as executor:
    results = list(tqdm(executor.map(check_image, urls, asins), total=len(url_asin_pairs), desc="Processing Images"))

for asin, url, is_image in tqdm(results):
    if is_image:
        yes_img_2[asin] = url
    else:
        no_img_2[asin] = url
        print(asin)

Processing Images:   0%|          | 0/260 [00:00<?, ?it/s]

  0%|          | 0/260 [00:00<?, ?it/s]

B08THKFDR2
B08H247Y2D
B000NVAWEU
B08M3WB11Y
B000O9T4JU
B000OZFZ38
B07RWHHFR2
B000L3PF1U
B07TKLMWSS
B0001NE7LY
B07PB1W738
B08V8Q17BQ
B07G24J37C
B000PYL9FQ
B08KSXCNGH
B07QB3FYKY
B000M9QGX4
B07QC2VGXV
B08VRK6NBS
B000MSKDX4
B000HSTWY0
B000MRFKKQ
B07W4TXMNS
B083DDXFQ4
B001AMPQZG
B07R13MT94
B08R9XS4LC
B07R39R9QM
B000MN4J3O
B07DLWZMH5
B06Y69NBQC


In [30]:
for k,v in yes_img_2.items():
    yes_img[k] = v

In [44]:
for k,v in no_img_2.items():
    yes_img[k] = ""

In [45]:
len(yes_img)

825451

In [46]:
ava_meta_data["img_url"] = ava_meta_data["parent_asin"].map(yes_img)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ava_meta_data["img_url"] = ava_meta_data["parent_asin"].map(yes_img)


In [21]:
no_img_asin = ["B08THKFDR2",
'B08H247Y2D',
'B000NVAWEU',
'B08M3WB11Y',
'B000O9T4JU',
'B000OZFZ38',
'B07RWHHFR2',
'B000L3PF1U',
'B07TKLMWSS',
'B0001NE7LY',
'B07PB1W738',
'B08V8Q17BQ',
'B07G24J37C',
'B000PYL9FQ',
'B08KSXCNGH',
'B07QB3FYKY',
'B000M9QGX4',
'B07QC2VGXV',
'B08VRK6NBS',
'B000MSKDX4',
'B000HSTWY0',
'B000MRFKKQ',
'B07W4TXMNS',
'B083DDXFQ4',
'B001AMPQZG',
'B07R13MT94',
'B08R9XS4LC',
'B07R39R9QM',
'B000MN4J3O',
'B07DLWZMH5',
'B06Y69NBQC']

In [25]:
ava_meta_data[ava_meta_data["parent_asin"].isin(no_img_asin)].iloc[0]["images"]

[{'thumb': 'https://m.media-amazon.com/images/I/31DHbGX05ML._AC_SR38,50_.jpg',
  'large': 'https://m.media-amazon.com/images/I/31DHbGX05ML._AC_.jpg',
  'variant': 'MAIN',
  'hi_res': None},
 {'thumb': 'https://m.media-amazon.com/images/I/31DHbGX05ML._AC_SR38,50_.jpg',
  'large': 'https://m.media-amazon.com/images/I/31DHbGX05ML._AC_.jpg',
  'variant': 'PT01',
  'hi_res': None},
 {'thumb': 'https://m.media-amazon.com/images/I/31DHbGX05ML._AC_SR38,50_.jpg',
  'large': 'https://m.media-amazon.com/images/I/31DHbGX05ML._AC_.jpg',
  'variant': 'PT02',
  'hi_res': None},
 {'thumb': 'https://m.media-amazon.com/images/I/31DHbGX05ML._AC_SR38,50_.jpg',
  'large': 'https://m.media-amazon.com/images/I/31DHbGX05ML._AC_.jpg',
  'variant': 'PT03',
  'hi_res': None}]

In [71]:
# ava_meta_data[ava_meta_data["img_url"] ==""].index
# no_img_idx = [3692,   5950,  13770,  15879,  43316,  73528,  73630,  96735, 101958,
#        164755, 205143, 256472, 302518, 343136, 369423, 370063, 406611, 439261,
#        449912, 489179, 608511, 609422, 633925, 638064, 685531, 705865, 709117,
#        754580, 779109, 789151, 791818]

Index([  3692,   5950,  13770,  15879,  43316,  73528,  73630,  96735, 101958,
       164755, 205143, 256472, 302518, 343136, 369423, 370063, 406611, 439261,
       449912, 489179, 608511, 609422, 633925, 638064, 685531, 705865, 709117,
       754580, 779109, 789151, 791818],
      dtype='int64')

In [28]:
ava_meta_data = ava_meta_data[~ava_meta_data["parent_asin"].isin(no_img_asin)]

In [34]:
ava_meta_data["img_url"] = ava_meta_data["images"].apply(lambda x: x[0]["large"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ava_meta_data["img_url"] = ava_meta_data["images"].apply(lambda x: x[0]["large"])


In [35]:
ava_data = inter_df[inter_df["parent_asin"].isin(ava_meta_data["parent_asin"])]

In [36]:
ava_meta_data.shape

(825420, 6)

In [37]:
ava_data.shape

(2498717, 4)

In [38]:
ava_data["user_id"].nunique()

2033860

In [39]:
ava_data.to_csv("./data/amazon/fashion/ava_inter.csv", index = False)
ava_meta_data.to_csv("./data/amazon/fashion/ava_meta.csv", index = False)

#### k-core_data

In [5]:
n_core = 3
data_dir = f"./data/amazon/fashion/{n_core}_core"
mk_dir(data_dir)


In [6]:
inter_data = pd.read_csv(f"./data/amazon/fashion/ava_inter.csv")
meta_data = pd.read_csv(f"./data/amazon/fashion/ava_meta.csv")

In [7]:
meta_data.columns

Index(['title', 'description', 'images', 'details', 'parent_asin', 'img_url'], dtype='object')

In [8]:
meta_data.shape

(825420, 6)

In [9]:
# 아이템/유저 등장 빈도에 따른 데이터 구성
# 유저: 상호작용이 threshold 이하인 경우 삭제, 아이템 : 상호작용이 threshold 이하인 경우 삭제
# 상호작용 유저/아이템 수는 반복 구매를 제외하고 unique한 값을 기준으로 함
# 반복적으로 실행하여, 모든 유저, 아이템이 조건을 만족하도록 함


def data_cutter(origin_data, threshold=5):
    print("### before ###")
    print("shape of n_interaction_data : ", origin_data.shape)
    while True:
        new_data = (
            origin_data.groupby("user_id")
            .filter(lambda x: x.asin.nunique() >= threshold)
            .reset_index(drop=True)
        )
        new_data = (
            new_data.groupby("asin")
            .filter(lambda x: x.user_id.nunique() >= 1)
            .reset_index(drop=True)
        )

        if new_data.equals(origin_data):
            print("finish")
            break
        origin_data = new_data
        print("cut again")

    print("### after ###")
    print("shape of n_interaction_data : ", new_data.shape)
    print("num of user : ", new_data.user_id.nunique())
    print("num of item : ", new_data.asin.nunique())
    print(
        "data density : ",
        new_data.shape[0]
        / (new_data.user_id.nunique() * new_data.asin.nunique())
        * 100,
        "%",
    )

    return new_data

##### 빈도체크

In [63]:
res = inter_data.groupby(["user_id"]).count()

In [64]:
grades = ["[0~5]", "[6~20]", "[21~]"]
cut_bins = [0, 5, 20, 320]

res["range_cnt"] = pd.cut(res["timestamp"], bins = cut_bins, labels = grades)

In [65]:
res = res.groupby("range_cnt").count()

  res = res.groupby("range_cnt").count()


In [69]:
temp = res.groupby(["timestamp"]).count()

9002

In [53]:
# pd.options.display.max_columns = 20
pd.set_option("display.max_rows", None)

####

In [10]:
core_inter_data = data_cutter(inter_data, n_core) # inter: 6, item: 1

### before ###
shape of n_interaction_data :  (2498717, 4)
cut again
finish
### after ###
shape of n_interaction_data :  (313788, 4)
num of user :  76958
num of item :  210342
data density :  0.0019384587308478131 %


In [11]:
core_inter_data.to_csv(f"{data_dir}/{n_core}_core_inter_data.csv", index=False)

In [12]:
core_meta_data = meta_data[
    meta_data["parent_asin"].isin(core_inter_data["parent_asin"])
].reset_index(drop=True)

In [13]:
core_meta_data.to_csv(f"{data_dir}/{n_core}_core_meta_data.csv", index=False)

In [14]:
print("shape of n_item_data : ", core_meta_data.shape)
print("shape of new_interaction_data : ", core_inter_data.shape)

shape of n_item_data :  (202704, 6)
shape of new_interaction_data :  (313788, 4)


In [None]:
metadata = {
    "shape of interaction data": core_inter_data.shape,
    "shape of meta_data": core_meta_data.shape,
    "num of user": core_inter_data.user_id.nunique(),
    "num of item": core_inter_data.asin.nunique(),
    "data density": f"{core_inter_data.shape[0]/(core_inter_data.user_id.nunique()*core_inter_data.asin.nunique())*100}%",
}

dump_json(metadata, f"{data_dir}/{n_core}_core_metadata.json")

#### 이미지, 텍스트 임베딩

In [54]:
new_interaction_data = pd.read_csv(f"{data_dir}/{n_core}_core_inter_data.csv")
n_item_data = pd.read_csv(f"{data_dir}/{n_core}_core_meta_data.csv")

print("shape of n_item_data : ", n_item_data.shape)
print("shape of new_interaction_data : ", new_interaction_data.shape)

shape of n_item_data :  (60197, 6)
shape of new_interaction_data :  (78423, 4)


In [55]:
n_item_data

Unnamed: 0,title,description,images,details,parent_asin,img_url
0,RONNOX Women's 3-Pairs Bright Colored Calf Com...,['Ronnox Calf Sleeves - Allowing Your Body to ...,[{'thumb': 'https://m.media-amazon.com/images/...,"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B07SB2892S,https://m.media-amazon.com/images/I/51CqMDJOOD...
1,Stainless Steel Bracelet Grooved Cuff Bangle f...,[],[{'thumb': 'https://m.media-amazon.com/images/...,{'Package Dimensions': '4.9 x 3.5 x 0.7 inches...,B01AUYK33Y,https://m.media-amazon.com/images/I/31uLUKRBHu...
2,ELEOSL Women Ladies Sleeveless Fit and Flare R...,[],[{'thumb': 'https://m.media-amazon.com/images/...,"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B077JWX1MX,https://m.media-amazon.com/images/I/41ZfDDnWr+...
3,akimoom Scrunchie Watch Band for Galaxy 46mm/G...,[],[{'thumb': 'https://m.media-amazon.com/images/...,{'Package Dimensions': '0.1 x 0.1 x 0.1 inches...,B08TWC3WKW,https://m.media-amazon.com/images/I/519k4XNOgw...
4,Koup Women's Cinnamon Anti-Odor Tech Lightweig...,[],[{'thumb': 'https://m.media-amazon.com/images/...,{'Package Dimensions': '8.27 x 6.3 x 1.97 inch...,B097BT6TKW,https://m.media-amazon.com/images/I/41TjExDA-R...
...,...,...,...,...,...,...
60192,Women's 1950s Vintage Floral Lace Half Sleeve ...,[],[{'thumb': 'https://m.media-amazon.com/images/...,"{'Is Discontinued By Manufacturer': 'No', 'Pro...",B0834LPMJH,https://m.media-amazon.com/images/I/419Njb86XB...
60193,VamJump Woman Summer Sexy Button V Neck Sleeve...,[],[{'thumb': 'https://m.media-amazon.com/images/...,{'Package Dimensions': '12.6 x 9.8 x 1.2 inche...,B01FGZ2YHU,https://m.media-amazon.com/images/I/41IxCWBwJD...
60194,BIG WASP Kids Girls Sandals Open Toe Princess ...,[],[{'thumb': 'https://m.media-amazon.com/images/...,{'Package Dimensions': '9.13 x 9.09 x 1.54 inc...,B094QXNTJW,https://m.media-amazon.com/images/I/51EE4loNtd...
60195,Jude Jewelers Stainless Steel Chain Simulated ...,[],[{'thumb': 'https://m.media-amazon.com/images/...,{'Package Dimensions': '1.97 x 1.26 x 0.35 inc...,B097NFK7JT,https://m.media-amazon.com/images/I/31FLU6uuCL...


In [9]:
from fashion_clip.fashion_clip import FashionCLIP

fclip = FashionCLIP("fashion-clip")

images = n_item_data["img_url"].to_list()
image_embeddings = fclip.encode_images(images, batch_size=1000)

61it [4:40:59, 276.39s/it]                          


In [10]:
# {article_id : emb}
id_img_emb_map = {
    k: torch.tensor(v)
    for k, v in zip(n_item_data["parent_asin"].tolist(), image_embeddings)
}
torch.save(id_img_emb_map, f"{data_dir}/aritcle2img_emb.pt")

In [11]:
texts = n_item_data["title"].tolist()
text_embeddings = fclip.encode_text(texts, batch_size=64)

Map:   0%|          | 0/60197 [00:00<?, ? examples/s]

941it [00:20, 45.79it/s]                         


In [12]:
# {article_id : emb}
id_text_emb_map = {
    k: torch.tensor(v)
    for k, v in zip(n_item_data["parent_asin"].tolist(), text_embeddings)
}
torch.save(id_text_emb_map, f"{data_dir}/aritcle2text_emb.pt")

In [27]:
user2idx = {
    v: k for k, v in enumerate(new_interaction_data["user_id"].unique())
}  # {user_id:idx}
item2idx = {
    v: k for k, v in enumerate(new_interaction_data["asin"].unique())
}  # {item_id:idx}
item2pitem = {
    k:v for k, v in zip(new_interaction_data["asin"], new_interaction_data["parent_asin"])
}  # {item_id: parents_item_id}
idx2pitem = {
    item2idx[k]:v for k, v in zip(new_interaction_data["asin"], new_interaction_data["parent_asin"])
} # {idx: parents_item_id}


print("# of user", len(user2idx))
print("# of item", len(item2idx))

torch.save(item2idx, f"{data_dir}/item2idx.pt")
torch.save(item2pitem, f"{data_dir}/item2pitem.pt")

# of user 8365
# of item 61663


In [57]:
pitem2item = {v:k for k,v in item2pitem.items()}

In [28]:
new_interaction_data.head()

Unnamed: 0,user_id,asin,parent_asin,timestamp
0,AFSKPY37N3C43SOI5IEXEK5JSIYA,B0BQHV9ZFM,B0BQJ61TXB,1674857583365
1,AFSKPY37N3C43SOI5IEXEK5JSIYA,B0856TH4LK,B0856TH4LK,1587205458621
2,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07ZWZ2595,B07ZWZ2595,1579433019201
3,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07NY72H7W,B07NY72H7W,1565639039659
4,AFSKPY37N3C43SOI5IEXEK5JSIYA,B07NX5RHZ2,B07NX5RHZ2,1561338044117


In [None]:
n_item_data.loc[:, "title"] = n_item_data["title"].apply(
    lambda x: " ".join(str(x).split())
)
n_item_data.loc[:, "meta"] = n_item_data.apply(
    lambda x: f'Name {x["title"]}',
    axis=1,
)

In [None]:
asin_img_emb_map = {row["asin"]:id_img_emb_map[row["parent_asin"]] for _, row in tqdm(new_interaction_data.iterrows())}
asin_text_emb_map = {row["asin"]:id_text_emb_map[row["parent_asin"]] for _, row in tqdm(new_interaction_data.iterrows())}

In [86]:
pasin_img_url = {row["parent_asin"]:row["img_url"] for _, row in tqdm(n_item_data.iterrows())}
pasin_meta = {row["parent_asin"]:row["meta"] for _, row in tqdm(n_item_data.iterrows())}

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [87]:
asin_img_url = {row["asin"]:pasin_img_url[row["parent_asin"]] for _, row in tqdm(new_interaction_data.iterrows())}
asin_meta = {row["asin"]:pasin_meta[row["parent_asin"]] for _, row in tqdm(new_interaction_data.iterrows())}

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [39]:
idx_img_emb_map = {item2idx[k]:v for k,v in tqdm(asin_img_emb_map.items())}
idx_text_emb_map = {item2idx[k]:v for k,v in tqdm(asin_text_emb_map.items())}

  0%|          | 0/61663 [00:00<?, ?it/s]

  0%|          | 0/61663 [00:00<?, ?it/s]

In [88]:
idx_img_url_map = {item2idx[k]:v for k,v in tqdm(asin_img_url.items())}
idx_meta_map= {item2idx[k]:v for k,v in tqdm(asin_meta.items())}

  0%|          | 0/61663 [00:00<?, ?it/s]

  0%|          | 0/61663 [00:00<?, ?it/s]

In [41]:
torch.save(idx_img_emb_map, f"{data_dir}/idx_img_emb_map.pt")
torch.save(idx_text_emb_map, f"{data_dir}/idx_text_emb_map.pt")

In [90]:
torch.save(asin_img_url, f"{data_dir}/asin_img_url.pt")
torch.save(idx_img_url_map, f"{data_dir}/idx_img_url_map.pt")

In [91]:
torch.save(asin_meta, f"{data_dir}/asin_meta.pt")
torch.save(idx_meta_map, f"{data_dir}/idx_meta_map.pt")

In [None]:

torch.save(asin_img_emb_map, f"{data_dir}/asin_img_emb_map.pt")
torch.save(asin_text_emb_map, f"{data_dir}/asin_text_emb_map.pt")

In [32]:
sorted_iteraction_data = new_interaction_data.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

In [33]:
sorted_iteraction_data

Unnamed: 0,user_id,asin,parent_asin,timestamp
0,AE22M44KAE72RFU6DWM5Y3YN6SEA,B01MQKVQL4,B01MQKVQL4,1490229255000
1,AE22M44KAE72RFU6DWM5Y3YN6SEA,B01J3HUG8Q,B01J3HUG8Q,1490549894000
2,AE22M44KAE72RFU6DWM5Y3YN6SEA,B01M8NNVF9,B01M8NNVF9,1492630212000
3,AE22M44KAE72RFU6DWM5Y3YN6SEA,B01N3L0DTS,B01N3L0DTS,1492630325000
4,AE22M44KAE72RFU6DWM5Y3YN6SEA,B01M2Y86RF,B01M2Y86RF,1492630634000
...,...,...,...,...
78418,AHZZIENTC5NQS32NSKEI7FWPDLWQ,B071G7KWH8,B071G7KWH8,1525397475117
78419,AHZZIENTC5NQS32NSKEI7FWPDLWQ,B077PPKKM8,B077PPKKM8,1546880320127
78420,AHZZIENTC5NQS32NSKEI7FWPDLWQ,B0725XQDF9,B0725XQDF9,1547081000680
78421,AHZZIENTC5NQS32NSKEI7FWPDLWQ,B076S8Q5GZ,B076S8Q5GZ,1547262313529


In [34]:
sorted_iteraction_data["user_id"] = sorted_iteraction_data["user_id"].map(user2idx)
sorted_iteraction_data["asin"] = sorted_iteraction_data["asin"].map(item2idx)

In [35]:
sorted_iteraction_data.head()

Unnamed: 0,user_id,asin,parent_asin,timestamp
0,7703,58008,B01MQKVQL4,1490229255000
1,7703,58007,B01J3HUG8Q,1490549894000
2,7703,58006,B01M8NNVF9,1492630212000
3,7703,58005,B01N3L0DTS,1492630325000
4,7703,5821,B01M2Y86RF,1492630634000


In [37]:
sorted_iteraction_data = sorted_iteraction_data[["user_id", "asin"]]

In [38]:
cnt = sorted_iteraction_data.groupby(["user_id"]).count()

In [40]:
min(cnt["asin"])

6

In [41]:
unique_data = sorted_iteraction_data.drop_duplicates(
    ["asin", "user_id"], keep="last"
)

In [44]:
min(unique_data.groupby(["user_id"]).count()["asin"])

6

In [45]:
unique_data.shape

(77859, 2)

In [46]:
metadata = {
    "shape of interaction data": unique_data.shape,
    "num of user": unique_data.user_id.nunique(),
    "num of item": unique_data.asin.nunique(),
    "data density": f"{unique_data.shape[0]/(unique_data.user_id.nunique()*unique_data.asin.nunique())*100}%",
}

dump_json(metadata, f"{data_dir}/{n_core}_core_uniqued_metadata.json")

In [47]:
tqdm.pandas()
test_data = dict(unique_data.groupby("user_id")["asin"].progress_apply(list))

  0%|          | 0/8365 [00:00<?, ?it/s]

In [48]:
test_data = [v for v in test_data.values()]

In [49]:
test_data

[[7, 6, 5, 4, 3, 2, 1, 0],
 [43,
  42,
  41,
  40,
  39,
  38,
  37,
  36,
  35,
  34,
  33,
  32,
  31,
  30,
  29,
  28,
  27,
  26,
  25,
  24,
  23,
  22,
  21,
  20,
  19,
  18,
  17,
  16,
  15,
  14,
  13,
  12,
  11,
  10,
  9,
  8],
 [49, 48, 47, 46, 45, 44],
 [62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50],
 [78, 77, 76, 75, 74, 73, 72, 71, 70, 69, 68, 67, 66, 65, 64, 63],
 [90, 89, 88, 87, 86, 85, 84, 83, 82, 81, 80, 79],
 [168,
  167,
  166,
  165,
  164,
  163,
  162,
  161,
  160,
  159,
  158,
  157,
  156,
  155,
  154,
  153,
  152,
  151,
  150,
  149,
  148,
  147,
  146,
  145,
  144,
  143,
  142,
  141,
  140,
  139,
  138,
  137,
  136,
  135,
  134,
  133,
  132,
  131,
  130,
  129,
  128,
  127,
  126,
  125,
  124,
  123,
  122,
  121,
  120,
  119,
  118,
  117,
  116,
  115,
  114,
  113,
  112,
  111,
  110,
  109,
  108,
  107,
  106,
  105,
  104,
  103,
  102,
  101,
  100,
  99,
  98,
  97,
  96,
  95,
  94,
  93,
  92,
  91],
 [228,
  227,
  226,

In [51]:
torch.save(test_data, f"{data_dir}/uniqued_test_data.pt")

In [92]:
# Upload to Huggingface Hub
api = HfApi()
api.upload_folder(
    folder_path=data_dir,
    repo_id=f"SLKpnu/amazon_fashion_{n_core}_core",
    commit_message=f"dataset created timestamp : {get_timestamp()}",
    repo_type="dataset",
)

asin_meta.pt:   0%|          | 0.00/7.78M [00:00<?, ?B/s]

asin_img_url.pt:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

idx_meta_map.pt:   0%|          | 0.00/6.73M [00:00<?, ?B/s]

idx_img_url_map.pt:   0%|          | 0.00/3.80M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SLKpnu/amazon_fashion_6_core/commit/16af8814b7091e09aea8c62589700db8e9f585b9', commit_message='dataset created timestamp : 06095738', commit_description='', oid='16af8814b7091e09aea8c62589700db8e9f585b9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SLKpnu/amazon_fashion_6_core', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SLKpnu/amazon_fashion_6_core'), pr_revision=None, pr_num=None)

#### Clothings

In [6]:
meta = []

with jsonlines.open('./data/amazon/clothings/meta_Clothing_Shoes_and_Jewelry.jsonl') as f:
    for line in f.iter():
    	meta.append(line)

In [5]:
inter = []

with jsonlines.open('./data/amazon/clothings/Clothing_Shoes_and_Jewelry.jsonl') as f:
    for line in f.iter():
    	inter.append(line)

In [6]:
# meta_df = pd.DataFrame(meta)
meta_df = pd.read_csv("./data/amazon/clothings/meta.csv")
meta_df.shape
# meta_df.to_csv("./data/amazon/clothings/meta.csv", index=False)

(7218481, 5)

In [7]:
# inter_df = pd.DataFrame(inter)
inter_df = pd.read_csv("./data/amazon/clothings/inter.csv")
inter_df.shape
# inter_df.shape
# inter_df.to_csv("./data/amazon/clothings/inter.csv", index=False)

(66033346, 4)

In [None]:
meta_df = meta_df[["title", "description", "images", "details", "parent_asin"]]
meta_df.to_csv("./data/amazon/clothings/meta.csv", index=False)

inter_df = inter_df[["user_id", "asin", "parent_asin", "timestamp"]]
inter_df.to_csv("./data/amazon/clothings/inter.csv", index=False)

In [8]:
meta_df.columns

Index(['title', 'description', 'images', 'details', 'parent_asin'], dtype='object')

In [9]:
meta_df = meta_df[["title", "description", "images", "details", "parent_asin"]]
meta_df = meta_df.dropna(axis=0, how='any')

In [10]:
meta_df.shape


(7217878, 5)

In [11]:
inter_df.columns

Index(['user_id', 'asin', 'parent_asin', 'timestamp'], dtype='object')

In [12]:
inter_df = inter_df[["user_id", "asin", "parent_asin", "timestamp"]]
inter_df = inter_df.dropna(axis=0, how='any')

In [13]:
inter_df.shape

(66033346, 4)

In [14]:
no_img_cnt = meta_df["images"].apply(lambda x: (len(x)<=4))
no_desc_cnt = meta_df["description"].apply(lambda x: (len(x)<=4))
no_title_cnt = meta_df["title"].apply(lambda x: (len(x.split())<2))

In [15]:
sum(no_img_cnt)

10048

In [16]:
sum(no_desc_cnt)

3424793

In [17]:
sum(no_title_cnt)

2888

In [19]:
no_img_title_index = no_img_cnt[no_img_cnt].index.append(no_title_cnt[no_title_cnt].index)

In [20]:
ava_meta_data = meta_df[~meta_df.index.isin(no_img_title_index)]

In [21]:
meta_df.shape

(7217878, 5)

In [22]:
ava_data = inter_df[inter_df["parent_asin"].isin(ava_meta_data["parent_asin"])]

In [23]:
ava_data.shape

(65984322, 4)

In [24]:
ava_data["user_id"].nunique()

22544591

In [25]:
ava_meta_data.to_csv("./data/amazon/clothings/ava_meta.csv", index=False)

In [26]:
ava_data.to_csv("./data/amazon/clothings/ava_inter.csv", index=False)

In [28]:
ava_meta_data

Unnamed: 0,title,description,images,details,parent_asin
0,BALEAF Women's Long Sleeve Zip Beach Coverup U...,[],[{'thumb': 'https://m.media-amazon.com/images/...,"{'Department': 'womens', 'Date First Available...",B09X1MRDN6
1,Merrell Work Moab 2 Vent Waterproof SR Boulder,[],[{'thumb': 'https://m.media-amazon.com/images/...,{'Package Dimensions': '14.02 x 9.29 x 4.8 inc...,B073C4Q7W8
2,"SAS Women's, Relaxed Sandal","['Unwind, leave your worries behind, and simpl...",[{'thumb': 'https://m.media-amazon.com/images/...,{'Product Dimensions': '10 x 15 x 6 inches; 2 ...,B0944VG4Y4
3,SheIn Women's Basic Stretch Plaid Mini Bodycon...,[],[{'thumb': 'https://m.media-amazon.com/images/...,"{'Department': 'womens', 'Date First Available...",B08JGGF5TJ
4,"Michael Kors Cindy, Women’s Cross-Body Bag","['Adjustable crossbody strap with 24""-26"" drop...",[{'thumb': 'https://m.media-amazon.com/images/...,"{'Is Discontinued By Manufacturer': 'No', 'Pro...",B00ZQMM6BI
...,...,...,...,...,...
7218476,CORIRESHA Women's One Shoulder Floral Doll Sle...,[],[{'thumb': 'https://m.media-amazon.com/images/...,{'Package Dimensions': '15.47 x 11.61 x 1.02 i...,B0B1HZ3SJY
7218477,Shawhuwa Womens Long Sleeve Vibrant Print Rash...,['This unique long sleeve swimsuit for women i...,[{'thumb': 'https://m.media-amazon.com/images/...,"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B07C7KBGPB
7218478,PF Flyers Center Lo Sneaker,"['Leather upper, lace-up sneaker with metal gr...",[{'thumb': 'https://m.media-amazon.com/images/...,"{'Brand Name': 'PF Flyers', 'Color': 'Dark Gre...",B005AR7PFQ
7218479,Heybaby Women's Letter Print Backless One Piec...,['We are professional designers to follow the ...,[{'thumb': 'https://m.media-amazon.com/images/...,"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B06XHVXX21


In [27]:
f_meta_data = pd.read_csv("./data/amazon/fashion/ava_meta.csv")

In [32]:
merge_date = pd.concat([f_meta_data, ava_meta_data], axis=0, join="inner").reset_index(drop=True)

In [37]:
drop_data = merge_date.drop_duplicates(subset=["title", "parent_asin"]).reset_index(drop=True)

In [38]:
drop_data.shape

(8030396, 5)

In [36]:
merge_date.shape

(8030396, 5)

#### HF upload

In [65]:
from huggingface_hub import HfApi, snapshot_download

# Upload to Huggingface Hub
api = HfApi()
api.upload_folder(
    folder_path="./data/amazon/fashion",
    repo_id=f"SLKpnu/amazon_fashion_raw",
    commit_message=f"dataset created timestamp : {get_timestamp()}",
    repo_type="dataset",
)

6_core_meta_data.csv:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

aritcle2img_emb.pt:   0%|          | 0.00/141M [00:00<?, ?B/s]

aritcle2text_emb.pt:   0%|          | 0.00/141M [00:00<?, ?B/s]

asin_img_emb_map.pt:   0%|          | 0.00/141M [00:00<?, ?B/s]

Upload 14 LFS files:   0%|          | 0/14 [00:00<?, ?it/s]

asin_img_url.pt:   0%|          | 0.00/4.81M [00:00<?, ?B/s]

asin_text_emb_map.pt:   0%|          | 0.00/141M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [41]:
from huggingface_hub import HfApi, snapshot_download

# Upload to Huggingface Hub
api = HfApi()
api.upload_folder(
    folder_path="./data/amazon/clothings",
    repo_id=f"SLKpnu/amazon_clothing_raw",
    commit_message=f"dataset created timestamp : {get_timestamp()}",
    repo_type="dataset",
)

ava_inter.csv:   0%|          | 0.00/4.29G [00:00<?, ?B/s]

ava_meta.csv:   0%|          | 0.00/12.3G [00:00<?, ?B/s]

inter.csv:   0%|          | 0.00/4.29G [00:00<?, ?B/s]

Clothing_Shoes_and_Jewelry.jsonl:   0%|          | 0.00/27.8G [00:00<?, ?B/s]

meta.csv:   0%|          | 0.00/12.3G [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

meta_Clothing_Shoes_and_Jewelry.jsonl:   0%|          | 0.00/18.0G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SLKpnu/amazon_clothing_raw/commit/228ffb5f93c7d3a25bff9ed7362892e7ddf4afbb', commit_message='dataset created timestamp : 31134813', commit_description='', oid='228ffb5f93c7d3a25bff9ed7362892e7ddf4afbb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SLKpnu/amazon_clothing_raw', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SLKpnu/amazon_clothing_raw'), pr_revision=None, pr_num=None)