## Image feature extraction

In [1]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
from PIL import Image
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image as keras_image
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input


# Turn off pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp
import json
import gzip
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
import dgl.data
# Fix the seeds of torch, the stdlib `random` module and NumPy to 0.
# NOTE(review): TensorFlow is not seeded in this cell -- confirm whether that matters.
torch.manual_seed(0)
import random
random.seed(0)
np.random.seed(0)

In [3]:
# Load pre-trained ResNet50 model without top (classification) layers.
# input_shape=(None, None, 3) leaves the two spatial dimensions unspecified.
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(None, None, 3))

# Build the feature extractor: the ResNet50 backbone followed by global average pooling.
# NOTE(review): despite the `faster_rcnn_model` name, the only layer stacked on the
# backbone in this cell is the pooling layer below.
x = base_model.output
x = tf.keras.layers.GlobalAveragePooling2D()(x)  # Global Average Pooling to reduce spatial dimensions
faster_rcnn_model = Model(inputs=base_model.input, outputs=x)

In [4]:
# # Load pre-trained ResNet50 model without top (classification) layers
# base_model_2 = ResNet50(weights='imagenet', include_top=True, input_shape=(224, 224, 3))
# base_model_2.summary()

In [5]:
import os
# Product ids that already have at least one downloaded image. Each id is a
# sub-directory of the image root; entries that are not directories are skipped
# because calling os.listdir() on them would raise NotADirectoryError.
images_root = '../processed/images_clothing_men/'
ids_done = [
    entry
    for entry in os.listdir(images_root)
    if os.path.isdir(os.path.join(images_root, entry))
    and os.listdir(os.path.join(images_root, entry))
]

In [6]:
base_model.summary()

Model: "resnet50"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, None,  0           []                               
                                 3)]                                                              
                                                                                                  
 conv1_pad (ZeroPadding2D)      (None, None, None,   0           ['input_1[0][0]']                
                                3)                                                                
                                                                                                  
 conv1_conv (Conv2D)            (None, None, None,   9472        ['conv1_pad[0][0]']              
                                64)                                                        

In [7]:
def extract_vectors(img_path, model=faster_rcnn_model, target_size=(224, 224)):
    """Return a flat feature vector for the image stored at ``img_path``.

    Parameters
    ----------
    img_path : str
        Path of the image file to load.
    model : optional
        Object exposing ``predict``. Defaults to ``faster_rcnn_model`` (the
        ResNet50 backbone with global average pooling); the default is bound
        when this cell is executed.
    target_size : tuple of int, optional
        ``(height, width)`` the image is resized to while loading. Defaults to
        ``(224, 224)``.

    Returns
    -------
    numpy.ndarray
        1-D array holding the flattened model output for the image.
    """
    img = keras_image.load_img(img_path, target_size=target_size)
    img_array = keras_image.img_to_array(img)
    # Add a leading batch axis so the single image is passed as a batch of one.
    img_array = np.expand_dims(img_array, axis=0)
    # Apply the ResNet50 input preprocessing imported at the top of the notebook.
    img_array = preprocess_input(img_array)

    # Run the feature extractor; no detection head is involved.
    features = model.predict(img_array, verbose=0)
    return features.flatten()

In [8]:
# Try the extractor on a single sample image and display the resulting vector.
extract_vectors('../1.jpg')

array([0.0000000e+00, 7.6014652e+00, 1.1391789e-01, ..., 6.2565319e-03,
       2.1243437e-01, 4.5812988e-01], dtype=float32)

In [10]:
# Load the pre-filtered product metadata (men's clothing subset, per the file name).
df_full = pd.read_parquet('../processed/final_filtered_meta_Clothing_Shoes_and_Jewelry_2018_men_clothing_only.parquet')

In [10]:
# df_full = df_full[df_full['main_cat'].isin(['Computers', 'All Electronics', 'Camera & Photo',
#        'Home Audio & Theater', 'Cell Phones & Accessories'])]

In [11]:
# df_full['main_cat'].value_counts().to_frame()

In [12]:
# ## keeping only common product types compared to also bought
# unique_id = df_full['asin'].unique()
# temp = []
# for i in df_full.also_buy:
#     temp+=i
# final_list = list((set(temp) & set(unique_id)))
# imgs_dir = os.listdir('./processed/images/')
# imgs_dir  = [i for i in imgs_dir if len(os.listdir(f'./processed/images/{i}'))]
# print(len(imgs_dir),len(final_list),len(set(final_list).intersection(imgs_dir)))
# final_list = tuple(set(final_list).intersection(imgs_dir))
# df_full = df_full[df_full.asin.isin(final_list)]
# df_full = df_full.drop_duplicates('asin')
# df_full.shape,len(final_list)
# ## keeping only common product ids present in also bought

In [13]:
# df_full['niche'] = df_full.category.str[-1]

In [14]:
# df_full.to_parquet('./processed/meta_Electronics_2018_full_processed.parquet')

In [15]:
# df_full = pd.read_parquet('../processed/meta_Electronics_2018_full_processed.parquet')

# for i in ['also_view','also_buy','description','imageURL','imageURLHighRes','category']:
#     df_full[i] = df_full[i].apply(eval)
# ## removing all products without price tag
# df_full = df_full[(df_full.price!='') & (df_full.price.str.startswith('$'))]

In [11]:
from price_parser import Price

In [12]:
def _price_to_float(raw_price):
    """Parse a raw price value such as ``'$12.99'`` into a float.

    Returns ``0.0`` when no numeric amount can be extracted, which matches the
    ``'0'`` placeholder used below for missing prices.
    """
    # str() lets the cell be re-run after the column has already been converted
    # to floats (the parser works on text).
    amount = Price.fromstring(str(raw_price)).amount
    # `amount` is None when the parser finds no number; float(None) would raise.
    return float(amount) if amount is not None else 0.0


df_full['price'] = df_full.price.fillna('0').apply(_price_to_float)

## Exploding `also_buy` to get edges for the graph

In [13]:
def _first_description(desc):
    """Collapse a product's description list to its first entry.

    Returns ``''`` for missing (None/NaN) or empty descriptions. Values that
    are already plain strings are returned unchanged, so re-running this cell
    does not truncate them to their first character.
    """
    if isinstance(desc, str):
        return desc
    # None and float NaN mark a missing description; both have no len().
    if desc is None or isinstance(desc, float) or len(desc) == 0:
        return ''
    return desc[0]


df_full['description'] = df_full.description.apply(_first_description)

In [14]:
# df_full['sub_cat'] = np.where(df_full.category.str[-2].isin(df_full.category.str[-2].value_counts()[(df_full.category.str[-2].value_counts()<5)].index),'others',df_full.category.str[-2])
# df_full['sub_cat'].value_counts()

In [15]:
# df_full_explode = df_full[~df_full.also_buy.isna()]
# df_full_explode = df_full_explode.explode('also_buy')
# final_list = df_full.asin
# # df_full_explode = df_full_explode[(df_full_explode.also_buy.isin(df_full.asin)) | (df_full_explode.also_buy.isna()) ]
# df_full_explode.also_buy = np.where(df_full_explode.also_buy.isin(final_list),df_full_explode.also_buy,np.nan)
# df_full_explode = df_full_explode.drop_duplicates(['asin','also_buy'])
# print("Exploded df shape", df_full_explode.shape)


# edges = df_full_explode[~df_full_explode.also_buy.isna()][['asin', 'also_buy']].drop_duplicates(['asin', 'also_buy'])
# final_list = list(edges.asin) + list(edges.also_buy)
# final_list = sorted(list(set(final_list)))
# len(final_list)

In [16]:
## keeping only products which have downloaded images (ids listed in `ids_done`)
df_full = df_full[df_full['asin'].isin(ids_done)]

In [20]:
# Keep a single row per product id (`asin`); later duplicates are dropped.
df_full = df_full.drop_duplicates('asin')

In [21]:
df_full.asin.nunique(),df_full.shape

(85336, (85336, 21))

In [None]:
def _first_image_path(product_id, images_root='../processed/images_clothing_men/'):
    """Return the path of the lowest-numbered image stored for ``product_id``.

    Image files are expected to be named ``<number>.<ext>``. The on-disk file
    name is reused as-is instead of being rebuilt as ``f'{number}.jpg'``, so
    zero-padded names and extensions other than ``.jpg`` resolve correctly.
    """
    product_dir = os.path.join(images_root, str(product_id))
    first_name = min(os.listdir(product_dir), key=lambda name: int(name.split('.')[0]))
    return os.path.join(product_dir, first_name)


# One feature vector per product, in the same order as `list_ds`.
list_ds = df_full.asin.values
img_feature = []
for asin in tqdm(list_ds):
    img_feature.append(extract_vectors(_first_image_path(asin)))

# Stack the per-product vectors into a 2-D matrix (one row per product).
X = np.array(img_feature)
print(X.shape)

# Index the feature table by product id so it can be joined back onto the metadata.
image_features = pd.DataFrame(X, columns=[f'vec_{i}' for i in range(X.shape[1])], index=list_ds)
image_features.to_parquet('../processed/restnet_men_image_features.parquet')