## Image feature extraction

In [1]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
from PIL import Image
# from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
# from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image as keras_image
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input


# Turn off pandas' chained-assignment check for the whole session
# (pandas ships with this option set to 'warn').
pd.set_option('mode.chained_assignment', None)

In [2]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp
import json
import gzip
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
import dgl.data
import random

# Pin every random number generator used in this notebook
# (Python, NumPy, PyTorch) to seed 0 so reruns are repeatable.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

In [3]:
# Report how many GPUs TensorFlow can see before running inference.
_visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(_visible_gpus))

Num GPUs Available:  1


In [4]:
# base_model_2 = InceptionV3(weights='imagenet', include_top=True, input_shape=(299, 299, 3))
# base_model_2.summary()

In [5]:
# Load the ImageNet-pretrained InceptionV3 backbone without its top
# (classification) layers. input_shape=(None, None, 3) leaves the spatial
# size of the input unconstrained.
base_model = InceptionV3(weights='imagenet', include_top=False, input_shape=(None, None, 3))

# Global average pooling collapses the backbone's spatial dimensions, so the
# model emits one flat feature vector per image.
# NOTE: the name `faster_rcnn_model` is historical -- this is InceptionV3 plus
# pooling, not a Faster R-CNN detector. The name is kept because
# `extract_vectors` uses it as its default model.
x = tf.keras.layers.GlobalAveragePooling2D()(base_model.output)
faster_rcnn_model = Model(inputs=base_model.input, outputs=x)

In [6]:
# Print the layer-by-layer summary of the InceptionV3 backbone built above.
base_model.summary()

Model: "inception_v3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, None,  0           []                               
                                 3)]                                                              
                                                                                                  
 conv2d (Conv2D)                (None, None, None,   864         ['input_1[0][0]']                
                                32)                                                               
                                                                                                  
 batch_normalization (BatchNorm  (None, None, None,   96         ['conv2d[0][0]']                 
 alization)                     32)                                                    

In [7]:
def extract_vectors(img_path, model=faster_rcnn_model, target_size=(224, 224)):
    """Return a flat feature vector for a single image file.

    The image is loaded and resized to ``target_size``, converted to a
    batch of one, scaled with the InceptionV3 ``preprocess_input`` function,
    and passed through ``model.predict``.

    Args:
        img_path: Path of the image file to load.
        model: Object exposing ``predict``. Defaults to ``faster_rcnn_model``,
            which (despite its name) is the InceptionV3 backbone followed by
            global average pooling.
        target_size: ``(height, width)`` the image is resized to before
            inference. The default ``(224, 224)`` is the size this notebook
            previously hard-coded, so existing calls behave the same.
            NOTE(review): Keras documents 299x299 as InceptionV3's default
            input size -- pass ``target_size=(299, 299)`` to match it, but
            features extracted at different sizes are not comparable.

    Returns:
        1-D ``numpy`` array: the flattened output of ``model.predict``.
    """
    with tf.device('/device:GPU:0'):
        img = keras_image.load_img(img_path, target_size=target_size)
        img_array = keras_image.img_to_array(img)
        # The model expects a leading batch axis, so wrap the single image.
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)

        # Run the (InceptionV3 + pooling) feature extractor on the batch of one.
        features = model.predict(img_array, verbose=0)
        features = features.flatten()
    return features

In [8]:
# Smoke-test the extractor on a single sample image.
extract_vectors('../1.jpg')

array([0.        , 0.        , 0.16176921, ..., 0.4692055 , 0.06945412,
       2.4667668 ], dtype=float32)

In [9]:
# Load the product metadata table; later cells use its `asin` column as the product id.
df_full = pd.read_parquet('../processed/final_filtered_meta_Clothing_Shoes_and_Jewelry_2018_men_clothing_only.parquet')

In [10]:
import os

# Product ids that have at least one downloaded image: the name of every
# non-empty sub-directory of the image root (these names are matched against
# the `asin` column later on).
# Plain files sitting in the root are skipped explicitly; calling
# os.listdir() on one of them would raise NotADirectoryError.
ids_done = [
    entry.name
    for entry in os.scandir('../processed/images_clothing_men/')
    if entry.is_dir() and len(os.listdir(entry.path))
]

In [11]:
# Keep only the products that have downloaded images, then collapse
# repeated listings so each `asin` appears exactly once.
df_full = df_full[df_full['asin'].isin(ids_done)].drop_duplicates('asin')

In [12]:
# (rows, columns) remaining after the image filter and de-duplication.
df_full.shape

(85336, 21)

In [13]:
def _first_image_path(product_dir):
    """Return the path of the lowest-numbered image file in ``product_dir``.

    Only files whose stem (the text before the first '.') is a decimal number
    are considered, so stray non-image files no longer crash ``int()``. The
    file's real name is used, so the extension does not have to be ``.jpg``.

    Raises:
        FileNotFoundError: if the directory holds no numbered file.
    """
    numbered = [name for name in os.listdir(product_dir) if name.split('.')[0].isdecimal()]
    if not numbered:
        raise FileNotFoundError(f'no numbered image file found in {product_dir}')
    # Ties on the number (e.g. "0.jpg" vs "0.png") are broken by filename so
    # the choice is deterministic.
    first = min(numbered, key=lambda name: (int(name.split('.')[0]), name))
    return os.path.join(product_dir, first)


# Extract one feature vector per product, using each product's
# lowest-numbered image. Row order follows `list_ds`.
list_ds = df_full.asin.values
img_feature = []
for asin in tqdm(list_ds):
    img_feature.append(extract_vectors(_first_image_path(f'../processed/images_clothing_men/{asin}')))

# Stack the per-product vectors into one 2-D matrix (one row per product).
# The former `X=None; if X: hstack(...)` scaffold is gone: the branch could
# never run, and `if X:` raises on a NumPy array (ambiguous truth value).
X = np.array(img_feature)
print(X.shape)

# Label the columns vec_0..vec_{n-1}, index the rows by ASIN, and persist.
image_features = pd.DataFrame(X, columns=[f'vec_{k}' for k in range(X.shape[1])], index=list_ds)
image_features.to_parquet('../processed/inception_men_image_features.parquet')

100%|██████████| 85336/85336 [8:00:07<00:00,  2.96it/s]       


(85336, 2048)
