# 이미지 Feature vector를 활용해보기!

- 여러 논문에서 소개된 것처럼 pre-trained CNN으로 image features를 추출해봅니다.
- image features를 비교하여 실제로 비슷한지 판단하고, rating정보와 함께 분석해봅니다.


In [None]:
# Mount Google Drive inside the Colab runtime so the dataset folder under
# /content/drive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root data directory: the review JSON files are read from here, and the
# downloaded images and the feature CSV are written below it.
path = '/content/drive/MyDrive/data/amazon_reviews'

## Load dataset

1. AMAZON_FASHION_5.json
2. All_Beauty_5.json
3. Luxury_Beauty_5.json

In [None]:
import os, json
import pandas as pd

In [None]:
def load_json(filename):
    """Load a JSON-lines review file and keep only the reviews with images.

    Args:
        filename: File name, resolved relative to the module-level ``path``.

    Returns:
        A new DataFrame holding the rows whose ``image`` field is present.
        Row labels are the positions of the reviews among the parsed records.

    Raises:
        KeyError: If no record in the file has an ``image`` field.
    """
    records = []
    with open(os.path.join(path, filename), 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. a trailing newline) instead of letting
            # json.loads fail on an empty string.
            if line:
                records.append(json.loads(line))

    df = pd.DataFrame.from_dict(records)

    # Keep rows that contain images; copy so callers get an independent
    # frame rather than a slice of the full table.
    return df[df['image'].notna()].copy()

In [None]:
# Load the fashion reviews that have images and preview shape and first rows.
fashion_df = load_json('AMAZON_FASHION_5.json')
print(fashion_df.shape)
print(fashion_df.head())

(106, 12)
     overall  verified  ... vote                                              image
164      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
172      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
179      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
192      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
197      5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...

[5 rows x 12 columns]


In [None]:
# Load the beauty reviews that have images and preview shape and first rows.
beauty_df = load_json('All_Beauty_5.json')
print(beauty_df.shape)
print(beauty_df.head())

(98, 12)
    overall  verified  ... vote                                              image
19      5.0      True  ...    5  [https://images-na.ssl-images-amazon.com/image...
20      5.0      True  ...    4  [https://images-na.ssl-images-amazon.com/image...
21      5.0      True  ...    4  [https://images-na.ssl-images-amazon.com/image...
34      1.0      True  ...    2  [https://images-na.ssl-images-amazon.com/image...
47      5.0      True  ...   20  [https://images-na.ssl-images-amazon.com/image...

[5 rows x 12 columns]


In [None]:
# Load the luxury-beauty reviews that have images and preview shape and first rows.
luxury_df = load_json('Luxury_Beauty_5.json')
print(luxury_df.shape)
print(luxury_df.head())

(617, 12)
     overall  verified  ... vote                                              image
68       5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
75       5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
86       5.0      True  ...  NaN  [https://images-na.ssl-images-amazon.com/image...
88       5.0      True  ...    9  [https://images-na.ssl-images-amazon.com/image...
104      5.0     False  ...    2  [https://images-na.ssl-images-amazon.com/image...

[5 rows x 12 columns]


## Download images

In [None]:
from tqdm import tqdm
import requests

In [None]:
def download_images(path, df, category):
    """Download every review image referenced in ``df`` into ``<path>/<category>``.

    Each file is named ``<row label>_<position in the row's URL list>.jpg``.
    Files that already exist are not fetched again, so the function can be
    re-run to resume an interrupted download.

    Args:
        path: Base directory under which the category folder is created.
        df: DataFrame whose ``image`` column holds a list of URLs per row.
        category: Name of the sub-folder that receives the images.

    Raises:
        requests.RequestException: If a request times out, fails, or returns
            an HTTP error status. Nothing is written for that URL, so a later
            run retries it instead of keeping an error page saved as ``.jpg``.
    """
    folder_path = os.path.join(path, category)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_path, exist_ok=True)

    for index in tqdm(df.index):
        url_list = df['image'].loc[index]
        for url_index, url in enumerate(url_list):
            file_path = os.path.join(folder_path, f'{index}_{url_index}.jpg')
            if os.path.exists(file_path):
                continue
            # A timeout keeps one stalled connection from hanging the loop.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            with open(file_path, 'wb') as handler:
                handler.write(response.content)

    # The number reported is the count of DataFrame rows (reviews); a row may
    # reference more than one image.
    print(f'{category}: {len(df.index)} images downloaded or already exist...')

In [None]:
# Fetch the beauty review images into the 'beauty' folder under `path`.
download_images(path, beauty_df, 'beauty')

100%|██████████| 98/98 [00:00<00:00, 2715.37it/s]

beauty: 98 images downloaded or already exist...





In [None]:
# Fetch the fashion review images into the 'fashion' folder under `path`.
download_images(path, fashion_df, 'fashion')

100%|██████████| 106/106 [00:00<00:00, 2684.24it/s]

fashion: 106 images downloaded or already exist...





In [None]:
# Fetch the luxury review images into the 'luxury' folder under `path`.
download_images(path, luxury_df, 'luxury')

100%|██████████| 617/617 [00:00<00:00, 2251.24it/s]

luxury: 617 images downloaded or already exist...





## Use pre-trained CNN

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.autograd import Variable
from PIL import Image

In [None]:
# Load the pretrained model
# NOTE(review): newer torchvision releases reportedly replace `pretrained=`
# with a `weights=` argument -- confirm against the installed version.
model = models.resnet18(pretrained=True)
# Use the model object to select the desired layer
# ('avgpool' is the layer that get_vector() hooks to capture its output).
layer = model._modules.get('avgpool')

Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /root/.cache/torch/hub/checkpoints/resnet18-5c106cde.pth


HBox(children=(FloatProgress(value=0.0, max=46827520.0), HTML(value='')))




In [None]:
# Set model to evaluation mode
# (the network is only used to extract features in this notebook, not trained).
model.eval()

In [None]:
# Preprocessing applied to every image before it is fed to the network.
# `transforms.Scale` is deprecated (it emits "please use transforms.Resize
# instead"), so use `Resize`, which takes the same (height, width) argument.
scaler = transforms.Resize((224, 224))
# Per-channel mean/std for the three RGB channels.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
to_tensor = transforms.ToTensor()

  "please use transforms.Resize instead.")


In [None]:
def get_vector(image_name):
    """Return the feature vector produced by the hooked 'avgpool' layer.

    Relies on the module-level ``model``, ``layer``, ``scaler``,
    ``to_tensor`` and ``normalize`` objects.

    Args:
        image_name: Path of the image file to embed.

    Returns:
        A 1-D NumPy array with 512 values.
    """
    # 1. Load the image with Pillow and force three channels: grayscale,
    #    palette or RGBA files would not match the 3-channel mean/std used by
    #    `normalize`. The context manager closes the file handle afterwards.
    with Image.open(image_name) as img:
        rgb_img = img.convert('RGB')
    # 2. Build the (1, C, H, W) input batch from the transformed image.
    t_img = normalize(to_tensor(scaler(rgb_img))).unsqueeze(0)
    # 3. Buffer that will hold the feature vector;
    #    the 'avgpool' layer has an output size of 512.
    my_embedding = torch.zeros(512)

    # 4. Hook that copies the layer output into the buffer.
    def copy_data(m, i, o):
        my_embedding.copy_(o.detach().reshape(o.size(1)))

    # 5. Attach the hook to the selected layer.
    h = layer.register_forward_hook(copy_data)
    try:
        # 6. Run the model; inference only, so skip autograd bookkeeping.
        with torch.no_grad():
            model(t_img)
    finally:
        # 7. Always detach the hook, even when the forward pass fails, so it
        #    cannot stay registered and fire during later calls.
        h.remove()
    # 8. Return the feature vector.
    return my_embedding.cpu().detach().numpy()

In [None]:
# Smoke test: print the feature vectors of the first two files that
# os.listdir returns for one category folder.
category = 'beauty'
sample_files = os.listdir(os.path.join(path, category))[:2]
for image_file in sample_files:
    image_path = os.path.join(path, category+'/'+image_file)
    print(f"{image_file} feature vectors")
    print(get_vector(image_path))

## Preprocess dataset

- Remove unnecessary columns
- Remove all other columns except `overall`, `reviewerID`, `asin`, `image` 
- Create new column with image filename

In [None]:
def add_image_filenames(category, df):
    """Replace the ``image`` URL lists with the matching local file paths.

    File names follow the ``<row label>_<position in URL list>.jpg`` scheme
    and are placed in the ``category`` folder under the module-level ``path``.

    Args:
        category: Name of the image sub-folder under ``path``.
        df: DataFrame with ``overall``, ``reviewerID``, ``asin`` and
            ``image`` columns; it is left unmodified.

    Returns:
        A new DataFrame with the columns ``overall``, ``reviewerID``,
        ``asin`` and ``image_filename`` (a list of paths per row), using the
        same index as ``df``.
    """
    folder = os.path.join(path, category)

    filenames = [
        [os.path.join(folder, f'{row_index}_{idx}.jpg') for idx in range(len(urls))]
        for row_index, urls in df['image'].items()
    ]

    # Build the result on an explicit copy: dropping/assigning columns on a
    # slice of the caller's frame triggers pandas' SettingWithCopyWarning.
    result = df[['overall', 'reviewerID', 'asin']].copy()
    result['image_filename'] = filenames

    return result


In [None]:
# Swap each category's URL lists for the local image paths written by
# download_images (same '<row label>_<position>.jpg' naming).
luxury_df = add_image_filenames('luxury', luxury_df)
beauty_df = add_image_filenames('beauty', beauty_df)
fashion_df = add_image_filenames('fashion', fashion_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Combine into one dataframe

- `luxury_df`, `fashion_df`, `beauty_df`

In [None]:
luxury_df.head()

Unnamed: 0,overall,reviewerID,asin,image_filename
68,5.0,A2BHOZILR7SY9,B000142FVW,[/content/drive/MyDrive/data/amazon_reviews/lu...
75,5.0,ACMSQCH1H7JZD,B000142FVW,[/content/drive/MyDrive/data/amazon_reviews/lu...
86,5.0,A2L77YQRAEA1YZ,B000142FVW,[/content/drive/MyDrive/data/amazon_reviews/lu...
88,5.0,A28W77RPDZK7AZ,B00014351Q,[/content/drive/MyDrive/data/amazon_reviews/lu...
104,5.0,A2IV70BWQBUF32,B00014351Q,[/content/drive/MyDrive/data/amazon_reviews/lu...


In [None]:
# Flatten the three category frames into one record per image file:
# (category, overall, reviewerID, asin, filename).
dataframe_list = [('luxury',luxury_df), ('beauty', beauty_df), ('fashion', fashion_df)]
data_list = [
    (category_name, overall, reviewer_id, asin, filename)
    for category_name, frame in dataframe_list
    for overall, reviewer_id, asin, filenames in zip(
        frame['overall'], frame['reviewerID'], frame['asin'], frame['image_filename']
    )
    for filename in filenames
]

In [None]:
# One row per image, labelled with its category and review metadata.
combined_columns = ['category', 'overall', 'reviewerID', 'asin', 'filename']
combined_df = pd.DataFrame(data_list, columns=combined_columns)
for summary in (combined_df.head(), combined_df.shape):
    print(summary)

  category  ...                                           filename
0   luxury  ...  /content/drive/MyDrive/data/amazon_reviews/lux...
1   luxury  ...  /content/drive/MyDrive/data/amazon_reviews/lux...
2   luxury  ...  /content/drive/MyDrive/data/amazon_reviews/lux...
3   luxury  ...  /content/drive/MyDrive/data/amazon_reviews/lux...
4   luxury  ...  /content/drive/MyDrive/data/amazon_reviews/lux...

[5 rows x 5 columns]
(1368, 5)


### Get image feature vectors

In [None]:
# Extract a feature vector for every image file, then persist the table as a
# tab-separated file. The vectors end up in the file as bracketed,
# space-separated text, which the loading cell later parses back.
combined_df['image_vec'] = combined_df['filename'].apply(get_vector)
dataset_csv = os.path.join(path, 'image_dataset.csv')
combined_df.to_csv(dataset_csv, sep='\t')

In [None]:
combined_df.head()

## K-means clustering

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import numpy as np

In [None]:
def check_vector(vector, size=512):
    """Convert a sequence of numeric tokens into a float array.

    Empty or whitespace-only tokens are discarded. They are an artifact of
    splitting the stored vector text on single spaces (values are padded with
    extra spaces for alignment), not real values; turning them into 0.0 would
    insert spurious zeros, shift the remaining features and push real values
    past the truncation point.

    Args:
        vector: Iterable of strings or numbers.
        size: Maximum number of values to keep (default 512).

    Returns:
        A 1-D float NumPy array with at most ``size`` values.
    """
    values = [float(token) for token in vector if str(token).strip()]
    return np.array(values, dtype=float)[:size]

In [None]:
# Reload the saved table. The image_vec column is stored as bracketed,
# whitespace-separated numbers whose values may be padded with extra spaces
# and wrapped over several lines. Splitting on any run of whitespace yields
# only real number tokens; splitting on a single space produced empty tokens
# for the padding.
df = pd.read_csv(os.path.join(path, 'image_dataset.csv'), sep='\t', index_col=0,
                 converters={"image_vec": lambda x: x.strip("[]").split()})


In [None]:
df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
0,luxury,5.0,A2BHOZILR7SY9,B000142FVW,/content/drive/MyDrive/data/amazon_reviews/lux...,"[5.75584233e-01, 7.38849044e-01, 1.01735726e-0..."
1,luxury,5.0,ACMSQCH1H7JZD,B000142FVW,/content/drive/MyDrive/data/amazon_reviews/lux...,"[1.58416998e+00, 1.06636274e+00, 1.33691028e-0..."
2,luxury,5.0,A2L77YQRAEA1YZ,B000142FVW,/content/drive/MyDrive/data/amazon_reviews/lux...,"[9.46014881e-01, 1.26953840e+00, 1.63240239e-0..."
3,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,/content/drive/MyDrive/data/amazon_reviews/lux...,"[0.861207, , , 0.44911623, 0.2592813, , 1.4042..."
4,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,/content/drive/MyDrive/data/amazon_reviews/lux...,"[1.2655932, , 0.3834534, , 1.149265, , , 1.561..."


In [None]:
# Turn each row's list of string tokens into a numeric feature array.
df['image_vec'] = df['image_vec'].apply(check_vector)

In [None]:
df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec
0,luxury,5.0,A2BHOZILR7SY9,B000142FVW,/content/drive/MyDrive/data/amazon_reviews/lux...,"[0.575584233, 0.738849044, 0.101735726, 0.2158..."
1,luxury,5.0,ACMSQCH1H7JZD,B000142FVW,/content/drive/MyDrive/data/amazon_reviews/lux...,"[1.58416998, 1.06636274, 0.133691028, 0.309149..."
2,luxury,5.0,A2L77YQRAEA1YZ,B000142FVW,/content/drive/MyDrive/data/amazon_reviews/lux...,"[0.946014881, 1.2695384, 0.163240239, 0.691329..."
3,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,/content/drive/MyDrive/data/amazon_reviews/lux...,"[0.861207, 0.0, 0.0, 0.44911623, 0.2592813, 0...."
4,luxury,5.0,A28W77RPDZK7AZ,B00014351Q,/content/drive/MyDrive/data/amazon_reviews/lux...,"[1.2655932, 0.0, 0.3834534, 0.0, 1.149265, 0.0..."


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=1234)
print(train_df.shape)
print(test_df.shape)

(1094, 6)
(274, 6)


In [None]:
# Stack the per-row feature vectors into one 2-D training matrix.
X_train = np.array(list(map(list, train_df['image_vec'].values)))

In [None]:
X_train[:3]

array([[0.59663069, 0.4942548 , 0.45976794, ..., 0.32996583, 0.72349626,
        0.16348897],
       [0.20204067, 0.80782998, 0.47467181, ..., 0.17086645, 0.26184279,
        1.39446735],
       [0.03232409, 1.05188787, 0.84250593, ..., 0.03921824, 0.13116719,
        2.16587114]])

In [None]:
# Cluster the training feature vectors into 3 groups.
# NOTE(review): 3 presumably mirrors the three product categories
# (luxury / beauty / fashion) -- confirm intent.
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_train)

In [None]:
kmeans.labels_

array([1, 2, 0, ..., 1, 1, 1], dtype=int32)

In [None]:
test_df.head()

### Evaluation

In [None]:
# Work on an independent copy so that adding a column does not trigger
# pandas' SettingWithCopyWarning on the split result.
test_df = test_df.copy()
# Assign every test vector to its nearest cluster in a single predict call
# instead of one call per row.
test_df['prediction'] = kmeans.predict(np.stack(test_df['image_vec'].tolist()))
test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec,prediction
266,luxury,5.0,A24YNYNC6QJNBN,B002B4540O,/content/drive/MyDrive/data/amazon_reviews/lux...,"[0.796785831, 1.94842029, 1.02035093, 0.158777...",2
1088,beauty,5.0,A245UNW3PI53NG,B0009RF9DW,/content/drive/MyDrive/data/amazon_reviews/bea...,"[1.44555044, 0.0705312714, 0.455866218, 1.2235...",2
488,luxury,4.0,A25QBCHO0KFT0P,B00B95PWYE,/content/drive/MyDrive/data/amazon_reviews/lux...,"[0.0354711264, 1.57379246, 0.111152746, 0.0373...",0
1087,beauty,5.0,A85ENSL5HBBZF,B0009RF9DW,/content/drive/MyDrive/data/amazon_reviews/bea...,"[0.9738507, 0.0, 0.3631986, 0.0, 1.566278, 0.0...",0
1152,beauty,4.0,A25QBCHO0KFT0P,B0010ZBORW,/content/drive/MyDrive/data/amazon_reviews/bea...,"[1.3862458, 0.0, 0.15757063, 2.012104, 0.0, 0....",0


In [None]:
test_df.groupby('category')['prediction'].count()

category
beauty      35
fashion     28
luxury     211
Name: prediction, dtype: int64

In [None]:
test_df.groupby('category').count()

Unnamed: 0_level_0,overall,reviewerID,asin,filename,image_vec,prediction
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
beauty,35,35,35,35,35,35
fashion,28,28,28,28,28,28
luxury,211,211,211,211,211,211


In [None]:
test_df.groupby('prediction').count()

Unnamed: 0_level_0,category,overall,reviewerID,asin,filename,image_vec
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,98,98,98,98,98,98
1,77,77,77,77,77,77
2,99,99,99,99,99,99


In [None]:
# For each cluster id, print the shape of the subset of luxury test rows
# that were assigned to it.
for cluster_id in (0, 1, 2):
    in_cluster = test_df.prediction == cluster_id
    is_luxury = test_df.category == 'luxury'
    print(test_df[in_cluster & is_luxury].shape)

(75, 7)
(64, 7)
(72, 7)


## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier that looks at the 3 closest training vectors.
neigh = KNeighborsClassifier(n_neighbors=3)

- `category`가 아닌 `overall`로 간단하게 평점 예측하기

In [None]:
# Target labels: the review rating (`overall`) of each training row.
y_train = train_df['overall'].values
y_train

array([4., 4., 5., ..., 5., 4., 5.])

In [None]:
# Fit the classifier on the image feature matrix with ratings as labels.
neigh.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [None]:
# Work on an independent copy so that overwriting the column does not trigger
# pandas' SettingWithCopyWarning on the split result.
test_df = test_df.copy()
# Predict the rating of every test image in a single call instead of one
# call per row; this replaces the earlier cluster ids in `prediction`.
test_df['prediction'] = neigh.predict(np.stack(test_df['image_vec'].tolist()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
test_df.head()

Unnamed: 0,category,overall,reviewerID,asin,filename,image_vec,prediction
266,luxury,5.0,A24YNYNC6QJNBN,B002B4540O,/content/drive/MyDrive/data/amazon_reviews/lux...,"[0.796785831, 1.94842029, 1.02035093, 0.158777...",5.0
1088,beauty,5.0,A245UNW3PI53NG,B0009RF9DW,/content/drive/MyDrive/data/amazon_reviews/bea...,"[1.44555044, 0.0705312714, 0.455866218, 1.2235...",5.0
488,luxury,4.0,A25QBCHO0KFT0P,B00B95PWYE,/content/drive/MyDrive/data/amazon_reviews/lux...,"[0.0354711264, 1.57379246, 0.111152746, 0.0373...",4.0
1087,beauty,5.0,A85ENSL5HBBZF,B0009RF9DW,/content/drive/MyDrive/data/amazon_reviews/bea...,"[0.9738507, 0.0, 0.3631986, 0.0, 1.566278, 0.0...",5.0
1152,beauty,4.0,A25QBCHO0KFT0P,B0010ZBORW,/content/drive/MyDrive/data/amazon_reviews/bea...,"[1.3862458, 0.0, 0.15757063, 2.012104, 0.0, 0....",5.0


In [None]:
test_df[test_df.overall == test_df.prediction].count()

category      174
overall       174
reviewerID    174
asin          174
filename      174
image_vec     174
prediction    174
dtype: int64

In [None]:
len(test_df)

274

In [None]:
# Accuracy: share of test rows whose predicted rating equals the true rating.
# Computed from the data rather than from hand-copied counts, so it stays
# correct when the split or the model changes.
(test_df['overall'] == test_df['prediction']).mean()

0.635036496350365