# Extract Posting Description Embeddings using BERT (without Fine-Tuning BERT)

Inspiration: Jay Alammar's notebook [A Visual Notebook to Using BERT for the First Time](https://github.com/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb)

# 0. Setup

### Installs

In [1]:
# install huggingface
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 8.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 50.8MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 48.1MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


### Imports

In [2]:
# import HuggingFace models
# DistilBert is a smaller model so we can run and train faster
from transformers import DistilBertModel, DistilBertTokenizer

# only used to inspect the data files
import pandas as pd 

# PyTorch: needed to build the input tensors and run the model
import torch

# numerical array utilities
import numpy as np

### Model and Tokenizer setup

In [3]:
# Checkpoint identifier shared by the model and its tokenizer.
MODEL_NAME = "distilbert-base-uncased"

# Load the pretrained model and the matching tokenizer.
# The tokenizer is told to append padding on the right-hand side.
model = DistilBertModel.from_pretrained(MODEL_NAME)
tokenizer = DistilBertTokenizer.from_pretrained(
    MODEL_NAME,
    padding_side="right",
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




# 1. Data

### Preprocess data

In [4]:
# mount to drive (that's where the data is)
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


Get metadata

In [5]:
import csv

# Location of the training metadata on the mounted Google Drive.
TRAIN_CSV_PATH = "/content/drive/My Drive/UT/NN/project/data/train.csv"

# The csv module expects file objects to be opened with newline="" so that
# line breaks embedded inside quoted fields are parsed correctly.
with open(TRAIN_CSV_PATH, "r", encoding="utf8", newline="") as f:
    # One plain dict per CSV row, keyed by the column names from the header row.
    metadata = [dict(row) for row in csv.DictReader(f, skipinitialspace=True)]

In [6]:
# Number of rows read from train.csv.
len(metadata)

34250

In [7]:
# Inspect the first record to see which fields each row carries.
metadata[0]

{'image': '0000a68812bc7e98c42888dfb1c07da0.jpg',
 'image_phash': '94974f937d4c2433',
 'label_group': '249114794',
 'posting_id': 'train_129225211',
 'title': 'Paper Bag Victoria Secret'}

For now, only get the titles (the product descriptions). We take only the first 1000 because we're just testing BERT out.

In [8]:
# Keep just the title text of the first 1000 records.
data = [record["title"] for record in metadata[:1000]]

In [9]:
# Peek at the first three titles.
data[:3]

['Paper Bag Victoria Secret',
 'Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DOUBLE FOAM TAPE',
 'Maling TTS Canned Pork Luncheon Meat 397 gr']

Tokenize and encode our descriptions. 

In [10]:
# Tokenize every title in one call, with padding and truncation enabled.
encodings = tokenizer(data, padding=True, truncation=True)

# Show which fields the tokenizer produced, then the first title's entry in each.
# In the token ids, 101 = [CLS] and 102 = [SEP].
print(encodings.keys())
for field in ("input_ids", "attention_mask"):
    print(encodings[field][0])

dict_keys(['input_ids', 'attention_mask'])
[101, 3259, 4524, 3848, 3595, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# 2. Extract Embeddings / Features

### Pass inputs through model

In [11]:
# Convert the token ids and attention masks into tensors for the model.
masks = torch.tensor(encodings["attention_mask"])
inputs = torch.tensor(encodings["input_ids"])

# Forward pass only: no gradients are needed because BERT is not being trained here.
with torch.no_grad():
    last_hidden_states = model(input_ids=inputs, attention_mask=masks)

### Get Features / Embeddings

In [12]:
# Take the first element of the model output and slice it as
# [:, 0, :] = [all descriptions, [CLS] token, all hidden unit outputs].
cls_states = last_hidden_states[0][:, 0, :]

# Convert to a NumPy array for downstream use.
features = cls_states.numpy()

Embedding for first description:

In [13]:
# Embedding vector extracted for the first description.
features[0]

array([-1.73107952e-01, -6.27488866e-02, -4.82059643e-02, -1.72041208e-02,
       -6.08978607e-02, -6.15112074e-02,  2.01431543e-01,  2.76894182e-01,
       -2.46687844e-01,  6.47843000e-04,  9.25970152e-02, -1.14208281e-01,
       -5.75973801e-02,  2.51374096e-01,  5.01818135e-02,  3.23558524e-02,
       -1.74137130e-01,  2.33021215e-01,  3.65431726e-01, -1.32005453e-01,
        9.27552432e-02, -2.13548467e-01, -1.54527038e-01, -1.39038295e-01,
       -1.24754876e-01, -2.95492988e-02, -1.09178111e-01,  2.82701831e-02,
        1.98081866e-01,  1.15455814e-01,  8.01238865e-02, -6.57693222e-02,
        5.78293726e-02, -6.91585690e-02,  1.52884051e-01, -6.65585250e-02,
        1.09637976e-01, -9.37690958e-02,  6.80252835e-02,  1.66997030e-01,
       -5.30317537e-02,  5.04702665e-02,  1.72082394e-01, -1.04389554e-02,
        2.48542856e-02, -1.16254359e-01, -1.82346272e+00,  3.44907157e-02,
       -1.14456698e-01, -2.05303222e-01,  1.37109250e-01,  2.47520977e-04,
        2.13381007e-01,  