In [1]:
import numpy as np
import pandas as pd
import csv
import pickle
import re
import time
from datetime import timedelta
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import *

import utils.gen_utils as utils
from utils.data_utils import MyMapDataset
import os
from pathlib import Path
import utils.dataset_processors as dataset_processors

# sys.path.insert(0, os.getcwd())

# Timestamp taken when the setup cell runs.
start = time.time()

# Select the compute device once; later cells read DEVICE.
if not torch.cuda.is_available():
    DEVICE = torch.device("cpu")
    print("running on cpu")
else:
    DEVICE = torch.device("cuda")
    print("GPU found (", torch.cuda.get_device_name(torch.cuda.current_device()), ")")
    torch.cuda.set_device(torch.cuda.current_device())
    print("num device avail: ", torch.cuda.device_count())


def extract_bert_features(input_ids, mode, n_hl):
    """Pool per-layer embeddings for one batch and append them to ``hidden_features``.

    Runs the module-level ``model`` on ``input_ids`` and reads element 2 of its
    output (all hidden states, per the original author's note). Entry 0 of that
    sequence is skipped; entries ``1..n_hl`` are pooled according to the
    module-level ``embed_mode``:

    * ``"cls"``  -- take position 0 along axis 1 (the <CLS> output),
    * ``"mean"`` -- average over axis 1.

    Args:
        input_ids: batch handed straight to ``model``.
        mode: not used by this function; kept so existing callers keep working.
            Pooling is selected by the global ``embed_mode``, not this argument.
        n_hl: number of hidden layers to extract.

    Returns:
        The module-level ``hidden_features`` list, after one array stacking the
        ``n_hl`` pooled layer outputs for this batch has been appended to it.

    Raises:
        ValueError: if ``embed_mode`` is neither ``"cls"`` nor ``"mean"``.
            Previously an unsupported value silently appended an empty array.
    """
    # Fail before the (expensive) forward pass instead of producing empty output.
    if embed_mode not in ("cls", "mean"):
        raise ValueError(
            "embed_mode must be 'cls' or 'mean', got {!r}".format(embed_mode)
        )

    bert_output = model(input_ids)
    layer_outputs = bert_output[2]

    pooled_layers = []
    for layer_idx in range(1, n_hl + 1):
        layer_out = layer_outputs[layer_idx]
        if embed_mode == "cls":
            pooled_layers.append(layer_out[:, 0, :].cpu().numpy())
        else:  # "mean"
            pooled_layers.append(layer_out.cpu().numpy().mean(axis=1))

    hidden_features.append(np.array(pooled_layers))
    return hidden_features


def get_model(embed):
    """Load the pretrained language model and tokenizer named by ``embed``.

    Args:
        embed: one of ``"bert-base"``, ``"bert-large"``, ``"albert-base"``,
            ``"albert-large"``.

    Returns:
        A tuple ``(model, tokenizer, n_hl, hidden_dim)`` where ``n_hl`` and
        ``hidden_dim`` are the layer count and hidden size recorded for that
        configuration in the table below.

    Raises:
        ValueError: if ``embed`` is not a supported name. Previously this fell
            through every branch and failed with an UnboundLocalError on MODEL.
    """
    # name -> (model family, n_hl, hidden_dim, pretrained weights shortcut)
    specs = {
        "bert-base": ("bert", 12, 768, "bert-base-uncased"),
        "bert-large": ("bert", 24, 1024, "bert-large-uncased"),
        "albert-base": ("albert", 12, 768, "albert-base-v2"),
        "albert-large": ("albert", 24, 1024, "albert-large-v2"),
    }
    if embed not in specs:
        raise ValueError(
            "Unsupported embed {!r}; expected one of {}".format(embed, sorted(specs))
        )
    family, n_hl, hidden_dim, pretrained_weights = specs[embed]

    # Resolve the classes only after validation so a bad name fails cleanly.
    if family == "bert":
        model_class, tokenizer_class = BertModel, BertTokenizer
    else:
        model_class, tokenizer_class = AlbertModel, AlbertTokenizer

    # load the LM model and tokenizer from the HuggingFace Transformers library;
    # output_hidden_states=True exposes every layer's output to the caller.
    model = model_class.from_pretrained(
        pretrained_weights, output_hidden_states=True
    )
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights, do_lower_case=True)

    return model, tokenizer, n_hl, hidden_dim


running on cpu


In [2]:
# Run configuration read by the cells below.
dataset = "reddit"
embed = "albert-base"  # key looked up by get_model
embed_mode = "mean"  # pooling used by extract_bert_features: "cls" or "mean"
mode = ""  # forwarded to MyMapDataset
token_length = 10  # forwarded to MyMapDataset
batch_size = 32
op_dir = "pk1_data"  # directory the pickle is written to

print("{} | {} | {} | {} | {}".format(dataset, embed, token_length, mode, embed_mode))

reddit | albert-base | 10 |  | mean


In [3]:
# Load the pretrained model and tokenizer selected by `embed`, together with
# the layer count and hidden size that get_model returns for it.
model, tokenizer, n_hl, hidden_dim = get_model(embed)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.decoder.weight', 'predictions.LayerNorm.bias', 'predictions.decoder.bias', 'predictions.dense.weight', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Input CSV for this run. An earlier assignment of "data/reddit/5kdata5.csv"
# was removed: it was overwritten on the very next line and never used.
reddit_file = "data/reddit/4mildata_uncleaned.csv"

# Time only the dataset construction. A dedicated name is used so the `start`
# timestamp taken in the setup cell is not overwritten.
load_start = time.time()
map_dataset = MyMapDataset(dataset, tokenizer, token_length, DEVICE, mode, reddit_file)
print(f"Took {int(time.time() - load_start)} seconds")

params: dataset --> reddit
E :  0    8881
1    2483
Name: E, dtype: int64
N :  1    10562
0      802
Name: N, dtype: int64
F :  1    6960
0    4404
Name: F, dtype: int64
J :  0    6790
1    4574
Name: J, dtype: int64
Length of df: 11364


In [5]:
# Shuffling is off, so batches are drawn in dataset order.
data_loader = DataLoader(map_dataset, batch_size=int(batch_size), shuffle=False)

In [6]:
# Move the model to the GPU when the setup cell selected one, and report usage.
if DEVICE == torch.device("cuda"):
    model = model.cuda()
    print("\ngpu mem alloc: ", round(torch.cuda.memory_allocated() * 1e-9, 2), " GB")

print("starting to extract LM embeddings...")

# Per-batch accumulators: extract_bert_features appends to hidden_features,
# the extraction loop appends to the other two.
hidden_features, all_targets, all_author_ids = [], [], []


starting to extract LM embeddings...


In [7]:
print("Data loader batches:", len(data_loader))
print("Batch size:", batch_size)

# Start from empty accumulators so that re-running this cell does not append a
# second copy of every batch. extract_bert_features appends to the global
# `hidden_features`, so rebinding the name here is what it will write to.
hidden_features = []
all_targets = []
all_author_ids = []

# get bert embedding for each input; inference only, so gradients stay off
# for the whole loop.
with torch.no_grad():
    for author_ids, input_ids, targets in data_loader:
        all_targets.append(targets.cpu().numpy())
        all_author_ids.append(author_ids.cpu().numpy())
        extract_bert_features(input_ids, mode, n_hl)

In [8]:
description = "4mil_uncleaned"
Path(op_dir).mkdir(parents=True, exist_ok=True)
pkl_file_name = dataset + "-" + embed + "-" + embed_mode + "-" + description + ".pkl"

# `with` closes the handle even if pickling fails part-way through.
# The zip is materialised into a list so the pickle holds concrete
# (author_ids, hidden_features, targets) tuples, one per batch, rather than a
# one-shot iterator; consumers that iterate over the loaded object are unaffected.
with open(os.path.join(op_dir, pkl_file_name), "wb") as file:
    pickle.dump(list(zip(all_author_ids, hidden_features, all_targets)), file)

print("extracting embeddings for {} dataset: DONE!".format(dataset))


extracting embeddings for reddit dataset: DONE!
