In [67]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel,DistilBertTokenizer,DistilBertModel
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime


In [68]:
print(torch.cuda.is_available())



True


In [69]:
df = pd.read_csv("src/transformed/data.csv")
# df = df.iloc[:10000]
textcol = df['content']
textcol

0          US says seizes tanker used to evade North Kore...
1          Is America the greatest country? Part 1 drops ...
2          O’Reilly predicts THIS is What DOOMS Biden’s r...
3               Corruption in Politics [HASH] [HASH]  [LINK]
4          @kylegriffin1 Are you getting all the info on ...
                                 ...                        
2367603                  Like me. Share me. Follow me [LINK]
2367604                .the main thing is to was love [LINK]
2367605    Amazing! Found my script: Download ReBuild - W...
2367606                               Present Perfect [LINK]
2367607                             GoPro SummerStory [LINK]
Name: content, Length: 2367608, dtype: object

In [70]:
# Ensure CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



modeltype = "DBert"
# Load a pre-trained BERT model and tokenizer

if modeltype == "DBert":
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
    model = DistilBertModel.from_pretrained('distilbert-base-cased')
else:
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    model = BertModel.from_pretrained('bert-base-cased')


# Move the model to the GPU
model = model.to(device)
model.eval()


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [71]:
torch.cuda.empty_cache()
def vectorize_text(text, tokenizer, model):
    # Tokenize input text and convert to tensor
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    
    # Move inputs to the same device as the model
    if torch.cuda.is_available():
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
    
    # Get embeddings
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Pool the outputs and convert to numpy array
    # Here, we simply take the mean of the second to last hidden layer
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    
    return embeddings


def print_gpu_memory():
    allocated = torch.cuda.memory_allocated(0)
    cached = torch.cuda.memory_reserved(0)
    print(f"GPU Memory - Allocated: {allocated / 1e9:.2f} GB, Cached: {cached / 1e9:.2f} GB")

batches = 8
batch_size = len(textcol) // batches
embedded_content = []

for batch in range(batches):
    print(f"\rProcessing Batch {batch+1}/{batches}\n",end="",flush=True)
    start_idx = batch * batch_size
    end_idx = start_idx + batch_size if batch < batches - 1 else len(textcol)
    #Defining dataframe and embedded vector. Will combine these and save as pyarrow file
    dfbatch = df.iloc[start_idx:end_idx]
    print("Current Batch memory use:",dfbatch['content'].memory_usage()/1000,"MB")
    embedded_content = []
    for idx, value in enumerate(textcol.iloc[start_idx:end_idx], start=start_idx):
        if idx % 2368 == 0:
            print(f"\r{idx}/{len(textcol)} -- {round((idx/len(textcol))*100,4)}%", end='', flush=True)
        embedded_content.append(vectorize_text(value, tokenizer, model))
    print(f"Batch {batch} Complete -- Time: {datetime.now()}\n")
    embedded_content_array = np.array(embedded_content)
    np.save(f'src/embedded/embedded_text{batch}.npy', embedded_content_array)

    torch.cuda.empty_cache()



# df.to_csv("src/transformed/bertdata.csv", index=False)

Processing Batch 1/8
Current Batch memory use: 2367.74 MB
0/2367608 -- 0.0%

293632/2367608 -- 12.4021%Batch 0 Complete -- Time: 2024-03-07 21:51:43.400905

Processing Batch 2/8
Current Batch memory use: 2367.74 MB
589632/2367608 -- 24.9041%Batch 1 Complete -- Time: 2024-03-07 22:36:11.705035

Processing Batch 3/8
Current Batch memory use: 2367.74 MB
885632/2367608 -- 37.4062%Batch 2 Complete -- Time: 2024-03-07 23:24:02.391604

Processing Batch 4/8
Current Batch memory use: 2367.74 MB
1181632/2367608 -- 49.9083%Batch 3 Complete -- Time: 2024-03-08 00:16:01.508708

Processing Batch 5/8
Current Batch memory use: 2367.74 MB
1302400/2367608 -- 55.0091%

36424.73846153846