In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel,DistilBertTokenizer,DistilBertModel
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime
import os
import gc

In [2]:
# Sanity check: report whether PyTorch can see a CUDA device in this kernel.
print(torch.cuda.is_available())



True


In [3]:
# Read the transformed tweet table and discard the "Unnamed: 0" column
# (presumably an index that was written out when the CSV was saved).
df = pd.read_csv("src/transformed/data.csv").drop(columns=["Unnamed: 0"])

# For a quick dry run, subsample here, e.g. df = df.iloc[:10000]

# The raw text column is what gets embedded in the cells below.
textcol = df["content"]

df

Unnamed: 0,followers,content,retweet,troll
0,6.568078,US says seizes tanker used to evade North Kore...,0,0.0
1,1.386294,Is America the greatest country? Part 1 drops ...,0,0.0
2,10.497367,O’Reilly predicts THIS is What DOOMS Biden’s r...,0,0.0
3,7.794411,Corruption in Politics [HASH] [HASH] [LINK],0,0.0
4,9.529812,@kylegriffin1 Are you getting all the info on ...,0,0.0
...,...,...,...,...
2367603,2.639057,Like me. Share me. Follow me [LINK],0,1.0
2367604,3.178054,.the main thing is to was love [LINK],0,1.0
2367605,3.713572,Amazing! Found my script: Download ReBuild - W...,0,1.0
2367606,1.609438,Present Perfect [LINK],0,1.0


In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Encoder selector: only the exact value "DBert" picks DistilBERT. Any other
# value (including the current "DBerts") falls through to full BERT.
modeltype = "DBerts"

if modeltype == "DBert":
    tokenizer_cls, model_cls, checkpoint = DistilBertTokenizer, DistilBertModel, 'distilbert-base-cased'
else:
    tokenizer_cls, model_cls, checkpoint = BertTokenizer, BertModel, 'bert-base-cased'

# Load the pre-trained tokenizer and encoder weights for the chosen checkpoint.
tokenizer = tokenizer_cls.from_pretrained(checkpoint)
model = model_cls.from_pretrained(checkpoint)

# Put the weights on the selected device and switch to evaluation mode.
model = model.to(device)
model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [5]:
# Gate for the embedding loop below (`if flag:`): while this is False the
# loop is skipped, so re-running the notebook does not re-encode the text.
flag = False

The text content of the data is BERT-encoded and then saved as NumPy `.npy` files. This took approximately five hours.

In [6]:
# Release cached GPU blocks left over from earlier cells before embedding.
torch.cuda.empty_cache()
def vectorize_text(text, tokenizer, model):
    """Embed text as the mean of the encoder's last hidden layer.

    Args:
        text: A single string or a list of strings to embed.
        tokenizer: Callable returning a mapping of PyTorch tensors (at least
            ``input_ids``; ``attention_mask`` is used when present). It is
            called with ``return_tensors="pt"``, truncation to 512 tokens and
            ``padding=True``.
        model: Encoder whose output exposes ``last_hidden_state`` with shape
            ``(batch, tokens, hidden)``.

    Returns:
        numpy.ndarray of shape ``(batch, hidden)``; a single string yields
        ``(1, hidden)``.
    """
    # Tokenize input text and convert to tensors
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)

    # Send the inputs to wherever the model's weights actually live, instead of
    # assuming "cuda" whenever a GPU exists (the model may still be on the CPU).
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {k: v.to(first_param.device) for k, v in inputs.items()}
    else:
        inputs = dict(inputs.items())

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    mask = inputs.get("attention_mask")
    if mask is None:
        # No mask supplied: plain mean over the token axis.
        return hidden.mean(dim=1).cpu().numpy()

    # Mask-aware mean: padding positions must not dilute the average when a
    # batch holds texts of different lengths. For a single string there is no
    # padding, so this equals the plain mean.
    mask = mask.unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (hidden * mask).sum(dim=1) / token_counts

    return embeddings.cpu().numpy()


def print_gpu_memory():
    """Print the allocated and reserved CUDA memory of device 0, in GB."""
    bytes_per_gb = 1e9
    in_use_gb = torch.cuda.memory_allocated(0) / bytes_per_gb
    reserved_gb = torch.cuda.memory_reserved(0) / bytes_per_gb
    print(f"GPU Memory - Allocated: {in_use_gb:.2f} GB, Cached: {reserved_gb:.2f} GB")

# Number of .npy files the embeddings are split across, and rows per file.
batches = 8
batch_size = len(textcol) // batches
embedded_content = []

# Print a progress line once every this many rows.
progress_every = 2368
# Directory receiving one embedded_text{batch}.npy file per batch.
output_dir = "src/embedded"

if flag:
    # np.save does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)
    total_rows = len(textcol)

    for batch in range(batches):
        print(f"\rProcessing Batch {batch+1}/{batches}\n",end="",flush=True)
        start_idx = batch * batch_size
        # The final batch absorbs the remainder left by the integer division.
        end_idx = start_idx + batch_size if batch < batches - 1 else total_rows

        # Slice the text Series directly so this cell still works after the
        # "content" column has been dropped from df further down.
        batch_text = textcol.iloc[start_idx:end_idx]
        # Bytes -> megabytes; deep=True counts the string payloads, not just
        # the object pointers.
        print("Current Batch memory use:", batch_text.memory_usage(deep=True) / 1e6, "MB")

        embedded_content = []
        for idx, value in enumerate(batch_text, start=start_idx):
            if idx % progress_every == 0:
                print(f"\r{idx}/{total_rows} -- {round((idx/total_rows)*100,4)}%", end='', flush=True)
            embedded_content.append(vectorize_text(value, tokenizer, model))
        # Same 1-based numbering as the "Processing Batch" line above.
        print(f"Batch {batch+1}/{batches} Complete -- Time: {datetime.now()}\n")

        # One array per row is stacked here; the loading cells squeeze axis 1.
        embedded_content_array = np.array(embedded_content)
        np.save(f'{output_dir}/embedded_text{batch}.npy', embedded_content_array)

        torch.cuda.empty_cache()

    # Disarm the guard only after every batch has been written.
    flag = False



# df.to_csv("src/transformed/bertdata.csv", index=False)

In [14]:
# Re-split the saved embeddings into many smaller .npy chunks.
# Target number of chunks; a final short chunk is added for any remainder.
batches = 300

source_dir = "src/embedded/"
target_dir = "src/embedded_ef/"

# os.listdir returns names in arbitrary order. Sort by the number embedded in
# the file name so rows are concatenated in the order they were written and
# stay aligned with the source dataframe.
npy_files = sorted(
    (name for name in os.listdir(source_dir) if name.endswith(".npy")),
    key=lambda name: (int("".join(filter(str.isdigit, name)) or 0), name),
)
if not npy_files:
    raise FileNotFoundError(f"No .npy files found in {source_dir}")

npylist = [np.load(os.path.join(source_dir, name)) for name in npy_files]
cols = np.concatenate(npylist, axis=0)
del npylist

# Drop the singleton axis so each row is one embedding vector.
cols = np.squeeze(cols, axis=1)
# Chunk size comes from the array actually being split; at least 1 so the
# division below is safe for tiny inputs.
batch_size = max(1, len(cols) // batches)
gc.collect()

# np.save does not create missing directories.
os.makedirs(target_dir, exist_ok=True)

num_batches = int(np.ceil(len(cols) / batch_size))
for i in range(num_batches):
    print(f"\rProcessing Batch {i}\n",end="",flush=True)
    # Calculate start and end indices of the current chunk
    start_idx = i * batch_size
    end_idx = start_idx + batch_size

    # Slicing past the end is safe; the last chunk is simply shorter.
    batch_data = cols[start_idx:end_idx]

    # One file per chunk, named by its index
    batch_filename = os.path.join(target_dir, f"batch_{i}.npy")
    np.save(batch_filename, batch_data)

Processing Batch 0
Processing Batch 1
Processing Batch 2
Processing Batch 3
Processing Batch 4
Processing Batch 5
Processing Batch 6
Processing Batch 7
Processing Batch 8
Processing Batch 9
Processing Batch 10
Processing Batch 11
Processing Batch 12
Processing Batch 13
Processing Batch 14
Processing Batch 15
Processing Batch 16
Processing Batch 17
Processing Batch 18
Processing Batch 19
Processing Batch 20
Processing Batch 21
Processing Batch 22
Processing Batch 23
Processing Batch 24
Processing Batch 25
Processing Batch 26
Processing Batch 27
Processing Batch 28
Processing Batch 29
Processing Batch 30
Processing Batch 31
Processing Batch 32
Processing Batch 33
Processing Batch 34
Processing Batch 35
Processing Batch 36
Processing Batch 37
Processing Batch 38
Processing Batch 39
Processing Batch 40
Processing Batch 41
Processing Batch 42
Processing Batch 43
Processing Batch 44
Processing Batch 45
Processing Batch 46
Processing Batch 47
Processing Batch 48
Processing Batch 49
Processing

In [7]:
# Load every saved embedding batch and stack them into one matrix.
embedded_dir = "src/embedded/"

# os.listdir returns names in arbitrary order. Sort by the number embedded in
# the file name so the stacked rows line up with the rows of df.
npy_files = sorted(
    (name for name in os.listdir(embedded_dir) if name.endswith(".npy")),
    key=lambda name: (int("".join(filter(str.isdigit, name)) or 0), name),
)
if not npy_files:
    raise FileNotFoundError(f"No .npy files found in {embedded_dir}")

npylist = [np.load(os.path.join(embedded_dir, name)) for name in npy_files]

cols = np.concatenate(npylist, axis=0)
# Drop the singleton axis so each row is one embedding vector.
cols = np.squeeze(cols, axis=1)
cols

array([[ 0.12959589, -0.12403097, -0.23677471, ...,  0.10255872,
         0.28068584,  0.16638151],
       [ 0.18695916,  0.02634201, -0.32440275, ...,  0.45099917,
         0.32563406,  0.5358391 ],
       [ 0.17337924, -0.04668039, -0.04070751, ...,  0.17692627,
         0.35963893,  0.35065055],
       ...,
       [ 0.19144298,  0.10686906,  0.11784254, ...,  0.13194168,
         0.17162265,  0.2644113 ],
       [ 0.16193017,  0.01979022,  0.04998269, ...,  0.2538196 ,
         0.4808616 ,  0.3523853 ],
       [ 0.30106783, -0.06647517,  0.16155279, ...,  0.09048916,
         0.32095256,  0.29013553]], dtype=float32)

In [11]:
# Dimensions of the stacked embedding array.
cols.shape

(2367608, 768)

In [11]:
# Scratch arithmetic: floor-divide 309000 by 15784.0533...
# NOTE(review): the divisor looks like 2367608 / 150 (rows per chunk for 150
# chunks), which would make this the chunk index of row 309000 — confirm the
# intent or delete this cell.
309000 // 15784.053333333333


19.0

In [13]:
# Scratch check: 2367608 divided by 295951.
# NOTE(review): presumably total rows / rows per batch for the 8-batch
# split — confirm the intent or delete this cell.
2367608/295951

8.0

In [10]:
# Turn the embedding matrix into a dataframe with one column per dimension
# (columns get the default integer labels).
bertdf = pd.DataFrame(cols)

# Drop the raw text column. errors="ignore" makes the cell safe to re-run once
# the column is already gone, without a bare except hiding unrelated failures.
df = df.drop(columns=["content"], errors="ignore")

# concat(axis=1) aligns on the index, so a row-count mismatch would pad with
# NaN rather than fail. Stop here instead of building a corrupt table.
if len(df) != len(bertdf):
    raise ValueError(
        f"Row count mismatch: df has {len(df)} rows but the embeddings have {len(bertdf)}"
    )

# Metadata columns followed by the embedding columns
ml_df = pd.concat([df, bertdf], axis=1)


ml_df

Unnamed: 0,followers,retweet,troll,0,1,2,3,4,5,6,...,758,759,760,761,762,763,764,765,766,767
0,6.568078,0,0.0,0.129596,-0.124031,-0.236775,-0.050911,-0.166823,-0.048487,0.292008,...,-0.054018,0.150843,0.101452,-0.079402,-0.082468,0.165584,-0.004423,0.102559,0.280686,0.166382
1,1.386294,0,0.0,0.186959,0.026342,-0.324403,0.347149,-0.190089,-0.056083,0.222135,...,-0.340879,0.120872,-0.021201,-0.131148,-0.264621,-0.079073,-0.026709,0.450999,0.325634,0.535839
2,10.497367,0,0.0,0.173379,-0.046680,-0.040708,0.179813,-0.136966,-0.203053,0.199426,...,-0.150082,0.010410,0.023045,-0.041895,-0.121329,-0.108572,-0.124718,0.176926,0.359639,0.350651
3,7.794411,0,0.0,0.199814,0.083041,-0.156868,0.174393,-0.325946,-0.297179,0.212811,...,-0.148489,0.149483,0.035641,0.031641,-0.206473,0.101055,-0.115431,0.277113,0.364696,0.317693
4,9.529812,0,0.0,0.236709,-0.057754,-0.153811,0.104392,0.017117,-0.066983,0.228630,...,-0.236277,-0.026202,0.002557,-0.177493,-0.050488,-0.022349,0.021818,0.371962,0.221054,0.340444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2367603,2.639057,0,1.0,0.233419,-0.023126,0.117879,0.057869,0.147223,-0.075588,0.261743,...,-0.138552,0.004331,-0.032524,-0.237894,0.112055,0.045284,0.275680,0.210385,0.505846,0.109065
2367604,3.178054,0,1.0,0.152636,-0.035945,-0.032915,-0.007868,0.391450,-0.044776,0.244196,...,0.076266,-0.060051,-0.031566,0.065074,0.013659,0.108144,0.030948,0.268713,0.257801,0.086261
2367605,3.713572,0,1.0,0.191443,0.106869,0.117843,0.074146,0.003050,-0.161729,0.263482,...,-0.014377,-0.019059,-0.005819,-0.248802,-0.220177,-0.290386,0.211530,0.131942,0.171623,0.264411
2367606,1.609438,0,1.0,0.161930,0.019790,0.049983,-0.056537,0.099022,-0.058965,0.277467,...,-0.056340,0.152280,0.168008,0.060415,-0.054297,0.135931,0.185269,0.253820,0.480862,0.352385
