<a href="https://colab.research.google.com/github/lfreedom2750/lfreedom2750/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
os.kill(os.getpid(), 9)


In [1]:
import pandas as pd
import numpy as np
import re
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import gc

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True,
                                  )

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [5]:
import re

file_fake = '/content/Fake.csv'
file_true = '/content/True.csv'

data_fake = pd.read_csv(file_fake)
data_true = pd.read_csv(file_true)

data_fake['label'] = 1
data_true['label'] = 0

data_combined = pd.concat([data_fake, data_true], ignore_index=True)

def clean_text(text):
    text = re.sub(r"\\", "", text)
    text = re.sub(r"\'", "", text)
    text = re.sub(r"\"", "", text)
    return text.strip().lower()

data_combined['text'] = data_combined['title'] + " " + data_combined['text']
data_combined['text'] = data_combined['text'].apply(clean_text)

texts = data_combined['text'].values
labels = data_combined['label'].values

In [6]:
def bert_text_preparation(text, tokenizer, max_len=512):
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    tokenized_text = tokenized_text[:max_len-2]  # Truncate for [CLS] and [SEP]
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(indexed_tokens)

    padding_length = max_len - len(indexed_tokens)
    indexed_tokens += [0] * padding_length
    segments_ids += [0] * padding_length

    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segments_tensors = torch.tensor([segments_ids]).to(device)

    return tokens_tensor, segments_tensors

In [7]:
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]  # All hidden states

    token_embeddings = hidden_states[-1]  # Last layer
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

In [8]:
def create_dataloader(texts, tokenizer, batch_size=4, max_len=512):
    input_ids = []
    segment_ids = []

    for text in tqdm(texts, desc="Preparing data"):
        tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer, max_len)
        input_ids.append(tokens_tensor.squeeze(0))
        segment_ids.append(segments_tensors.squeeze(0))

    input_ids = torch.stack(input_ids)
    segment_ids = torch.stack(segment_ids)

    dataset = TensorDataset(input_ids, segment_ids)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    return dataloader

In [9]:
# Tạo word embeddings từ mô hình BERT
def process_texts_in_batches(texts, tokenizer, model, batch_size=4, max_len=512):
    model.eval()

    num_batches = len(texts) // batch_size + (1 if len(texts) % batch_size > 0 else 0)

    for i in tqdm(range(num_batches), desc="Processing Batches"):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, len(texts))
        batch_texts = texts[start_idx:end_idx]
        batch_tokens = []
        batch_segments = []

        for text in batch_texts:
            tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer, max_len)
            batch_tokens.append(tokens_tensor)
            batch_segments.append(segments_tensors)

        batch_tokens = torch.cat(batch_tokens, dim=0)
        batch_segments = torch.cat(batch_segments, dim=0)

        # Tạo embeddings cho batch hiện tại
        with torch.no_grad():
            outputs = model(batch_tokens, batch_segments)
            hidden_states = outputs[2]  # All hidden states
            token_embeddings = hidden_states[-1]  # Last layer
            token_embeddings = torch.squeeze(token_embeddings, dim=1)

            # Lưu embeddings vào Google Drive trực tiếp để giảm tiêu thụ bộ nhớ
            batch_embeddings = [token_embed.tolist() for token_embed in token_embeddings]
            np.save(f'/content/drive/MyDrive/bert_embeddings_batch_{i}.npy', batch_embeddings)

        # Giải phóng bộ nhớ sau khi xử lý từng batch
        del batch_tokens, batch_segments, token_embeddings, batch_embeddings
        gc.collect()

    # Lưu danh sách các tệp embeddings nếu cần
    # Bạn có thể kết hợp các tệp này lại sau khi hoàn thành

# Xử lý dữ liệu và lưu kết quả vào Google Drive
process_texts_in_batches(texts, tokenizer, model, batch_size=4)

Processing Batches:  11%|█         | 1188/11225 [2:52:47<24:19:48,  8.73s/it]


KeyboardInterrupt: 

In [11]:
import os

drive.mount('/content/drive')

# Thư mục lưu các tệp nhúng
embedding_dir = '/content/drive/MyDrive/'

# Liệt kê tất cả các tệp nhúng trong thư mục
embedding_files = [f for f in os.listdir(embedding_dir) if f.startswith('bert_embeddings_batch') and f.endswith('.npy')]

# Đọc và in các vector nhúng từ các tệp
for embedding_file in embedding_files:
    file_path = os.path.join(embedding_dir, embedding_file)
    embeddings = np.load(file_path, allow_pickle=True)
    print(f"Embeddings from {embedding_file}:")
    print(embeddings)
    print("\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
   -0.05970227]
  [ 0.66621292  0.2912305   0.52635485 ... -0.60061538  0.76977581
   -0.12359751]
  ...
  [ 0.00771046  0.05383465  0.26294014 ...  0.03356766  0.05422965
   -0.30429673]
  [ 0.17281468  0.07672158  0.12090909 ...  0.15755706  0.00344646
   -0.4328025 ]
  [ 0.22913858 -0.18153153  0.34811985 ...  0.01161112 -0.00927681
   -0.43733791]]]


Embeddings from bert_embeddings_batch_1102.npy:
[[[-5.13133228e-01 -4.66227740e-01 -4.53999609e-01 ... -4.80062366e-01
    1.92069232e-01  4.96130705e-01]
  [-5.96582294e-01 -2.30517089e-02 -1.97313160e-01 ... -2.37878010e-01
    4.89433587e-01  1.61241457e-01]
  [-4.29223806e-01 -6.06705248e-01  1.03482270e+00 ... -5.00021338e-01
    9.87043977e-02  8.33755493e-01]
  ...
  [-3.06893617e-01 -8.64084423e-01 -8.23002309e-02 ...  3.08297485e-01
    5.80722094e-01 -2.92894900e-01]
  [-1.72102556e-01  5.03696278e-02  3.74154985e-01 ... -1.01837844e-01
    5.29801880e-04 -1.66