<a href="https://colab.research.google.com/github/jaehoonnie/NLP1/blob/main/BERT1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers



In [1]:
from transformers import BertTokenizer, BertModel

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # For uncased English
model = BertModel.from_pretrained('bert-base-uncased')

# Check if the model is loaded successfully
print("BERT model and tokenizer loaded.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


BERT model and tokenizer loaded.


In [2]:
# Input text
text = "BERT is amazing!"

# Tokenize text
inputs = tokenizer(text, return_tensors="pt")
print(f"Tokenized input IDs: {inputs['input_ids']}")
print(f"Attention mask: {inputs['attention_mask']}")

Tokenized input IDs: tensor([[  101, 14324,  2003,  6429,   999,   102]])
Attention mask: tensor([[1, 1, 1, 1, 1, 1]])


In [3]:
# Run input through BERT model
outputs = model(**inputs)

# Outputs
print(f"Last hidden state shape: {outputs.last_hidden_state.shape}")

Last hidden state shape: torch.Size([1, 6, 768])


In [4]:
import torch

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move model and inputs to GPU
model = model.to(device)
inputs = {key: val.to(device) for key, val in inputs.items()}

# Run the model on GPU
outputs = model(**inputs)
print(f"Output shape on GPU: {outputs.last_hidden_state.shape}")

Using device: cuda
Output shape on GPU: torch.Size([1, 6, 768])


In [5]:
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')
# Download NLTK tokenizer if not already done
nltk.download('punkt')

# Load text from file
file_path = "/content/drive/MyDrive/Colab Notebooks/911.txt"
with open(file_path, 'r') as f:
    text = f.read()

# Tokenize text
tokens = word_tokenize(text)

# Define car-related terms and hard words (examples, can be extended)
car_related_terms = [
    "Porsche", "911", "hybrid", "turbo", "turbocharger", "GT3", "Carrera",
    "engine", "flat-six", "GTS", "PDK", "bhp", "torque"
]
hard_words = [
    "regenerative", "legislative", "embodied", "prototype", "comprehensive",
    "reconfigurable", "intercooler", "aerodynamic"
]

# Normalize tokens for comparison
normalized_tokens = [token.lower() for token in tokens]

# Label tokens
labeled_tokens = []
for token in tokens:
    token_lower = token.lower()
    if token_lower in [term.lower() for term in car_related_terms]:
        labeled_tokens.append((token, "Car-Related"))
    elif token_lower in [word.lower() for word in hard_words]:
        labeled_tokens.append((token, "Hard"))
    else:
        labeled_tokens.append((token, "Normal"))

# Output labeled tokens
for token, label in labeled_tokens[:50]:  # Display the first 50 for brevity
    print(f"{token}: {label}")

OVERVIEW: Normal
WHAT: Normal
IS: Normal
IT: Normal
?: Normal
It: Normal
’: Normal
s: Normal
the: Normal
same: Normal
,: Normal
but: Normal
different: Normal
.: Normal
It: Normal
’: Normal
s: Normal
the: Normal
Porsche: Car-Related
911: Car-Related
as: Normal
we: Normal
know: Normal
and: Normal
love: Normal
it: Normal
,: Normal
but: Normal
it: Normal
’: Normal
s: Normal
facelift: Normal
time: Normal
for: Normal
the: Normal
992: Normal
generation: Normal
and: Normal
more: Normal
has: Normal
changed: Normal
than: Normal
you: Normal
might: Normal
think: Normal
.: Normal
Heck: Normal
,: Normal
there: Normal
’: Normal


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
!pip install datasets
!pip install seqeval

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [6]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report
import torch
import nltk

# Step 1: Load training data from `word.csv`
train_path = "/content/drive/MyDrive/Colab Notebooks/word.csv"
train_df = pd.read_csv(train_path)

# Assuming `word.csv` has columns: `token`, `label`
assert 'token' in train_df.columns and 'label' in train_df.columns, "CSV must have 'token' and 'label' columns"

# Group tokens and labels by sentence if necessary
sentences = []
labels = []
sentence = []
label_list = []
for _, row in train_df.iterrows():
    if pd.isnull(row['token']):  # Sentence separator
        if sentence:
            sentences.append(sentence)
            labels.append(label_list)
            sentence = []
            label_list = []
    else:
        sentence.append(row['token'])
        label_list.append(row['label'])
if sentence:  # Add the last sentence
    sentences.append(sentence)
    labels.append(label_list)

# Map labels to IDs
unique_labels = sorted(set(label for label_seq in labels for label in label_seq))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

# Convert labels to IDs
labels = [[label2id[label] for label in label_seq] for label_seq in labels]

# Create dataset
train_data = Dataset.from_dict({"tokens": sentences, "labels": labels})

# Step 2: Load evaluation text from `911.txt`
eval_path = "/content/drive/MyDrive/Colab Notebooks/911.txt"
with open(eval_path, "r") as f:
    eval_text = f.read()

nltk.download('punkt')
eval_tokens = nltk.word_tokenize(eval_text)
eval_sentences = [eval_tokens]  # Treat the whole document as one sentence

# Step 3: Tokenizer and Model
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_id])
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_train_data = train_data.map(tokenize_and_align_labels, batched=True)
tokenized_eval_data = Dataset.from_dict({"tokens": eval_sentences}).map(
    lambda x: tokenizer(x["tokens"], truncation=True, is_split_into_words=True), batched=True
)

# Load pre-trained model
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label2id))
data_collator = DataCollatorForTokenClassification(tokenizer)

# Step 4: Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none"
)

# Trainer setup with eval_dataset included
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_train_data,  # Provide eval dataset here
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Step 5: Train the model
trainer.train()

# Step 6: Evaluate on `911.txt`
eval_output = trainer.predict(tokenized_eval_data)
predictions = eval_output.predictions.argmax(-1)

# Map predictions to labels
predicted_labels = [[id2label[label] for label in sentence if label != -100] for sentence in predictions]

# Print predictions for `911.txt`
print(f"Predicted labels for 911.txt: {predicted_labels}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.875495
2,No log,0.670386
3,No log,0.585646


Predicted labels for 911.txt: [['Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Hard', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'normal', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Hard', 'Hard', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'normal', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'normal', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'Car-Related', 'normal', 'Car-Related', 'Car-Related', 'Car

In [10]:
!pip install tensorflow-text

Collecting tensorflow-text
  Downloading tensorflow_text-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting tensorflow<2.19,>=2.18.0 (from tensorflow-text)
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow<2.19,>=2.18.0->tensorflow-text)
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tensorflow_text-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m78.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.3/615.3 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [7]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# Load the 911.txt content
input_path = "/content/drive/MyDrive/Colab Notebooks/Grancabrio.txt"
output_path = "911_ground_truth.csv"

with open(input_path, "r") as file:
    text = file.read()

# Tokenize the text
tokens = word_tokenize(text)

# Example: Assign ground truth labels (you can modify these)
# Use 'Car-Related' for car terms, 'Hard' for complex terms, 'Normal' for others
ground_truth_labels = []
for token in tokens:
    if token.lower() in ["porsche", "911", "turbo", "hybrid", "gts", "targa"]:
        ground_truth_labels.append("Car-Related")
    elif token.lower() in ["regenerative", "flywheel", "intercooler", "aerodynamic"]:
        ground_truth_labels.append("Hard")
    else:
        ground_truth_labels.append("Normal")

# Create DataFrame
ground_truth_df = pd.DataFrame({"token": tokens, "label": ground_truth_labels})

# Save as CSV
ground_truth_df.to_csv(output_path, index=False)

print(f"Ground truth saved to {output_path}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Ground truth saved to 911_ground_truth.csv


In [1]:
!pip install datasets seqeval transformers nltk

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from nltk.tokenize import word_tokenize
import torch
import nltk
from sklearn.metrics import classification_report, accuracy_score

# Download NLTK tokenizer
nltk.download('punkt')

# Step 1: Load and preprocess training data
# Assuming 'word.csv' has columns: 'token', 'label'
train_path = "/content/drive/MyDrive/Colab Notebooks/word.csv"
train_df = pd.read_csv(train_path)

assert 'token' in train_df.columns and 'label' in train_df.columns, "CSV must have 'token' and 'label' columns"

# Group tokens and labels by sentences
sentences = []
labels = []
sentence = []
label_list = []

for _, row in train_df.iterrows():
    if pd.isnull(row['token']):  # Sentence separator
        if sentence:
            sentences.append(sentence)
            labels.append(label_list)
            sentence = []
            label_list = []
    else:
        sentence.append(row['token'])
        label_list.append(row['label'])

if sentence:  # Add the last sentence
    sentences.append(sentence)
    labels.append(label_list)

# Map labels to numerical IDs
unique_labels = sorted(set(label for label_seq in labels for label in label_seq))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

# Convert labels to IDs
numerical_labels = [[label2id[label] for label in label_seq] for label_seq in labels]

# Create a dataset
train_data = Dataset.from_dict({"tokens": sentences, "labels": numerical_labels})

# Step 2: Tokenization
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # Ignore special tokens
            else:
                label_ids.append(label[word_id])
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_train_data = train_data.map(tokenize_and_align_labels, batched=True)

# Step 3: Fine-tune BERT
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label2id))

data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train the model
trainer.train()

# Step 4: Predict and evaluate
# Load the Review.csv file (first 150 reviews)
review_path = "Review.csv"
review_df = pd.read_csv(review_path)

# Extract the first 150 reviews
first_150_reviews = review_df["Review"].head(150).tolist()
input_text = "\n".join(first_150_reviews)

# Predict function
def predict_text(text, model, tokenizer):
    tokens = word_tokenize(text)
    inputs = tokenizer(tokens, return_tensors="pt", is_split_into_words=True, truncation=True)
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().tolist()

    # Map predictions to labels
    predicted_labels = [id2label[label] for label in predictions if label != -100]
    result = list(zip(tokens, predicted_labels))
    return result

# Get predictions for the first 150 reviews
result = predict_text(input_text, model, tokenizer)

# Print labeled tokens
for token, label in result:
    print(f"{token}: {label}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


I: Car-Related
love: Car-Related
this: Car-Related
car: Car-Related
.: Car-Related
Gas: Car-Related
mileage: Car-Related
,: Car-Related
suspension: Car-Related
,: Car-Related
and: Car-Related
stereo: normal
are: Car-Related
great: Car-Related
.: Car-Related
Bluetooth: Car-Related
integration: Car-Related
and: Car-Related
Pandora: Car-Related
integration: Car-Related
work: Car-Related
perfectly: Car-Related
.: Car-Related
Suspension: Car-Related
and: Car-Related
noise: Car-Related
cancellation: Car-Related
work: Car-Related
amazingly: Car-Related
well: Car-Related
.: Car-Related
Although: Car-Related
it: Car-Related
looks: normal
small: Hard
,: Car-Related
it: Car-Related
has: Car-Related
plenty: normal
of: Car-Related
room: Car-Related
and: Car-Related
has: Car-Related
always: Car-Related
been: Car-Related
big: Car-Related
enough: Car-Related
.: Car-Related
Gas: Car-Related
mileage: Car-Related
has: Car-Related
been: Car-Related
from: Car-Related
42: Car-Related
to: Car-Related
45: Car

In [8]:
# Save the fine-tuned model and tokenizer
output_dir = "./saved_model"

# Save the model
model.save_pretrained(output_dir)

# Save the tokenizer
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

Model and tokenizer saved to ./saved_model


In [5]:
import nltk
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

nltk.download('punkt')

# Load the fine-tuned model and tokenizer
model_checkpoint = "./saved_model"  # Path to your fine-tuned model directory
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load text from the uploaded file
file_path = "/content/drive/MyDrive/Colab Notebooks/911.txt"
output_path = "911_highlighted.txt"

with open(file_path, "r") as file:
    text = file.read()

# Tokenize the text
tokens = nltk.word_tokenize(text)

# Function to predict labels for tokens
def predict_tokens(tokens, model, tokenizer):
    inputs = tokenizer(tokens, return_tensors="pt", is_split_into_words=True, truncation=True)
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().tolist()

    return predictions

# Predict labels
model.eval()
predictions = predict_tokens(tokens, model, tokenizer)

# Map predictions back to labels
id2label = {0: "Normal", 1: "Car-Related", 2: "Hard"}  # Replace with your label mapping
labeled_tokens = [(token, id2label[pred]) for token, pred in zip(tokens, predictions)]

# Generate highlighted text for a TXT file
highlighted_text = ""
for token, label in labeled_tokens:
    if label == "Car-Related":
        highlighted_text += f"<<blue>>{token}<<end>> "
    elif label == "Hard":
        highlighted_text += f"<<yellow>>{token}<<end>> "
    else:
        highlighted_text += f"{token} "

# Save the highlighted text to a TXT file
with open(output_path, "w") as output_file:
    output_file.write(highlighted_text.strip())

print(f"Highlighted text saved to {output_path}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Highlighted text saved to 911_highlighted.txt


In [8]:
!pip install nltk
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# car-related 단어 추출
car_related_words = [token for token, label in labeled_tokens if label == "Car-Related"]

In [10]:
from nltk.corpus import wordnet
for word in car_related_words[:10]:  # 10개 단어만 처리
    synsets = wordnet.synsets(word)
    if synsets:
        definition = synsets[0].definition()  # 첫 번째 synset의 정의 사용
        print(f"{word}: {definition}")
    else:
        print(f"{word}: WordNet에서 정의를 찾을 수 없습니다.")

992: WordNet에서 정의를 찾을 수 없습니다.
now: the momentary present
A: a metric unit of length equal to one ten billionth of a meter (or 0.0001 micron); used to specify wavelengths of electromagnetic radiation
hybrid: a word that is composed of parts from different languages (e.g., `monolingual' has a Greek prefix and a Latin root)
if: WordNet에서 정의를 찾을 수 없습니다.
developed: make something new, such as a product or a mental or artistic creation
turbocharged: WordNet에서 정의를 찾을 수 없습니다.
flat-six: WordNet에서 정의를 찾을 수 없습니다.
of: WordNet에서 정의를 찾을 수 없습니다.
and: WordNet에서 정의를 찾을 수 없습니다.
