# Xử lý data

In [None]:
from pyspark.sql import SparkSession

# Create the Spark session used by every Spark cell in this notebook.
# getOrCreate() hands back the existing session if one is already running.
spark = SparkSession.builder.appName("Sentiment140 + Reddit").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/04 00:25:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
from pyspark.sql.functions import col, regexp_replace, lower

# Load Sentiment140: the CSV has no header row and is read as ISO-8859-1.
# Only the polarity column (_c0) and the tweet text column (_c5) are kept.
csv_path = "/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv"
df_sent = (
    spark.read.csv(csv_path, header=False, encoding="ISO-8859-1")
    .selectExpr("_c0 as raw_label", "_c5 as text")
    .filter(col("raw_label").isin(["0", "4"]))
    # Binary target: raw "4" -> 1 (positive), raw "0" -> 0 (negative).
    .withColumn("label", (col("raw_label") == "4").cast("integer"))
)

# Cleaning, applied in this order: lowercase, strip URLs, keep only a-z and
# whitespace, then collapse whitespace runs into a single space.
df_sent = (
    df_sent
    .withColumn("clean_text", lower(col("text")))
    .withColumn("clean_text", regexp_replace(col("clean_text"), r"http\S+", ""))
    .withColumn("clean_text", regexp_replace(col("clean_text"), r"[^a-z\s]", ""))
    .withColumn("clean_text", regexp_replace(col("clean_text"), r"\s+", " "))
)

In [None]:
# Load the Human Stress dataset.
# The `text` column is free-form prose, so the CSV is read with RFC-4180 style
# options: multiLine=True lets a quoted field span several physical lines and
# escape='"' treats a doubled quote ("") inside a quoted field as a literal quote.
# With Spark's defaults (backslash escape, one record per line) such rows are
# split or shifted and then silently removed by the label filter below.
# NOTE(review): compare df_human.count() with the raw row count of Stress.csv to
# confirm that no rows are lost any more.
human_path = "/kaggle/input/human-stress-prediction/Stress.csv"
df_human = spark.read.csv(
    human_path,
    header=True,
    multiLine=True,
    quote='"',
    escape='"',
)
df_human = df_human.selectExpr("text as text", "`label` as raw_label")
# Keep only rows whose label parsed to one of the two expected values.
df_human = df_human.filter(df_human["raw_label"].isin(["0", "1"]))
# NOTE(review): this keeps the dataset's own 0/1 coding (presumably 1 = stressed —
# confirm), while the Sentiment140 cell maps 1 to *positive* sentiment. Verify that
# both corpora use the same polarity before they are merged into one training set.
df_human = df_human.withColumn("label", col("raw_label").cast("integer"))

# Cleaning: lowercase, strip URLs, keep only a-z and whitespace, collapse whitespace.
df_human = df_human.withColumn("clean_text", lower(col("text")))
df_human = df_human.withColumn("clean_text", regexp_replace(col("clean_text"), r"http\S+", ""))
df_human = df_human.withColumn("clean_text", regexp_replace(col("clean_text"), r"[^a-z\s]", ""))
df_human = df_human.withColumn("clean_text", regexp_replace(col("clean_text"), r"\s+", " "))

# Drop rows that still contain a null in any column.
df_human = df_human.dropna()

In [None]:
# Stack both corpora, keeping only the two shared columns; unionByName aligns
# the columns by name rather than by position.
df_combined = df_sent.select("clean_text", "label").unionByName(df_human.select("clean_text", "label"))
# Total row count, then the class balance of the merged data.
print("Tổng số dòng:", df_combined.count())
df_combined.groupBy("label").count().show()

                                                                                                    

Tổng số dòng: 1602623




+-----+------+
|label| count|
+-----+------+
|    0|801250|
|    1|801373|
+-----+------+



                                                                                                    

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline

# Drop rows whose clean_text is null before feature extraction
df_combined = df_combined.filter(df_combined["clean_text"].isNotNull())

# Text-processing pipeline: tokenize -> remove stop words ->
# hashed term frequencies (10,000 buckets) -> IDF re-weighting.
tokenizer = Tokenizer(inputCol="clean_text", outputCol="tokens")
stop_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
hashing_tf = HashingTF(inputCol="filtered_tokens", outputCol="raw_features", numFeatures=10000)
idf = IDF(inputCol="raw_features", outputCol="features")

# NOTE(review): the pipeline (including the IDF statistics) is fitted on the whole
# dataset, i.e. before the train/test split done in a later cell, so test rows
# contribute to the IDF weights. Consider fitting on the training split only.
pipeline = Pipeline(stages=[tokenizer, stop_remover, hashing_tf, idf])
pipeline_model = pipeline.fit(df_combined)
df_tfidf = pipeline_model.transform(df_combined)

# Preview a few rows
df_tfidf.select("clean_text", "label", "features").show(3)

                                                                                                    

+--------------------+-----+--------------------+
|          clean_text|label|            features|
+--------------------+-----+--------------------+
|switchfoot awww t...|    0|(10000,[1528,2306...|
|is upset that he ...|    0|(10000,[399,1939,...|
|kenichan i dived ...|    0|(10000,[2708,3206...|
+--------------------+-----+--------------------+
only showing top 3 rows



In [None]:
# 80/20 train/test split with a fixed seed
train_df, test_df = df_tfidf.randomSplit([0.8, 0.2], seed=42)

print("Số dòng train:", train_df.count())
print("Số dòng test:", test_df.count())

# Convert the Spark data to pandas for the torch models trained further down.
# NOTE(review): this collects every row of df_combined onto the driver. The torch
# sections re-split sample_df on their own, so those models are not evaluated on
# the same test rows as the Spark models.
sample_df = df_combined.select("clean_text", "label").toPandas()
sample_df = sample_df.dropna()
sample_df.head(100)



Số dòng train: 1282141


                                                                                                    

Số dòng test: 320482


                                                                                                    

Unnamed: 0,clean_text,label
0,switchfoot awww thats a bummer you shoulda got...,0
1,is upset that he cant update his facebook by t...,0
2,kenichan i dived many times for the ball manag...,0
3,my whole body feels itchy and like its on fire,0
4,nationwideclass no its not behaving at all im ...,0
...,...,...
95,strider is a sick little puppy,0
96,so ryleegracewana go steves party or not sadly...,0
97,hey i actually won one of my bracket pools too...,0
98,stark you dont follow me either and i work for...,0


# Mô hình Logistic Regression

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic Regression on the TF-IDF features (at most 10 optimizer iterations)
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

# Fit on the training split
lr_model = lr.fit(train_df)

# Predict on the test split
lr_preds = lr_model.transform(test_df)

# Show a few predictions with their class probabilities
lr_preds.select("clean_text", "label", "prediction", "probability").show(5, truncate=False)

[Stage 30:>                                                                             (0 + 1) / 1]

+--------------------------------------------------------------------------------------------+-----+----------+-----------------------------------------+
|clean_text                                                                                  |label|prediction|probability                              |
+--------------------------------------------------------------------------------------------+-----+----------+-----------------------------------------+
| a baby fell flat on his face and started bawling because of me forgot that he is wobbley   |0    |0.0       |[0.8921870665051995,0.10781293349480048] |
| a car parked six inches away from my two door car i think i nearly broke my leg getting in |0    |0.0       |[0.9588027704677475,0.041197229532252466]|
| a date night without my babysigh i miss you already                                        |0    |0.0       |[0.8825980759852103,0.11740192401478966] |
| a free day from work and its rainy yay                                    

                                                                                                    

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# ROC-AUC evaluator reading the "rawPrediction" column; the same `evaluator`
# object is reused for the SVM in a later cell.
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
lr_auc = evaluator.evaluate(lr_preds)

print(f"Logistic Regression ROC AUC: {lr_auc:.4f}")

                                                                                                    

Logistic Regression ROC AUC: 0.8269


# SVM

In [None]:
from pyspark.ml.classification import LinearSVC

# Create and fit a linear SVM (at most 10 iterations, regParam=0.1)
svm = LinearSVC(featuresCol="features", labelCol="label", maxIter=10, regParam=0.1)
svm_model = svm.fit(train_df)

# Predict on the test split and score with the ROC-AUC `evaluator` defined earlier
svm_preds = svm_model.transform(test_df)
svm_auc = evaluator.evaluate(svm_preds)

print(f"SVM (LinearSVC) ROC AUC: {svm_auc:.4f}")

                                                                                                    

SVM (LinearSVC) ROC AUC: 0.8257


In [None]:
# Side-by-side summary of the two Spark models' ROC-AUC scores.
summary_lines = [
    "Kết quả tổng hợp:",
    f"Logistic Regression AUC: {lr_auc:.4f}",
    f"SVM (LinearSVC)     AUC: {svm_auc:.4f}",
]
print("\n".join(summary_lines))

Kết quả tổng hợp:
Logistic Regression AUC: 0.8269
SVM (LinearSVC)     AUC: 0.8257


# biLSTM

In [None]:
import re
from collections import Counter

# Tokenize
def tokenize(text):
    """Lowercase ``text`` and return its word tokens in order of appearance.

    A token is a maximal run of ``\\w`` characters delimited by word
    boundaries, so punctuation is dropped and "it's" yields "it" and "s".
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

# Build the vocabulary: ids 0 and 1 are reserved for padding and unknown words,
# the 10,000 most frequent tokens get ids 2, 3, ... in descending frequency order.
all_tokens = [tok for tokens in map(tokenize, sample_df["clean_text"]) for tok in tokens]
top_words = [word for word, _ in Counter(all_tokens).most_common(10000)]
vocab = dict(zip(["<PAD>", "<UNK>"] + top_words, range(len(top_words) + 2)))

def encode(text):
    """Map ``text`` to a list of vocabulary ids.

    The text is split with ``tokenize``; tokens missing from ``vocab`` are
    replaced by the id of ``"<UNK>"``.
    """
    unk_id = vocab["<UNK>"]
    return [vocab.get(token, unk_id) for token in tokenize(text)]

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

class BiLSTMDataset(Dataset):
    """Dataset of (token-id tensor, label) pairs for the BiLSTM classifier.

    Every text is encoded once, at construction time, with the module-level
    ``encode`` function; sequences keep their own length (padding is done
    later, per batch).
    """

    def __init__(self, texts, labels):
        encoded = []
        for text in texts:
            encoded.append(torch.tensor(encode(text), dtype=torch.long))
        self.inputs = encoded
        self.labels = torch.tensor(labels)

    def __len__(self):
        """Number of examples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair stored at position ``idx``."""
        sample = (self.inputs[idx], self.labels[idx])
        return sample

def collate_fn(batch):
    """Collate ``(sequence, label)`` pairs into a padded batch.

    Sequences are right-padded with 0 up to the longest sequence in the batch
    (batch-first layout); labels are gathered into a single 1-D tensor.

    Returns:
        tuple: ``(padded_sequences, labels)``.
    """
    token_seqs = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    padded = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    return padded, torch.tensor(targets)

# Train/validation split (80/20, fixed random_state)
texts_train, texts_val, labels_train, labels_val = train_test_split(
    sample_df["clean_text"].tolist(), sample_df["label"].tolist(), test_size=0.2, random_state=42)

# NOTE(review): the names train_ds / val_ds are rebound by the BERT section later
# in the notebook; the loaders below keep their own references to these objects.
train_ds = BiLSTMDataset(texts_train, labels_train)
val_ds = BiLSTMDataset(texts_val, labels_val)

# Only the training loader shuffles; both pad per batch through collate_fn.
train_loader_bilstm = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader_bilstm = DataLoader(val_ds, batch_size=32, collate_fn=collate_fn)

In [None]:
import torch.nn as nn

class BiLSTMClassifier(nn.Module):
    """Single-layer bidirectional LSTM text classifier with two output classes.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM, per direction.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128):
        super().__init__()
        # Id 0 is the padding id; its embedding stays at zero.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, 2)

    def forward(self, x):
        """Return class logits of shape ``(batch, 2)`` for a padded id batch.

        ``x`` is a ``(batch, seq_len)`` tensor of token ids, right-padded with
        the padding id (0). The sequences are packed before the LSTM so that
        the final hidden states are taken at each sequence's true end. Without
        packing, the forward direction keeps updating its state over the
        padding and the backward direction starts from it, which makes the
        prediction depend on how long the other sequences in the batch are.
        """
        # True length = number of non-padding ids. Clamp to 1 because packing
        # rejects zero-length sequences (a text can be empty after cleaning).
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1).cpu()
        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hn, _) = self.lstm(packed)
        # hn[-2] / hn[-1] are the last layer's forward / backward final states.
        out = torch.cat((hn[-2], hn[-1]), dim=1)
        return self.fc(out)

In [None]:
import torch
# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model, loss and optimizer for the BiLSTM classifier.
model_bilstm = BiLSTMClassifier(len(vocab)).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_bilstm.parameters(), lr=1e-3)

# Train for 2 epochs.
for epoch in range(2):
    model_bilstm.train()
    # Running sum of the per-batch loss values (not an average).
    total_loss = 0
    for x_batch, y_batch in train_loader_bilstm:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model_bilstm(x_batch)
        loss = loss_fn(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"BiLSTM Epoch {epoch+1} - Loss: {total_loss:.4f}")

# Evaluation: accuracy over the validation loader, gradients disabled.
model_bilstm.eval()
correct = 0
total = 0
with torch.no_grad():
    for x_batch, y_batch in val_loader_bilstm:
        x_batch = x_batch.to(device)
        logits = model_bilstm(x_batch)
        # Predictions are moved back to the CPU, where the labels live.
        preds = logits.argmax(dim=1).cpu()
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)

print(f"BiLSTM Accuracy: {correct/total:.4f}")

BiLSTM Epoch 1 - Loss: 16736.2691
BiLSTM Epoch 2 - Loss: 15046.9893
BiLSTM Accuracy: 0.8252


# BERT

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Local copy of bert-base-uncased; local_files_only=True is passed to both loaders.
model_path = "/kaggle/input/bert-base-uncased/bert-base-uncased"

# NOTE(review): `tokenizer` was bound to the Spark ML Tokenizer stage earlier in
# the notebook; from this cell on the name refers to the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained(model_path, local_files_only=True)
# Classification model with 2 output labels.
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2, local_files_only=True)

2025-06-04 00:38:13.608997: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748997493.870810      80 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748997493.942765      80 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/bert-base-uncased/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the whole dataset up front (truncated to at most 128 tokens)
# NOTE(review): this builds the encodings for every row of sample_df in memory at
# once; tokenizing per batch would lower peak memory.
inputs = tokenizer(
    list(sample_df["clean_text"]),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt"
)

# Labels as a tensor, in the same row order as `inputs`.
labels = torch.tensor(sample_df["label"].values)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, random_split

# Bundle ids, attention masks and labels so they are split and batched together.
dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"], labels)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split so the same rows land in the validation set on every run;
# without a generator the reported validation accuracy is not reproducible.
split_generator = torch.Generator().manual_seed(42)
train_ds, val_ds = random_split(dataset, [train_size, val_size], generator=split_generator)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=32)

In [None]:
from torch.optim import AdamW
from tqdm import tqdm
import torch

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(2):  # fine-tune for 2 epochs
    # Running sum of the per-batch loss values (not an average).
    total_loss = 0
    model.train()

    for batch in tqdm(train_loader):
        # Batch-local name: the notebook-level `labels` tensor (all labels) must
        # not be overwritten here, otherwise re-running the dataset cell after
        # training would pair the full inputs with a single batch of labels.
        input_ids, attn_mask, batch_labels = [x.to(device) for x in batch]
        batch_labels = batch_labels.long()

        # Same gradient-reset call as the BiLSTM training loop.
        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attn_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} - Loss: {total_loss:.4f}")

100%|██████████| 40066/40066 [4:19:35<00:00,  2.57it/s]  


Epoch 1 - Loss: 14071.0535


100%|██████████| 40066/40066 [4:19:19<00:00,  2.58it/s]  

Epoch 2 - Loss: 11802.6419





In [None]:
# Evaluate on the validation split
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for batch in val_loader:
        # Batch-local name so the notebook-level `labels` tensor (all labels)
        # is not replaced by the last validation batch.
        input_ids, attn_mask, batch_labels = [x.to(model.device) for x in batch]
        outputs = model(input_ids=input_ids, attention_mask=attn_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        correct += (preds == batch_labels).sum().item()
        total += batch_labels.size(0)

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.8550


In [None]:
# Save the fine-tuned model and its tokenizer into the same directory so the
# pair can be reloaded together with from_pretrained().
model.save_pretrained("/kaggle/working/bert-sentiment-finetuned")
tokenizer.save_pretrained("/kaggle/working/bert-sentiment-finetuned")

('/kaggle/working/bert-sentiment-finetuned/tokenizer_config.json',
 '/kaggle/working/bert-sentiment-finetuned/special_tokens_map.json',
 '/kaggle/working/bert-sentiment-finetuned/vocab.txt',
 '/kaggle/working/bert-sentiment-finetuned/added_tokens.json')

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the fine-tuned tokenizer and model from disk, move the model to
# `device` and switch it to evaluation mode.
model_path = "/kaggle/working/bert-sentiment-finetuned"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).to(device)
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Ứng dụng
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F
import re

# Reload the fine-tuned model
# NOTE(review): these four lines repeat the previous cell; `device` must already
# be defined by an earlier cell.
model_path = "/kaggle/working/bert-sentiment-finetuned"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).to(device)
model.eval()

# Split a text into sentences
def split_sentences(text):
    """Split ``text`` into sentence fragments.

    Newline runs are turned into a single space, the text is cut at every
    ``.``, ``!`` or ``?`` and each piece is stripped. Pieces of three
    characters or fewer are discarded.

    Returns:
        list[str]: the remaining fragments, in order, without the terminator.
    """
    flattened = re.sub(r"\n+", " ", text.strip())
    sentences = []
    for fragment in re.split(r'[.!?]', flattened):
        fragment = fragment.strip()
        if len(fragment) > 3:
            sentences.append(fragment)
    return sentences

# Sentence-by-sentence sentiment analysis of a document
def analyze_document(text):
    """Classify every sentence of ``text`` and summarise the negative share.

    The text is split with ``split_sentences``; each sentence is run through
    the module-level ``tokenizer`` / ``model`` and the prediction (class 0 is
    reported as Negative, anything else as Positive) is printed together with
    its softmax confidence.

    Returns:
        str: a fixed message when no sentence survives the split, otherwise a
        summary with the counts, the negative ratio and a verdict line chosen
        by the thresholds 0.7 and 0.5.
    """
    sentences = split_sentences(text)
    if len(sentences) == 0:
        return "Không có câu hợp lệ để phân tích."

    negative_count = 0
    for sent in sentences:
        inputs = tokenizer(sent, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            probs = F.softmax(outputs.logits, dim=1)
            pred = torch.argmax(probs, dim=1).item()

        is_negative = pred == 0
        label_name = 'Negative' if is_negative else 'Positive'

        # Print the prediction for this sentence
        print(f"{sent}")
        print(f"   → Dự đoán: {label_name} (Confidence: {probs[0][pred]:.2f})")

        if is_negative:
            negative_count += 1

    total = len(sentences)
    positive_count = total - negative_count
    ratio = negative_count / total
    summary = (
        f"\nPhân tích {total} câu:\n"
        f"- Negative: {negative_count}\n"
        f"- Positive: {positive_count}\n"
        f"→ Tỉ lệ tiêu cực: {ratio:.2%}\n"
    )

    # Verdict by share of negative sentences.
    if ratio >= 0.7:
        verdict = "⚠️ Nguy cơ trầm cảm cao"
    elif ratio >= 0.5:
        verdict = "Biểu hiện tiêu cực thường xuyên"
    else:
        verdict = "Cảm xúc ổn định"

    return summary + verdict

# Try the analysis on a sample input text and print the returned summary
test_text = """
I didn’t get out of bed until almost 2 PM. Not because I was tired, but because I couldn’t find a reason to move. I just laid there, staring at the wall, my chest so heavy it felt like breathing was a task. I didn’t eat anything. I didn’t talk to anyone. My phone buzzed a few times, but I couldn’t bring myself to care. Everything feels so distant—like I’m watching life through glass. I feel useless. Like I’m a burden to everyone around me. Sometimes I wonder if they’d be better off if I just disappeared. No one really sees me. They see the smile I force, the “I’m fine” I repeat, but not the emptiness that’s swallowing me inside. I don’t even remember what happiness feels like. I’m not living. I’m just… existing.
"""
print(analyze_document(test_text))

I didn’t get out of bed until almost 2 PM
   → Dự đoán: Negative (Confidence: 0.96)
Not because I was tired, but because I couldn’t find a reason to move
   → Dự đoán: Negative (Confidence: 0.97)
I just laid there, staring at the wall, my chest so heavy it felt like breathing was a task
   → Dự đoán: Negative (Confidence: 0.97)
I didn’t eat anything
   → Dự đoán: Negative (Confidence: 0.92)
I didn’t talk to anyone
   → Dự đoán: Negative (Confidence: 0.98)
My phone buzzed a few times, but I couldn’t bring myself to care
   → Dự đoán: Negative (Confidence: 0.91)
Everything feels so distant—like I’m watching life through glass
   → Dự đoán: Negative (Confidence: 0.95)
I feel useless
   → Dự đoán: Negative (Confidence: 0.99)
Like I’m a burden to everyone around me
   → Dự đoán: Negative (Confidence: 0.94)
Sometimes I wonder if they’d be better off if I just disappeared
   → Dự đoán: Negative (Confidence: 0.82)
No one really sees me
   → Dự đoán: Negative (Confidence: 0.95)
They see the smi

# OCR ảnh chữ viết tay (TrOCR trên IAM) và phân tích cảm xúc

In [None]:
# Thư viện
from pathlib import Path
import pandas as pd
from PIL import Image
from tqdm import tqdm

from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from transformers import BertTokenizer, BertForSequenceClassification
import torch.nn.functional as F
import torch

2025-06-04 17:04:29.114788: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749056669.294959      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749056669.344624      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Load the IAM samples listed in gt_test.txt
iam_root = Path("/kaggle/input/iam-trocr/IAM")
image_dir = iam_root / "image"
gt_path = iam_root / "gt_test.txt"

lines = gt_path.read_text(encoding="utf-8").splitlines()
# Each usable line is "<image name><whitespace><transcription>"; lines that do
# not split into exactly these two parts are skipped.
split_lines = (line.strip().split(maxsplit=1) for line in lines)
data = [(str(image_dir / parts[0]), parts[1]) for parts in split_lines if len(parts) == 2]
# Build the IAM DataFrame
df_iam = pd.DataFrame(data, columns=["image_path", "gt_text"])

# Working copy that the OCR results are written into
df_iam_small = df_iam.copy()
df_iam_small.head()

Unnamed: 0,image_path,gt_text
0,/kaggle/input/iam-trocr/IAM/image/c04-110-00.jpg,Become a success with a disc and hey presto ! ...
1,/kaggle/input/iam-trocr/IAM/image/c04-110-01.jpg,"assuredness "" Bella Bella Marie "" ( Parlophone..."
2,/kaggle/input/iam-trocr/IAM/image/c04-110-02.jpg,I don't think he will storm the charts with th...
3,/kaggle/input/iam-trocr/IAM/image/c04-110-03.jpg,"CHRIS CHARLES , 39 , who lives in Stockton-on-..."
4,/kaggle/input/iam-trocr/IAM/image/c04-116-00.jpg,He is also a director of a couple of garages ....


In [None]:
# Load the TrOCR model from the local Kaggle input directory
trocr_path = "/kaggle/input/trocr-base-handwritten/trocr-base-handwritten"

from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# The processor prepares images and decodes generated ids; the model is moved to
# the GPU when one is available.
processor = TrOCRProcessor.from_pretrained(trocr_path)
ocr_model = VisionEncoderDecoderModel.from_pretrained(trocr_path).to("cuda" if torch.cuda.is_available() else "cpu")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transfor

In [None]:
# OCR helper
def ocr_image(img_path):
    """Run TrOCR on one image file and return the recognised text.

    The image is converted to RGB and resized to 512x128 before it is handed
    to the module-level ``processor``; the ids generated by ``ocr_model`` are
    decoded without special tokens.

    Returns:
        str: the decoded text of the (single) image.
    """
    rgb_image = Image.open(img_path).convert("RGB")
    resized = rgb_image.resize((512, 128))
    model_inputs = processor(images=resized, return_tensors="pt").to(ocr_model.device)
    generated_ids = ocr_model.generate(**model_inputs)
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

In [None]:
# Run OCR on every image, one at a time, with a progress bar
print("Đang chạy OCR ảnh IAM...")
df_iam_small["ocr_text"] = [ocr_image(p) for p in tqdm(df_iam_small["image_path"])]

Đang chạy OCR ảnh IAM...


100%|██████████| 2915/2915 [09:37<00:00,  5.05it/s]


In [None]:
from difflib import SequenceMatcher

def char_accuracy(pred, truth):
    """Character-level similarity between ``pred`` and ``truth``.

    Returns ``difflib.SequenceMatcher(...).ratio()`` for the two strings: a
    float in [0, 1], where 1.0 means the strings are identical.
    """
    matcher = SequenceMatcher(None, pred, truth)
    return matcher.ratio()

def word_accuracy(pred, truth):
    """Word-level similarity between ``pred`` and ``truth``.

    Both strings are split on whitespace and the two word lists are compared
    with ``difflib.SequenceMatcher(...).ratio()`` (float in [0, 1]).
    """
    pred_tokens, truth_tokens = pred.split(), truth.split()
    return SequenceMatcher(None, pred_tokens, truth_tokens).ratio()

In [None]:
# Score every row: OCR output against its ground-truth transcription
ocr_gt_pairs = list(zip(df_iam_small["ocr_text"], df_iam_small["gt_text"]))
df_iam_small["char_acc"] = [char_accuracy(ocr, gt) for ocr, gt in ocr_gt_pairs]
df_iam_small["word_acc"] = [word_accuracy(ocr, gt) for ocr, gt in ocr_gt_pairs]

# Mean of the per-row scores
avg_char_acc = df_iam_small["char_acc"].mean()
avg_word_acc = df_iam_small["word_acc"].mean()

print(f"Độ chính xác trung bình (char-level): {avg_char_acc:.4f}")
print(f"Độ chính xác trung bình (word-level): {avg_word_acc:.4f}")

Độ chính xác trung bình (char-level): 0.9707
Độ chính xác trung bình (word-level): 0.9170


In [None]:
def predict_sentiment(text):
    """Classify one text with the module-level ``tokenizer`` and ``model``.

    The input is truncated to at most 64 tokens.

    Returns:
        str: ``"Positive"`` when the predicted class is 1, else ``"Negative"``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64).to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
        class_probs = F.softmax(logits, dim=1)
        predicted_class = torch.argmax(class_probs, dim=1).item()
    if predicted_class == 1:
        return "Positive"
    return "Negative"

# Predict the sentiment of every OCR'd line (one model call per row)
print("Đang phân tích cảm xúc từ văn bản OCR...")
df_iam_small["bert_emotion"] = df_iam_small["ocr_text"].apply(predict_sentiment)

# Show the results
df_iam_small[["image_path", "ocr_text", "bert_emotion"]].head(100)

Đang phân tích cảm xúc từ văn bản OCR...


Unnamed: 0,image_path,ocr_text,bert_emotion
0,/kaggle/input/iam-trocr/IAM/image/c04-110-00.jpg,Become a success with a disc and her presto ! ...,Positive
1,/kaggle/input/iam-trocr/IAM/image/c04-110-01.jpg,"assuredness "" Bella Bella Marie "" ( Parlophone...",Positive
2,/kaggle/input/iam-trocr/IAM/image/c04-110-02.jpg,I don't think he will storm the charts with th...,Positive
3,/kaggle/input/iam-trocr/IAM/image/c04-110-03.jpg,"CHRIS CHARLES , 39 , who lives in Stockton - o...",Negative
4,/kaggle/input/iam-trocr/IAM/image/c04-116-00.jpg,He is also a director of a couple of garages ....,Positive
...,...,...,...
95,/kaggle/input/iam-trocr/IAM/image/d01-080-07.jpg,impossible to say .,Negative
96,/kaggle/input/iam-trocr/IAM/image/d01-085-00.jpg,Professor E. A. Turner is inclined to take vie...,Positive
97,/kaggle/input/iam-trocr/IAM/image/d01-085-01.jpg,the original of the Gospel would be unmerced ....,Negative
98,/kaggle/input/iam-trocr/IAM/image/d01-085-02.jpg,"original of the Gospel , whether written on a ...",Positive


# Thử nghiệm mô hình với bộ dữ liệu văn bản khác (Reddit stress/anxiety)

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm

# Read the Reddit stress/anxiety posts; keep the text and label columns and
# drop rows with a missing value in either.
df_reddit = pd.read_csv("/kaggle/input/stress-and-anxiety-posts-on-reddit/stressed_anxious_cleaned.csv")
df_reddit = df_reddit[["Text", "is_stressed/anxious"]].dropna()

# Load the fine-tuned model and tokenizer, move the model to the available
# device and switch it to evaluation mode.
model_path = "/kaggle/working/bert-sentiment-finetuned"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# Batched prediction
def predict_batch(texts):
    """Predict a class id for every text in ``texts``.

    The texts are tokenized together (padded to the longest one, truncated to
    at most 64 tokens), moved to ``device`` and passed through the
    module-level ``model`` without gradient tracking.

    Returns:
        numpy.ndarray: the arg-max class id per input text.
    """
    encoded = tokenizer(texts, padding=True, truncation=True, max_length=64, return_tensors="pt")
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        logits = model(**encoded).logits
        class_probs = F.softmax(logits, dim=1)
        predicted = torch.argmax(class_probs, dim=1)
    return predicted.cpu().numpy()

# Run the model over the posts in slices of `batch_size` rows.
batch_size = 32
preds = []
for i in tqdm(range(0, len(df_reddit), batch_size)):
    batch_texts = df_reddit["Text"].iloc[i:i+batch_size].tolist()
    batch_preds = predict_batch(batch_texts)
    preds.extend(batch_preds)

# Assign the reference label
# NOTE(review): the "is_stressed/anxious" column selected above is never used;
# every row is hard-coded to 1. Confirm that the file really contains only
# positive rows, otherwise the figures below are wrong.
df_reddit["label"] = 1  # because every row is assumed to be "stressed/anxious"

# Predictions from the already-loaded model
df_reddit["pred"] = preds  # from the model

# Count the rows predicted correctly (i.e. predicted as 1)
correct = (df_reddit["pred"] == df_reddit["label"]).sum()
total = len(df_reddit)

# Accuracy; with a constant label of 1 this is the share of rows predicted as class 1.
acc = correct / total
print(f"\nAccuracy (dự đoán đúng các dòng 1): {acc:.4f}")
print(f"→ Dự đoán đúng {correct}/{total} dòng là có stress/anxiety")

100%|██████████| 118/118 [00:15<00:00,  7.45it/s]


Accuracy (dự đoán đúng các dòng 1): 0.7894
→ Dự đoán đúng 2972/3765 dòng là có stress/anxiety





# DEMO

In [None]:
from huggingface_hub import HfApi

# Create the public model repository on the Hugging Face Hub.
# exist_ok=True keeps the cell re-runnable: without it the call raises once the
# repository exists, which breaks "Restart & Run All" after the first run.
api = HfApi()
api.create_repo(repo_id="iiamntth/bert-sentiment-finetuned", private=False, exist_ok=True)

RepoUrl('https://huggingface.co/iiamntth/bert-sentiment-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='iiamntth/bert-sentiment-finetuned')

In [None]:
from huggingface_hub import upload_folder

# Upload the saved model directory (model + tokenizer files) to the Hub repository.
upload_folder(
    folder_path="/kaggle/working/bert-sentiment-finetuned",
    repo_id="iiamntth/bert-sentiment-finetuned"
)

Uploading...:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/iiamntth/bert-sentiment-finetuned/commit/0662131d926fc45378dd0f601d19e4da60c0ea93', commit_message='Upload folder using huggingface_hub', commit_description='', oid='0662131d926fc45378dd0f601d19e4da60c0ea93', pr_url=None, repo_url=RepoUrl('https://huggingface.co/iiamntth/bert-sentiment-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='iiamntth/bert-sentiment-finetuned'), pr_revision=None, pr_num=None)

# Nén mô hình để tải về

In [None]:
import shutil

# Zip the saved model directory; the archive is written next to it as
# /kaggle/working/bert-sentiment-finetuned.zip.
shutil.make_archive("/kaggle/working/bert-sentiment-finetuned", 'zip', "/kaggle/working/bert-sentiment-finetuned")

'/kaggle/working/bert-sentiment-finetuned.zip'