In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and echo its full path.
# Directories without files contribute nothing, so nothing is printed for them.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/simple-business-2/label_dict.pkl
/kaggle/input/simple-business-2/model_directory/model_directory/config.json
/kaggle/input/simple-business-2/model_directory/model_directory/model.safetensors
/kaggle/input/simple-business/updated_metadata.csv
/kaggle/input/simple-business/decisions_half/decisions_half/DRN-4688502.pdf
/kaggle/input/simple-business/decisions_half/decisions_half/DRN-4651614.pdf
/kaggle/input/simple-business/decisions_half/decisions_half/DRN-4660693.pdf
/kaggle/input/simple-business/decisions_half/decisions_half/DRN-4690458.pdf
/kaggle/input/simple-business/decisions_half/decisions_half/DRN-4800420.pdf
/kaggle/input/simple-business/decisions_half/decisions_half/DRN-4638844.pdf
/kaggle/input/simple-business/decisions_half/decisions_half/DRN-4536150.pdf
/kaggle/input/simple-business/decisions_half/decisions_half/DRN-4597561.pdf
/kaggle/input/simple-business/decisions_half/decisions_half/DRN-4417747.pdf
/kaggle/input/simple-business/decisions_half/decisions_half/

In [2]:
!pip install pdfplumber
!pip install torch
!pip install transformers
!pip install PyMuPDF

Collecting pdfplumber
  Downloading pdfplumber-0.11.3-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.3-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading

In [3]:
import pdfplumber
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pickle
import re
import fitz  # PyMuPDF

In [4]:
# Locations of the metadata table and the decision PDFs inside the Kaggle datasets.
# Note: despite its name, `excel_path` points at a CSV file.
excel_path = '/kaggle/input/simple-business/updated_metadata.csv'
pdf_folder = '/kaggle/input/simple-business/decisions_half/decisions_half'

# Load the pickled label dictionary; it is indexed by predicted class id further below.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on trusted files.
with open('/kaggle/input/simple-business-2/label_dict.pkl', 'rb') as label_file:
    label_dict = pickle.load(label_file)

# Read the metadata and discard rows in which every column is empty.
metadata = pd.read_csv(excel_path).dropna(how='all')

# Index labels of the rows whose 'field' value is missing.
is_unlabeled = metadata['field'].isna()
unlabeled_indices = metadata.loc[is_unlabeled].index

def clean_text(text):
    """Remove ``<...>`` tag-like spans from ``text`` and lower-case the result.

    Args:
        text: The raw string to normalise.

    Returns:
        ``text`` with every ``<`` ... ``>`` span (at least one character that is
        not ``>`` between the brackets) removed, converted to lower case.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    without_tags = tag_pattern.sub('', text)
    return without_tags.lower()

def extract_what_happened(pdf_path):
    """Extract the "The complaint" and "What happened" sections from a PDF.

    The text of all pages is read with PyMuPDF (``fitz``) and concatenated.
    Two sections are then located with regular expressions:

    * the text following the first "The complaint" heading, up to (but not
      including) the next "What happened" heading, or to the end of the text
      if that heading never appears;
    * the text following the first "What happened" heading, up to the first of
      the headings "What I’ve decided – and why", "What I provisionally said"
      or "What I provisionally decided – and why".

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The stripped sections that were found, joined by a single space. An
        empty string is returned when neither section could be located
        (previously a lone ``" "`` was returned, which is truthy and slipped
        through callers' emptiness checks).
    """
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with repeated +=.
        text = "".join(page.get_text() for page in doc)

    # Tempered pattern: consume characters as long as the next position does
    # not start with "What happened"; the trailing '.' takes one final
    # non-newline character.
    complaint_pattern = r'The complaint\s*((?:[\s\S](?!What happened))*.)'
    complaint_match = re.search(complaint_pattern, text)

    # Lazy match that stops right before the first closing heading.
    happened_pattern = r'What happened\s*([\s\S]*?)(?=What I’ve decided – and why|What I provisionally said|What I provisionally decided – and why)'
    happened_match = re.search(happened_pattern, text)

    complaint_text = complaint_match.group(1).strip() if complaint_match else ""
    happened_text = happened_match.group(1).strip() if happened_match else ""

    # Only join sections that actually contain text so that "nothing found"
    # yields "" rather than a single space.
    return " ".join(section for section in (complaint_text, happened_text) if section)

def process_pdf_file(pdf_path):
    """Extract and clean the model input text for one PDF.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The cleaned text produced by ``clean_text`` from the output of
        ``extract_what_happened``, or ``None`` when that text is empty or
        consists only of whitespace.
    """
    extracted_text = extract_what_happened(pdf_path)
    if not extracted_text:
        return None
    cleaned_text = clean_text(extracted_text)
    # A plain truthiness test is not enough: the extraction can yield a lone
    # space when no section matched, and cleaning can remove all content.
    if not cleaned_text.strip():
        return None
    return cleaned_text

# Extract the model input text for every metadata row whose PDF is available.
# `data_indices` records the metadata index label of each entry in `data`, so
# predictions can later be written back to exactly the row they were made for.
data = []
data_indices = []
for index, row in metadata.iterrows():
    file_name = f"{row['decision_id']}.pdf"
    field = row['field']
    pdf_path = os.path.join(pdf_folder, file_name)
    if os.path.exists(pdf_path):
        text = process_pdf_file(pdf_path)
        # Skip missing as well as whitespace-only extractions.
        if text and text.strip():
            data.append((text, field))
            data_indices.append(index)

# Get class number
num_labels = metadata['field'].nunique()
print(f"num_labels：{num_labels}")

# Extract texts for model prediction (same order as `data_indices`)
texts = [text for text, _ in data]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_directory_path = '/kaggle/input/simple-business-2/model_directory/model_directory'
model = BertForSequenceClassification.from_pretrained(model_directory_path, num_labels=num_labels)
model.to(device)
model.eval()

# Batch data
batch_size = 16
predicted_labels = []
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs['input_ids'].to(device)
    attention_masks = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
    batch_predictions = torch.argmax(outputs.logits, dim=1)
    # Translate predicted class ids through label_dict
    # (presumably class id -> field name; verify against the training notebook).
    predicted_labels.extend(label_dict[class_id] for class_id in batch_predictions.tolist())

# Map the prediction back to metadata.
# Predictions exist for every row with usable PDF text (labeled or not), so
# they must be aligned by metadata index. Assigning the first
# len(unlabeled_indices) predictions positionally would attach labels to the
# wrong rows, or fail outright when fewer predictions than unlabeled rows exist.
predictions = pd.Series(predicted_labels, index=data_indices, dtype=object)
fillable_indices = unlabeled_indices.intersection(predictions.index)
metadata.loc[fillable_indices, 'field'] = predictions.loc[fillable_indices]

# Unlabeled rows without a PDF or without extractable text cannot be predicted.
unfilled_count = len(unlabeled_indices) - len(fillable_indices)
if unfilled_count:
    print(f"{unfilled_count} unlabeled rows had no usable PDF text and remain unlabeled")

metadata.to_csv('complete_metadata.csv', index=False)


num_labels：10


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]