In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import xml.etree.ElementTree as ET
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:


data = pd.read_csv("../data/training_data.csv")

# Function to parse XML annotations
def extract_annotations_from_xml(xml_data):
    root = ET.fromstring(xml_data)
    tokens, labels = [], []
    for annotation in root.findall('.//annotation'):
        token = annotation.find(".//text").text
        label = annotation.find(".//infon[@key='type']").text
        if token and label:
            tokens.append(token)
            labels.append(label)
    return tokens, labels

# Apply the XML parsing function to annotated data
data['tokens_labels'] = data['annotated'].apply(extract_annotations_from_xml)

# print(data['tokens_labels'][0])

# Create negative samples from the unannotated column
def create_negative_samples(unannotated_text):
    tokens = unannotated_text.split()  # Split text into tokens
    labels = ["O"] * len(tokens)  # Assign "O" (no entity) to all tokens
    return tokens, labels

data['negative_samples'] = data['unannotated'].apply(create_negative_samples)

# print(data['negative_samples'][0])

# Combine positive and negative samples
all_samples = data['tokens_labels'].tolist() + data['negative_samples'].tolist()

#print(all_samples[0:10])
train_data, val_data = train_test_split(all_samples, test_size=0.2, random_state=42)

# Convert to BIO format
def create_bio_format(data):
    sentences = []
    for tokens_labels in data:
        tokens, labels = tokens_labels
        sentences.append({"tokens": tokens, "labels": labels})
    return sentences

train_bio_data = create_bio_format(train_data)
val_bio_data = create_bio_format(val_data)
# print(train_bio_data[0])

# Tokenization and alignment function
def tokenize_and_align_labels(batch, tokenizer, label_to_id):
    tokenized_inputs = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True, padding="max_length", max_length=128)
    labels = []
    for i, label in enumerate(batch["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = [-100 if word_id is None else label_to_id[label[word_id]] for word_id in word_ids]
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenizer and label mapping
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
unique_labels = set(label for sentence in train_bio_data for label in sentence["labels"])
label_to_id = {label: i for i, label in enumerate(sorted(unique_labels))}
id_to_label = {i: label for label, i in label_to_id.items()}

# Prepare the Hugging Face Dataset
train_dataset = Dataset.from_list(train_bio_data)
val_dataset = Dataset.from_list(val_bio_data)
raw_datasets = DatasetDict({"train": train_dataset, "validation": val_dataset})

print(raw_datasets['train'].description)

# Tokenize and align labels
tokenized_datasets = raw_datasets.map(
    lambda batch: tokenize_and_align_labels(batch, tokenizer, label_to_id),
    batched=True
)

# **Add Print Statements for Cross-Validation**
# print("==== Tokenized and Labeled Train Dataset ====")
# print(tokenized_datasets["train"].to_pandas())

# print("==== Tokenized and Labeled Validation Dataset ====")
# print(tokenized_datasets["validation"].to_pandas())

# Load SciBERT model
model = AutoModelForTokenClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased", 
    num_labels=len(unique_labels),
    id2label=id_to_label,
    label2id=label_to_id
)

# Check if CUDA (GPU) is available, otherwise fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the correct device (GPU or CPU)
model.to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=7,
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=500,
    report_to="none"
)

# # Compute evaluation metrics
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     true_labels = [
#         [id_to_label[l] for l in label if l != -100] for label in labels
#     ]
#     true_predictions = [
#         [id_to_label[p] for p, l in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
    
#     # Flatten the lists
#     true_labels_flat = [item for sublist in true_labels for item in sublist]
#     true_predictions_flat = [item for sublist in true_predictions for item in sublist]
    
#     precision = precision_score(true_labels_flat, true_predictions_flat, average="weighted")
#     recall = recall_score(true_labels_flat, true_predictions_flat, average="weighted")
#     f1 = f1_score(true_labels_flat, true_predictions_flat, average="weighted")
#     accuracy = accuracy_score(true_labels_flat, true_predictions_flat)
    
#     return {
#         "precision": precision,
#         "recall": recall,
#         "f1": f1,
#         "accuracy": accuracy,
#     }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Ensure all tensors are contiguous before training
for param in model.parameters():
    param.data = param.data.contiguous()

# Train the model
trainer.train()

# Save the trained model
trainer.save_model("./scibert_psc_ner_model")
tokenizer.save_pretrained("./scibert_psc_ner_model")

def test_on_large_text_file(file_path, model_path):
    # Load the tokenizer and model from the saved directory
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Read the text from the file
    with open(file_path, 'r') as file:
        text = file.read()

    # Tokenize the text and split into chunks of 510 tokens
    # (reserve space for [CLS] and [SEP] tokens)
    max_length = 512
    tokenized_input = tokenizer(text, truncation=False, add_special_tokens=False)
    input_ids = tokenized_input['input_ids']

    chunks = [
        input_ids[i: i + max_length - 2]
        for i in range(0, len(input_ids), max_length - 2)
    ]

    all_extracted_data = {}

    # Process each chunk separately
    for chunk in chunks:
        # Add [CLS] and [SEP] tokens
        chunk = [tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id]

        # Convert chunk to tensor
        chunk_tensor = torch.tensor([chunk]).to(device)

        # Create attention mask
        attention_mask = torch.tensor([[1] * len(chunk)]).to(device)

        with torch.no_grad():
            outputs = model(input_ids=chunk_tensor, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=2)

        # Convert token IDs and predictions to labels
        tokens = tokenizer.convert_ids_to_tokens(chunk)
        predicted_labels = [model.config.id2label[pred.item()] for pred in predictions[0]]

        # Group tokens into entities based on continuous labels
        current_entity = None
        for token, label in zip(tokens, predicted_labels):
            if label != "O":  # Start or continue an entity
                if current_entity is None:
                    current_entity = {"label": label, "entity": token}
                elif current_entity["label"] == label:
                    current_entity["entity"] += " " + token
            else:  # End the current entity
                if current_entity:
                    if current_entity["label"] not in all_extracted_data:
                        all_extracted_data[current_entity["label"]] = []
                    all_extracted_data[current_entity["label"]].append(current_entity["entity"])
                    current_entity = None

        # Handle any remaining entity
        if current_entity:
            if current_entity["label"] not in all_extracted_data:
                all_extracted_data[current_entity["label"]] = []
            all_extracted_data[current_entity["label"]].append(current_entity["entity"])

    # Fill the output with null for missing labels
    all_labels = [
        "control_pce", "control_voc", "treated_pce", "treated_voc", "passivating_molecule", 
        "perovskite_composition", "electron_transport_layer", "hole_transport_layer",
        "ISOS-L-1", "ISOS-L-2", "ISOS-T-1", "ISOS-T-2", "ISOS-LC", "ISOS-D-1", "ISOS-D-2"
    ]
    output = {label: all_extracted_data.get(label, [None])[0] for label in all_labels}

    return output

file_path = "6.txt"  # Path to the large text file
model_path = "./scibert_psc_ner_model"  # Path to the saved model directory

# Extract entities from the file using the saved trained model
extracted_data = test_on_large_text_file(file_path, model_path)

# Display the extracted entities
print(extracted_data)






Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.046065,0.622079,0.69985,0.646892,0.69985
2,No log,0.718007,0.79567,0.804641,0.767578,0.804641
3,No log,0.583601,0.822759,0.836826,0.813267,0.836826
4,No log,0.5159,0.821447,0.838323,0.819631,0.838323
5,No log,0.498121,0.839616,0.851048,0.834077,0.851048
6,No log,0.469655,0.850186,0.861901,0.84667,0.861901
7,No log,0.454858,0.862467,0.875374,0.861908,0.875374


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'control_pce': None, 'control_voc': None, 'treated_pce': '51 ##5 . 82', 'treated_voc': '1 . v 1 . 04 v . 90 v . 88', 'passivating_molecule': '[CLS] mmol / ml ( mg / ml ) 6 pea ) 2 ( ma ) mmol / ml ( 94 mg / ml mmol / ml ( mg / ml / mg / ( pea ) 2 ( ma ) mmol / ml ( 94 mg / ml mmol / ml mg / ml mmol / ml mg / ml ) 40 ( pea ) 2 ( ma ) 39 mmol / ml ( 94 mg / ml ) mmol / ml ( mg / ml / ml mg / ml ) 60 ( pea ) 2 ( ma ) 59 mmol / ml ( 94 mg / ml ) mmol / ml ( mg / ml mmol / ml mg / ml 2 mmol / ml ( 94 05 mg / ml ) mmol / ml ( mg / ml', 'perovskite_composition': 'pea concentration 1 ( pea ) 2 pb ##i', 'electron_transport_layer': '60 nm', 'hole_transport_layer': None, 'ISOS-L-1': None, 'ISOS-L-2': None, 'ISOS-T-1': None, 'ISOS-T-2': None, 'ISOS-LC': None, 'ISOS-D-1': None, 'ISOS-D-2': None}


In [1]:
# import torch
# from transformers import LongformerTokenizerFast, LongformerForTokenClassification, Trainer, TrainingArguments
# from datasets import Dataset, DatasetDict
# import xml.etree.ElementTree as ET
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
# import numpy as np

# # Load your data
# data = pd.read_csv("training_data.csv")

# # Function to parse XML annotations
# def extract_annotations_from_xml(xml_data):
#     root = ET.fromstring(xml_data)
#     tokens, labels = [], []
#     for annotation in root.findall('.//annotation'):
#         token = annotation.find(".//text").text
#         label = annotation.find(".//infon[@key='type']").text
#         if token and label:
#             tokens.append(token)
#             labels.append(label)
#     return tokens, labels

# # Apply the XML parsing function to annotated data
# data['tokens_labels'] = data['annotated'].apply(extract_annotations_from_xml)

# # Create negative samples from the unannotated column
# def create_negative_samples(unannotated_text):
#     tokens = unannotated_text.split()  # Split text into tokens
#     labels = ["O"] * len(tokens)  # Assign "O" (no entity) to all tokens
#     return tokens, labels

# data['negative_samples'] = data['unannotated'].apply(create_negative_samples)

# # Combine positive and negative samples
# all_samples = data['tokens_labels'].tolist() + data['negative_samples'].tolist()

# # Split the combined data into training and validation sets
# train_data, val_data = train_test_split(all_samples, test_size=0.2, random_state=42)

# # Convert to BIO format
# def create_bio_format(data):
#     sentences = []
#     for tokens_labels in data:
#         tokens, labels = tokens_labels
#         sentences.append({"tokens": tokens, "labels": labels})
#     return sentences

# train_bio_data = create_bio_format(train_data)
# val_bio_data = create_bio_format(val_data)

# def tokenize_and_align_labels(batch, tokenizer, label_to_id, max_length=16384):
#     # Tokenizing the inputs, remove `add_prefix_space=True`
#     tokenized_inputs = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True, padding="max_length", max_length=max_length)
    
#     # Aligning the labels with the tokens
#     labels = []
#     for i, label in enumerate(batch["labels"]):
#         word_ids = tokenized_inputs.word_ids(i)
#         label_ids = [-100 if word_id is None else label_to_id[label[word_id]] for word_id in word_ids]
#         labels.append(label_ids)

#     # Add labels to tokenized inputs
#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs


# # Instantiate the Longformer tokenizer with add_prefix_space=True
# tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096", add_prefix_space=True)
# unique_labels = set(label for sentence in train_bio_data for label in sentence["labels"])
# label_to_id = {label: i for i, label in enumerate(sorted(unique_labels))}
# id_to_label = {i: label for label, i in label_to_id.items()}

# # Prepare the Hugging Face Dataset
# train_dataset = Dataset.from_list(train_bio_data)
# val_dataset = Dataset.from_list(val_bio_data)
# raw_datasets = DatasetDict({"train": train_dataset, "validation": val_dataset})

# # Tokenize and align labels
# tokenized_datasets = raw_datasets.map(
#     lambda batch: tokenize_and_align_labels(batch, tokenizer, label_to_id),
#     batched=True
# )

# model = LongformerForTokenClassification.from_pretrained(
#     "allenai/longformer-base-4096", 
#     num_labels=len(unique_labels),
#     id2label=id_to_label,
#     label2id=label_to_id
# )

# # Check if CUDA (GPU) is available, otherwise fallback to CPU
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Move the model to the correct device (GPU or CPU)
# model.to(device)

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     learning_rate=5e-5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=7,
#     weight_decay=0.01,
#     save_steps=10_000,
#     save_total_limit=2,
#     logging_dir='./logs',
#     logging_steps=500,
#     report_to="none"
# )

# # Compute evaluation metrics
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     true_labels = [
#         [id_to_label[l] for l in label if l != -100] for label in labels
#     ]
#     true_predictions = [
#         [id_to_label[p] for p, l in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
    
#     # Flatten the lists
#     true_labels_flat = [item for sublist in true_labels for item in sublist]
#     true_predictions_flat = [item for sublist in true_predictions for item in sublist]
    
#     precision = precision_score(true_labels_flat, true_predictions_flat, average="weighted")
#     recall = recall_score(true_labels_flat, true_predictions_flat, average="weighted")
#     f1 = f1_score(true_labels_flat, true_predictions_flat, average="weighted")
#     accuracy = accuracy_score(true_labels_flat, true_predictions_flat)
    
#     return {
#         "precision": precision,
#         "recall": recall,
#         "f1": f1,
#         "accuracy": accuracy,
#     }

# # Initialize Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )

# # Ensure all tensors are contiguous before training
# for param in model.parameters():
#     param.data = param.data.contiguous()

# # Train the model
# trainer.train()

# # Save the trained model
# trainer.save_model("./longformer_psc_ner_model")
# tokenizer.save_pretrained("./longformer_psc_ner_model")

# # Function to test the model on a large text file, no chunking (16,000 tokens)
# def test_on_large_text_file(file_path):
#     # Read the text from the file
#     with open(file_path, 'r') as file:
#         text = file.read()

#     # Tokenize the entire text (up to 16,000 tokens)
#     tokenized_input = tokenizer(text, truncation=True, is_split_into_words=False, padding="max_length", max_length=16384, add_prefix_space=True)
#     chunk_tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
    
#     # Prepare tensor for model prediction
#     chunk_tensor = torch.tensor([tokenized_input['input_ids']]).to(device)

#     with torch.no_grad():
#         outputs = model(chunk_tensor)
#         logits = outputs.logits
#         predictions = torch.argmax(logits, dim=2)

#     # Get predicted labels
#     predicted_labels = [model.config.id2label[pred.item()] for pred in predictions[0]]

#     # Group tokens into entities based on continuous label
#     all_extracted_data = {}
#     current_entity = None
#     for token, label in zip(chunk_tokens, predicted_labels):
#         if label != "O":
#             if current_entity is None:
#                 current_entity = {"label": label, "entity": token}
#             elif current_entity["label"] == label:
#                 current_entity["entity"] += " " + token
#         else:
#             if current_entity:
#                 if current_entity["label"] not in all_extracted_data:
#                     all_extracted_data[current_entity["label"]] = []
#                 all_extracted_data[current_entity["label"]].append(current_entity["entity"])
#                 current_entity = None

#     if current_entity:
#         if current_entity["label"] not in all_extracted_data:
#             all_extracted_data[current_entity["label"]] = []
#         all_extracted_data[current_entity["label"]].append(current_entity["entity"])

#     # Fill the output with null for missing labels
#     all_labels = [
#         "control_pce", "control_voc", "treated_pce", "treated_voc", "passivating_molecule", 
#         "perovskite_composition", "electron_transport_layer", "hole_transport_layer",
#         "ISOS-L-1", "ISOS-L-2", "ISOS-T-1", "ISOS-T-2", "structure_pin_nip", 
#         "date_published", "humidity", "temperature", "time", "efficiency_tret", 
#         "efficiency_cont", "journal_publication"
#     ]
    
#     for label in all_labels:
#         if label not in all_extracted_data:
#             all_extracted_data[label] = None

#     return all_extracted_data

# # Now you can call this function to test on `6.txt`:
# extracted_entities = test_on_large_text_file("6.txt")
# print(extracted_entities)

In [None]:
# Function to process all txt files in the folder and output JSONs
def process_all_txt_files(input_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)

    txt_files = [f for f in os.listdir(input_folder) if f.endswith('.txt')]

    for txt_file in txt_files:
        file_path = os.path.join(input_folder, txt_file)
        output_json_path = os.path.join(output_folder, f"{os.path.splitext(txt_file)[0]}.json")

        extracted_data = test_on_large_text_file(file_path)

        with open(output_json_path, 'w') as json_file:
            json.dump(extracted_data, json_file, indent=4)
        
        print(f"Processed {txt_file}, saved results to {output_json_path}")

input_folder = "54txts"
output_folder = "output_jsons1"

process_all_txt_files(input_folder, output_folder)

In [5]:
import shutil

# Path to the folder you want to download
folder_to_zip = "output_jsons"
output_zip = "json.zip"

# Zip the folder
shutil.make_archive(output_zip.replace(".zip", ""), 'zip', folder_to_zip)

print("download")

json.zip is ready for download.
