In [4]:
import torch
from transformers import RobertaTokenizerFast

from model import RobertaForSpanCategorization

In [5]:
# Locations of the fine-tuned span categorizer and of the tokenizer vocabulary.
# NOTE(review): MODEL_DIR is an absolute path on one machine; consider making it
# relative to the repository or configurable before sharing the notebook.
# NOTE(review): the tokenizer comes from the stock "roberta-base" checkpoint rather
# than from MODEL_DIR - confirm this matches the tokenizer used during fine-tuning.
MODEL_DIR = "C:/Users/Samsung/OneDrive/Desktop/github/tuone-energy-tracker/models/fine_tune_bert_output/"
TOKENIZER_NAME = "roberta-base"

model = RobertaForSpanCategorization.from_pretrained(MODEL_DIR)
tokenizer = RobertaTokenizerFast.from_pretrained(TOKENIZER_NAME)

In [6]:
def get_offsets_and_predicted_tags(example: str, model, tokenizer, threshold=0):
    """
    Get prediction of model on example, using tokenizer.

    The text is tokenized once (with truncation to the tokenizer's maximum length),
    the model is called without gradient tracking, and every token is assigned all
    classes whose logit is strictly greater than ``threshold``.

    Args:
      - example (str): The input text
      - model: The span categorizer. It is called as ``model(**inputs)`` and its result
        must expose ``["logits"]``; the first batch item is read as one row of per-class
        logits per token.
      - tokenizer: The tokenizer. It must support ``return_offsets_mapping=True``
        and ``batch_decode``.
      - threshold: The threshold to decide whether the token should belong to the label.
        Default to 0, which corresponds to probability 0.5 (assuming the logits are
        turned into probabilities with a sigmoid).
    Returns:
      - List of {"token": ..., "tags": ..., "offset": ...} dicts, one per token, where
        "tags" is the list of predicted class indices and "offset" is the
        (start, end) character span of the token in ``example``.
    """
    # Tokenize a single time: the same encoding provides the model inputs, the token ids
    # and the character offsets. truncation=True keeps over-long texts within the
    # tokenizer's maximum length; tokens past that limit are not predicted.
    encoded_example = tokenizer(
        example,
        return_offsets_mapping=True,
        return_tensors="pt",
        truncation=True,
    )

    # The offsets are only used for reporting, so they are removed from the model inputs.
    offset_mapping = encoded_example.pop("offset_mapping")[0].tolist()
    input_ids = encoded_example["input_ids"][0].tolist()

    # Call the model. The output LxK-tensor where L is the number of tokens, K is the number of classes.
    # Inference only: no_grad() avoids building the autograd graph.
    with torch.no_grad():
        logits = model(**encoded_example)["logits"][0]

    # We assign to each token the classes whose logit is above the threshold.
    # Converting to plain Python floats first avoids one tensor comparison per element.
    predicted_tags = [
        [class_id for class_id, logit in enumerate(token_logits) if logit > threshold]
        for token_logits in logits.tolist()
    ]

    # Decoding the ids one by one yields the surface text of every individual token.
    tokens = tokenizer.batch_decode(input_ids)

    return [
        {"token": token, "tags": tags, "offset": tuple(offset)}
        for token, tags, offset in zip(tokens, predicted_tags, offset_mapping)
    ]

In [7]:
# Smoke test: show the predicted class ids next to every token of one headline.
example = "GSR Capital invests in NEVS & building battery factory"
predictions = get_offsets_and_predicted_tags(example, model, tokenizer)
for prediction in predictions:
    token, tags = prediction["token"], prediction["tags"]
    # Pad the token to 15 characters so the tag lists line up in a column.
    print(f"{token:15} - {tags}")

<s>             - []
G               - []
SR              - []
 Capital        - []
 invests        - []
 in             - []
 NE             - []
VS              - []
 &              - []
 building       - []
 battery        - []
 factory        - []
</s>            - []


In [17]:
import json
import pandas as pd
# NOTE(review): absolute path on one machine; consider a repository-relative or
# configurable location before sharing the notebook.
file_path = 'C:/Users/Samsung/OneDrive/Desktop/github/tuone-energy-tracker/data/tuone_labelling.jsonl'

# Read the JSONL file into a list of dictionaries, one per line.
# Blank lines (for example a trailing empty line at the end of the file) are skipped,
# because json.loads() raises on empty input.
with open(file_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

# Create a DataFrame
df = pd.DataFrame(data)

# Extract the "text" column into a list
text_list = df['text'].tolist()

In [21]:
# Only the first SAMPLE_SIZE texts are kept for the cells below.
SAMPLE_SIZE = 100
text_list = text_list[:SAMPLE_SIZE]

In [27]:
# Entity types, numbered from 1 in the order listed here.
tag2id = {
    tag: tag_id
    for tag_id, tag in enumerate(
        ('ORG', 'TECH', 'LOC', 'STATUS', 'CAPACITY', 'VALUE', 'SUBSIDY', 'JOBS'),
        start=1,
    )
}
id2tag = dict(zip(tag2id.values(), tag2id.keys()))

# Label to ID mapping for "IOB" tagging scheme:
# 'O' is 0, and an entity type with id v gets 'B-<type>' = 2*v - 1 and 'I-<type>' = 2*v.
label2id = {'O': 0}
label2id.update((f'B-{tag}', 2 * tag_id - 1) for tag, tag_id in tag2id.items())
label2id.update((f'I-{tag}', 2 * tag_id) for tag, tag_id in tag2id.items())
id2label = dict(zip(label2id.values(), label2id.keys()))

def get_tagged_groups(example: str, model, tokenizer, min_span_length: int = 3):
    """
    Get prediction of model on example, using tokenizer, merged into character spans.

    Per-token label ids from get_offsets_and_predicted_tags are grouped with the IOB
    scheme: a "B-" label opens a new span, an "I-" label extends the span of the same
    tag when the previous token carried that tag (as "B-" or "I-"), and otherwise
    opens a new span.

    Args:
    - example (str): The input text
    - model: The span categorizer, forwarded to get_offsets_and_predicted_tags
    - tokenizer: The tokenizer, forwarded to get_offsets_and_predicted_tags
    - min_span_length (int): Spans covering fewer characters than this (end - start)
      are discarded. Defaults to 3.
    Returns:
    - List of spans under offset format {"start": ..., "end": ..., "tag": ..., "text": ...},
      sorted by start, end then tag. "text" is example[start:end].
    """
    offsets_and_tags = get_offsets_and_predicted_tags(example, model, tokenizer)
    predicted_offsets = {tag: [] for tag in tag2id}
    last_token_tags = []
    for item in offsets_and_tags:
        (start, end), tags = item["offset"], item["tags"]

        # Zero-width tokens (special tokens such as <s> and </s> are reported with a
        # (0, 0) offset) cover no characters of the text. Letting them take part in the
        # grouping would reset the "end" of the running span to 0 whenever an "I-" label
        # is predicted on the closing token, silently dropping a valid span.
        if start == end:
            continue

        for label_id in tags:
            label = id2label[label_id]
            tag = label[2:]  # "I-PER" => "PER"
            if label.startswith("B-"):
                predicted_offsets[tag].append({"start": start, "end": end})
            elif label.startswith("I-"):
                begin_id = label2id[f"B-{tag}"]

                # If "B-" and "I-" both appear in the same tag, ignore as we already processed it
                if begin_id in tags:
                    continue

                if label_id in last_token_tags or begin_id in last_token_tags:
                    # The previous token had the same tag: stretch its span over this token.
                    predicted_offsets[tag][-1]["end"] = end
                else:
                    # A dangling "I-" without a preceding "B-"/"I-" starts a span of its own.
                    predicted_offsets[tag].append({"start": start, "end": end})

        last_token_tags = tags

    flatten_predicted_offsets = [
        {**span, "tag": tag, "text": example[span["start"]:span["end"]]}
        for tag, spans in predicted_offsets.items()
        for span in spans
        if span["end"] - span["start"] >= min_span_length
    ]
    return sorted(
        flatten_predicted_offsets,
        key=lambda row: (row["start"], row["end"], row["tag"]),
    )

# Tagged spans for every text, in the same order as text_list
results = [get_tagged_groups(text, model, tokenizer) for text in text_list]

# Convert results to DataFrame for better visualization
# (one row per text; each cell holds one span dict)
df_results = pd.DataFrame(results)

# Print the results, one list of spans per line
for spans in results:
    print(spans)

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


In [1]:
import json

# Load the data from the JSONL file
# NOTE(review): absolute path on one machine; consider a repository-relative or
# configurable location before sharing the notebook.
file_path = "C:/Users/Samsung/OneDrive/Desktop/github/tuone-energy-tracker/data/tuone_labelling.jsonl"

# Largest len() of the "text" field over all records. This is a character count when
# the field is a string, not a number of model tokens.
# Blank lines are skipped because json.loads() raises on empty input; a file without
# any record yields 0.
with open(file_path, 'r', encoding='utf-8') as file:
    stripped_lines = (line.strip() for line in file)
    max_length = max(
        (len(json.loads(line)["text"]) for line in stripped_lines if line),
        default=0,
    )

max_length


2511