In [1]:
import json

# Read the annotated corpus export and parse it into Python objects.
with open("negacio_uab_revised_version.json", mode="r", encoding="utf-8") as corpus_file:
    data = json.loads(corpus_file.read())


In [143]:
# Print the annotation list held by the first prediction of every record.
for d in data:
    first_prediction = d['predictions'][0]
    print(first_prediction['result'])

[{'value': {'start': 1350, 'end': 1354, 'labels': ['NEG']}, 'id': 'ent0', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}, {'value': {'start': 1996, 'end': 1999, 'labels': ['NEG']}, 'id': 'ent1', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}, {'value': {'start': 2385, 'end': 2389, 'labels': ['NEG']}, 'id': 'ent2', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}, {'value': {'start': 2490, 'end': 2493, 'labels': ['NEG']}, 'id': 'ent3', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}, {'value': {'start': 2557, 'end': 2560, 'labels': ['NEG']}, 'id': 'ent4', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}, {'value': {'start': 2657, 'end': 2660, 'labels': ['NEG']}, 'id': 'ent5', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}, {'value': {'start': 2748, 'end': 2751, 'labels': ['NEG']}, 'id': 'ent6', 'from_name': 'label', 'to_name': 'text', 'type': 'labels'}, {'value': {'start': 3038, 'end': 3049, 'labels': ['UNC']}, 'id': 'en

In [4]:
# All prediction entries attached to the first record.
predictions = data[0]['predictions']

In [5]:
# Rebinds `predictions` to the `result` list (the individual span annotations)
# of the first record's first prediction entry.
predictions= data[0]['predictions'][0]['result']

In [6]:
# Raw text of the first record.
text = data[0]['data']['text']

In [7]:
# Create one 'O' label per element of `text`. `text` is a string here, so this
# yields one label per character, not one per token.
labels = ['O'] * len(text)  # initialize all labels to 'O'

In [8]:
# Pull the span boundaries and the class name out of the first annotation.
first_value = predictions[0]['value']
start = first_value['start']
end = first_value['end']
# Bound to `label` (singular) so the `labels` list is not replaced by a single
# string; this matches the naming used by the annotation loops further down.
label = first_value['labels'][0]

In [75]:
# Length of the first record's text (shown as the cell output).
len(data[0]['data']["text"])

3655

In [None]:
# Show the first label name of every annotation on the first record.
for annotation in data[0]['predictions'][0]['result']:
    value = annotation['value']
    print(value['labels'][0])


In [None]:
# Display the annotation list of the first record's first prediction entry.
data[0]['predictions'][0]['result']

In [106]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class NegationScopeDataset(Dataset):
    """Token-level binary labelling dataset built from a JSON annotation export.

    Every record is expected to provide ``record['data']['text']`` and a list
    of span annotations under ``record['predictions'][0]['result']``. Each
    annotation carries ``value['start']``, ``value['end']`` (used as character
    offsets into the text) and ``value['labels']``.

    A token is labelled 1 when it overlaps a span whose first label is
    ``'NEG'``; every other token, including special tokens, is labelled 0.
    """

    def __init__(self, json_file, tokenizer_name):
        """Load the tokenizer and the annotated records.

        Args:
            json_file: Path of the UTF-8 encoded JSON file holding the records.
            tokenizer_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
        """
        super().__init__()

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

        with open(json_file, encoding="utf-8") as f:
            self.data = json.load(f)

    def __len__(self):
        """Return the number of records in the export."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode one record and label each produced token.

        The text is tokenised once with offset mapping enabled, so labels are
        derived from the exact character range every token covers. This keeps
        ``labels`` the same length as the returned ids and aligned with them,
        special tokens included. No truncation is requested.

        The tokenizer must support ``return_offsets_mapping``.
        NOTE(review): in ``transformers`` this is a "fast" tokenizer feature —
        confirm the chosen checkpoint provides one.

        Args:
            idx: Position of the record in the loaded JSON list.

        Returns:
            A ``(input_ids, labels)`` tuple of two equally long lists of ints.
        """
        record = self.data[idx]
        text = record['data']['text']

        # Character ranges of every annotation whose first label is NEG.
        neg_spans = [
            (annotation['value']['start'], annotation['value']['end'])
            for annotation in record['predictions'][0]['result']
            if annotation['value']['labels'][0] == 'NEG'
        ]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            return_offsets_mapping=True,
        )
        input_ids = list(encoding['input_ids'])
        labels = [0] * len(input_ids)

        for token_idx, (tok_start, tok_end) in enumerate(encoding['offset_mapping']):
            # Zero-width offsets do not cover any text (this is how the special
            # tokens show up), so they can never belong to a span.
            if tok_start == tok_end:
                continue
            # Half-open interval overlap between the token and any NEG span.
            if any(tok_start < span_end and span_start < tok_end
                   for span_start, span_end in neg_spans):
                labels[token_idx] = 1

        return input_ids, labels


In [119]:
from torch.utils.data import DataLoader

# Create a NegationScopeDataset
dataset = NegationScopeDataset(json_file='negacio_uab_revised_version.json', tokenizer_name='bert-base-uncased')


def pad_collate(batch):
    """Pad a batch of ``(input_ids, labels)`` pairs into two LongTensors.

    The dataset yields Python lists whose length varies per document, which the
    default collation cannot stack. Each field is right-padded to the longest
    sequence of that field in the batch.

    Args:
        batch: List of ``(input_ids, labels)`` tuples produced by the dataset.

    Returns:
        ``(input_ids, labels)`` tensors, each of shape ``(batch, longest_sequence)``.
    """
    import torch  # function-scope import: this cell only imports DataLoader

    pad_id = dataset.tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0

    def _pad(sequences, fill):
        width = max(len(seq) for seq in sequences)
        return torch.tensor(
            [list(seq) + [fill] * (width - len(seq)) for seq in sequences],
            dtype=torch.long,
        )

    ids_batch, labels_batch = zip(*batch)
    # -100 is the default `ignore_index` of torch.nn.CrossEntropyLoss, so padded
    # label positions do not contribute to such a loss.
    return _pad(ids_batch, pad_id), _pad(labels_batch, -100)


# Create a DataLoader for the dataset
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=pad_collate)


In [None]:
# Walk over every record, printing its text followed by each raw annotation,
# while unpacking the span boundaries and class name of the annotation.
for d in data:
    text = d['data']['text']
    labels = ['O' for _ in text]
    print(text)
    for annotation in d['predictions'][0]['result']:
        print(annotation)
        span = annotation['value']
        start = span['start']
        end = span['end']
        label = span['labels'][0]

In [23]:
import json
import nltk
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report

# Parse the annotated corpus export from disk.
with open("negacio_uab_revised_version.json", mode="r", encoding="utf-8") as corpus_file:
    data = json.loads(corpus_file.read())

# Create a list of labeled instances: one (text, labels) pair per record, where
# `labels` holds one BIO tag per character of `text`.
instances = []

for d in data:
    text = d['data']['text']
    labels = ['O'] * len(text)

    for annotation in d['predictions'][0]['result']:

        label = annotation['value']['labels'][0]

        if label != "NEG":
            continue

        # Clamp the span to the text. Slice assignment with an out-of-range or
        # empty span would insert or append elements and leave `labels` longer
        # than `text`, shifting every following tag.
        start = max(0, annotation['value']['start'])
        end = min(len(text), annotation['value']['end'])
        if start >= end:
            continue

        labels[start:end] = ['B-' + label] + ['I-' + label] * (end - start - 1)

    # Invariant relied on downstream: exactly one tag per character.
    assert len(labels) == len(text)
    instances.append((text, labels))

In [None]:
def preprocess_text(text):
    """Build a flat feature dict for all tokens found in ``text``.

    The text is tokenised with ``nltk.word_tokenize``. For the token at
    position ``i`` the dict receives the keys ``word.lower()_i``,
    ``word[-3:]_i``, ``word[-2:]_i``, ``word.isupper()_i``,
    ``word.istitle()_i``, ``word.isdigit()_i``, ``pos_i`` and ``lemma_i``.
    A constant ``bias`` entry is added when at least one token exists.

    Args:
        text: String to tokenise and describe.

    Returns:
        dict mapping feature names to values; empty when no token is produced.
    """
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Extract PoS tags and lemma
    pos_tags = [pos for _, pos in nltk.pos_tag(tokens)]
    # One lemmatizer instance is shared by all tokens instead of building a new
    # one per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]

    # Extract features
    features = {}
    if tokens:
        # Constant feature, written once rather than on every loop iteration.
        features['bias'] = 1.0

    for i, (word, pos, lemma) in enumerate(zip(tokens, pos_tags, lemmas)):
        features[f'word.lower()_{i}'] = word.lower()
        features[f'word[-3:]_{i}'] = word[-3:]
        features[f'word[-2:]_{i}'] = word[-2:]
        features[f'word.isupper()_{i}'] = word.isupper()
        features[f'word.istitle()_{i}'] = word.istitle()
        features[f'word.isdigit()_{i}'] = word.isdigit()
        features[f'pos_{i}'] = pos
        features[f'lemma_{i}'] = lemma

    return features

# Turn every instance into a sequence of feature dicts plus its tag sequence.
# Iterating over `text` (a string) yields single characters, so each item of a
# sequence is the feature dict of one character.
X = [list(map(preprocess_text, text)) for text, _ in instances]
y = [list(labels) for _, labels in instances]

# Hold out 20% of the documents for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a CRF on the training documents
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)

In [25]:
# Predict one label sequence per held-out document with the fitted CRF.
# The result is nested: one list of labels for each document in X_test.
y_pred = crf.predict(X_test)


In [37]:
from sklearn.metrics import classification_report

# Flatten the per-document label sequences so scikit-learn can compare them.
# The nested predictions are recomputed here instead of flattening `y_pred` in
# place. Re-running this cell would otherwise iterate over already-flat label
# strings and split them into characters.
y_pred_nested = crf.predict(X_test)
y_pred = [label for doc_labels in y_pred_nested for label in doc_labels]
y_true = [label for doc_labels in y_test for label in doc_labels]

# Compute the classification report
report = classification_report(y_true, y_pred)
print(report)


              precision    recall  f1-score   support

       B-NEG       0.00      0.00      0.00      1003
       I-NEG       0.06      0.00      0.00      3425
           O       0.98      1.00      0.99    290060

    accuracy                           0.98    294488
   macro avg       0.35      0.33      0.33    294488
weighted avg       0.97      0.98      0.98    294488

