In [4]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

# Load the CSV file into a Pandas dataframe
df = pd.read_csv('data/fpsc_relevant_ocm.csv')

# Load the pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')

# Define a function to generate embeddings for a given text
def generate_embedding(text):
    # Tokenize the text
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    # Convert the input to a tensor
    input_tensor = torch.tensor([input_ids])
    # Feed the input tensor through the BERT model
    with torch.no_grad():
        last_hidden_states = model(input_tensor)[0]
    # Extract the embedding for the [CLS] token
    cls_embedding = last_hidden_states[:, 0, :]
    # Return the embedding as a NumPy array
    return cls_embedding.numpy()

# Generate embeddings for each text field in the dataframe
embeddings = []
for text in df['textrecord']:
    embedding = generate_embedding(text)
    embeddings.append(embedding)

# Cluster the embeddings using K-Means
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings)

# Extract SPOs from the clustered texts
from spacy.lang.en import English
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))

spos = []
for cluster in set(cluster_labels):
    cluster_texts = df[cluster_labels == cluster]['text']
    cluster_doc = nlp(' '.join(cluster_texts))
    for sent in cluster_doc.sents:
        sent_text = sent.text
        sent_embedding = generate_embedding(sent_text)
        # Perform SPO extraction here and append to spos list
        spos.append((subject, predicate, object))

# Print the extracted SPOs
for spo in spos:
    print(spo)


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Token indices sequence length is longer than the specified maximum sequence length for this model (5468 > 512). Running this sequence through th

RuntimeError: The expanded size of the tensor (5468) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 5468].  Tensor sizes: [1, 512]

In [27]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Define the text to process
text = "John hit the ball with a bat. Paris is the capital of France"


In [34]:
# pip install allennlp allennlp-models

Collecting allennlp
  Downloading allennlp-2.10.1-py3-none-any.whl (730 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.2/730.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting allennlp-models
  Downloading allennlp_models-2.10.1-py3-none-any.whl (464 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m464.5/464.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
[?25hCollecting lmdb>=1.2.1
  Downloading lmdb-1.4.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (306 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.5/306.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m31m14.3 MB/s[0m eta [36m0:00:01[0m
Collecting pytest>=6.2.5
  Downloading pytest-7.2.2-py3-none-any.whl (317 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.2/317.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00

In [35]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging

predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz")
predictor.predict(
    sentence="Did Uriah honestly think he could beat the game in under three hours?."
)


  warn(f"Failed to load image Python extension: {e}")
[nltk_data] Downloading package punkt to /home/hasan-sh/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/hasan-sh/nltk_data...


Downloading (…)/main/tokenizer.json: 100%|███| 466k/466k [00:00<00:00, 1.14MB/s]
Downloading (…)"pytorch_model.bin";: 100%|███| 440M/440M [00:36<00:00, 12.1MB/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification m

{'verbs': [{'verb': 'Did',
   'description': '[V: Did] Uriah honestly think he could beat the game in under three hours ? .',
   'tags': ['B-V',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O']},
  {'verb': 'think',
   'description': 'Did [ARG0: Uriah] [ARGM-ADV: honestly] [V: think] [ARG1: he could beat the game in under three hours] ? .',
   'tags': ['O',
    'B-ARG0',
    'B-ARGM-ADV',
    'B-V',
    'B-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'I-ARG1',
    'O',
    'O']},
  {'verb': 'could',
   'description': 'Did Uriah honestly think he [V: could] beat the game in under three hours ? .',
   'tags': ['O',
    'O',
    'O',
    'O',
    'O',
    'B-V',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O']},
  {'verb': 'beat',
   'description': 'Did Uriah honestly think [ARG0: he] [ARGM-MOD: could] [V: beat] [ARG1:

In [54]:

# Apply NLP preprocessing
doc = nlp(text)

# Define a list to hold SPOs
spos = []

# Iterate over the sentences in the text
for sent in doc.sents:
    # Get the SRL predictions for the sentence
    srl_results = predictor.predict_tokenized(tokenized_sentence=[t.text for t in sent])
    # Iterate over the SRL predictions for each predicate in the sentence
    for predicate_index, predicate in enumerate(srl_results["verbs"]):
        # Get the predicate text and its semantic roles
        predicate_text = predicate["verb"]
        roles = predicate["tags"]
        # Find the ARG0 and ARG1 roles
        subj = ""
        obj = ""
        for i, role in enumerate(roles):
            if role == "B-ARG0":
                # print(sent, dir(srl_results["words"][i]))
                subj = sent[srl_results["words"][i]["startswith"]:srl_results["words"][i]["endswith"]]
            elif role == "B-ARG1":
                obj = sent[srl_results["words"][i]["startswith"]:srl_results["words"][i]["endswith"]]
        # Add the SPO to the list
        spos.append((subj.text, predicate_text, obj.text))

# Print the generated SPOs
for spo in spos:
    print(spo)

TypeError: string indices must be integers

In [36]:
predictor.predict(
    sentence="My name is Hasan and I am 27 years old."
)


{'verbs': [{'verb': 'is',
   'description': '[ARG1: My name] [V: is] [ARG2: Hasan] and I am 27 years old .',
   'tags': ['B-ARG1',
    'I-ARG1',
    'B-V',
    'B-ARG2',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O',
    'O']},
  {'verb': 'am',
   'description': 'My name is Hasan and [ARG1: I] [V: am] [ARG2: 27 years old] .',
   'tags': ['O',
    'O',
    'O',
    'O',
    'O',
    'B-ARG1',
    'B-V',
    'B-ARG2',
    'I-ARG2',
    'I-ARG2',
    'O']}],
 'words': ['My',
  'name',
  'is',
  'Hasan',
  'and',
  'I',
  'am',
  '27',
  'years',
  'old',
  '.']}

In [29]:
print(doc.ents)#[0].label_)

(John, Paris, France)
