In [1]:
!pip install transformers
!pip install pandas



In [10]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
# model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
model_name = "Davlan/distilbert-base-multilingual-cased-ner-hrl"  # splits words into odd tokens.  e.g. Abram becomes 'Ab' + '##bram'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "It came about when Abram came into Egypt, the Egyptians saw that the woman was very beautiful."
ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-PER', 'score': 0.99987364, 'index': 5, 'word': 'Ab', 'start': 19, 'end': 21}, {'entity': 'I-PER', 'score': 0.99761295, 'index': 6, 'word': '##ram', 'start': 21, 'end': 24}, {'entity': 'B-LOC', 'score': 0.9997857, 'index': 9, 'word': 'Egypt', 'start': 35, 'end': 40}]


In [7]:
# import time
# start_time = time.time()
# result = nlp("Then Pharaoh called Abram and said, What is this you have done to me? Why did you not tell me that she was your wife?")
# elapsed_time = time.time() - start_time
# print(f"ner took {elapsed_time} seconds {result}")

converted_test = tokenizer.convert_ids_to_tokens(tokenizer.encode("Nader Jokhadar had given Syria the lead with a well-struck header in the seventh minute."))

print(converted_test)


['[CLS]', 'Nad', '##er', 'Jo', '##kha', '##dar', 'had', 'given', 'Syria', 'the', 'lead', 'with', 'a', 'well', '-', 'struck', 'head', '##er', 'in', 'the', 'seventh', 'minute', '.', '[SEP]']


In [28]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [45]:
import pandas as pd
def process_bible_csv(base_path="drive/MyDrive/data", file_path="new american standard with distilibert-base-multilingual-cased-ner-hrl.csv", save_every=100):
  complete_path = f"{base_path}/{file_path}"
  df = pd.read_csv(complete_path)
  print("processing...")

  if'named_entities' not in df.columns:
    df['named_entities'] = None

  process_count = 0
  for index, row in df.iterrows():
    if pd.isna(row['named_entities']):
      verse_text = row['verse_text']
      named_entities = nlp(verse_text)
      # print(f"entities: {named_entities} for verse: {verse_text}")
      df.at[index, 'named_entities'] = str(named_entities)
      process_count += 1
      if process_count > save_every:
        start_time = time.time()
        df.to_csv(complete_path, index=False)
        elapsed_time = time.time() - start_time
        # df = pd.read_csv(complete_path)
        print(f"save of {len(df)} rows took {elapsed_time} seconds. index: {index}")
        process_count = 0
  # final save
  df.to_csv(complete_path, index=False)
  print(f"final df size is {len(df)}")

In [46]:
process_bible_csv()

processing...
save of 31102 rows took 0.3339531421661377 seconds. index: 100
save of 31102 rows took 0.14854717254638672 seconds. index: 201
save of 31102 rows took 0.15734457969665527 seconds. index: 302
save of 31102 rows took 0.15687918663024902 seconds. index: 403
save of 31102 rows took 0.27924609184265137 seconds. index: 504
save of 31102 rows took 0.17500782012939453 seconds. index: 605
save of 31102 rows took 0.28442907333374023 seconds. index: 706
save of 31102 rows took 0.1642286777496338 seconds. index: 807
save of 31102 rows took 0.16824126243591309 seconds. index: 908
save of 31102 rows took 0.16315817832946777 seconds. index: 1009
save of 31102 rows took 0.1601543426513672 seconds. index: 1110
save of 31102 rows took 0.16078877449035645 seconds. index: 1211
save of 31102 rows took 0.2874486446380615 seconds. index: 1312
save of 31102 rows took 0.2891550064086914 seconds. index: 1413
save of 31102 rows took 0.16214942932128906 seconds. index: 1514
save of 31102 rows took 0