In [19]:
import pandas as pd
import glob
from tqdm.notebook import tqdm

from transformers import AutoTokenizer, AutoModelForTokenClassification, BertTokenizer
import torch 
from transformers import pipeline

In [3]:
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

In [5]:
# Test string and example tokens from tokenizer.
t_str = "If it's flipping hamburgers at McDonald's, be the best hamburger flipper in the world. Whatever it is you do you have to master your craft. - Snoop Dogg"

out = tokenizer(t_str)
print(tokenizer.convert_ids_to_tokens(out.input_ids))

['<s>', 'ĠIf', 'Ġit', "'s", 'Ġflipping', 'Ġhamb', 'urg', 'ers', 'Ġat', 'ĠMcDonald', "'s", ',', 'Ġbe', 'Ġthe', 'Ġbest', 'Ġhamb', 'urger', 'Ġfl', 'ipper', 'Ġin', 'Ġthe', 'Ġworld', '.', 'ĠWhatever', 'Ġit', 'Ġis', 'Ġyou', 'Ġdo', 'Ġyou', 'Ġhave', 'Ġto', 'Ġmaster', 'Ġyour', 'Ġcraft', '.', 'Ġ-', 'ĠSno', 'op', 'ĠDog', 'g', '</s>']


In [6]:
# Example NER output
ner(t_str)

[{'entity_group': 'ORG',
  'score': 0.87588465,
  'word': " McDonald's",
  'start': 31,
  'end': 41},
 {'entity_group': 'PER',
  'score': 0.9990082,
  'word': ' Snoop Dogg',
  'start': 142,
  'end': 152}]

In [6]:
# Generate list of page paths required for Nikolaus' page_ner function
page_paths = glob.glob('code-and-results/sample-14/*/*.txt')
print(len(page_paths))

562


In [9]:
page_paths

['code-and-results/sample-14/mdp.39015046352020/00000106.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000073.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000071.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000111.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000064.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000070.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000074.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000060.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000100.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000101.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000088.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000102.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000116.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000089.txt',
 'code-and-results/sample-14/mdp.39015046352020/00000207.txt',
 'code-and-results/sample-14/mdp.39015046352020/0000015

In [14]:
def page_ner(page_path:str, vol_id):
  page_text= open(page_path, errors='replace').read()
  # print(page_text)
# tokenized_text = word_tokenize(page_text)
  classified_text = ner(page_text)
  # print(classified_text)
  word_lst=[]
  #class_lst=[]
  for item in classified_text:
    # print(item)
    # print(type(item))#<class 'tuple'>
    aList = list(item.values())
    word_lst.append(aList)
  #ner_df = pd.DataFrame({'word': word_lst,'ner': class_lst})
  page_id= page_path.split('/')[-1].rstrip('.txt')
  ner_df = pd.DataFrame(word_lst, columns=['entity_group', 'score', 'word', 'start', 'end'])
  ner_df['vol_id'] = vol_id
  ner_df['page_id'] = page_id

  return ner_df


In [None]:
# pt = open(page_paths[0], errors='replace').read()

page_ner(page_paths[0], 'mdp.39015046352020')

In [23]:
# roberta-large-ner: https://huggingface.co/Jean-Baptiste/roberta-large-ner-english
# tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
# model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")

# BERT base NER: https://huggingface.co/dslim/bert-base-NER
# tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
# tokenizer = AutoTokenzier.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

df_l = []

for page in tqdm(page_paths, total=len(page_paths)):
    # print(page)
    htid = page.split('/')[1]
    # print(htid)
    vol_df = page_ner(page, htid)
    df_l.append(vol_df)
    # print(vol_df.shape, flush=True)

ent_df = pd.concat(df_l, ignore_index=True)

# reordering columns
ent_df = ent_df[['vol_id','page_id','start', 'end','word','entity_group','score']]

print(ent_df.shape)
ent_df.head()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'BertTokenizer'.


TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType

In [75]:
# Final review and save
ent_df.head(25)
# ent_df.index
ent_df.to_csv("roberta-large-ner-output.tsv", sep='\t')
# ent_df.to_csv("bert-base-ner-output.tsv", sep='\t')