## Load Data

In [None]:
! mkdir data
! wget https://github.com/language-ml/4-token-classification/raw/main/Multilingual-NER/en_test.csv -P ./data
! wget https://github.com/language-ml/4-token-classification/raw/main/Multilingual-NER/en_train.csv -P ./data
! wget https://github.com/language-ml/4-token-classification/raw/main/Multilingual-NER/fa_test.csv -P ./data
! wget https://github.com/language-ml/4-token-classification/raw/main/Multilingual-NER/fa_train.csv -P ./data

In [2]:
import pandas as pd
from tqdm import tqdm

In [3]:
PATH = 'data/'
PATH = PATH.rstrip('/')

In [4]:
def load_data(name, test=False):
    print(f'Processing {name}')
    df = pd.read_csv(PATH + '/' + name)
    df.columns = ['index', 'Token', 'Tag']
    df.set_index('index', drop=True, inplace=True)
    mask = df.Token.str.startswith('# id')
    indices = list(map(lambda x:x+1, df.index[mask].tolist())) + [df.index[-1]]
    sentence = 0
    df['Sent'] = None
    for index in tqdm(range(len(indices)-1)):
        df.loc[indices[index]:indices[index+1], 'Sent'] = sentence
        sentence += 1
    df.drop(df.index[mask], inplace=True)
    # df.set_index('Sent',append=True, inplace=True)
    if test:
      df.drop(columns='Tag', inplace=True)
    return df

In [5]:
# EN Data
en_data = load_data('en_train.csv')
# EN Test (Deploy)
en_deploy_test = load_data('en_test.csv', test=True)

# FA Data 
fa_data = load_data('fa_train.csv')
# FA TEST (Desploy)
fa_deploy_test = load_data('fa_test.csv', test=True)

Processing en_train.csv


100%|██████████| 15300/15300 [00:05<00:00, 2988.40it/s]


Processing en_test.csv


100%|██████████| 800/800 [00:00<00:00, 906.28it/s]


Processing fa_train.csv


100%|██████████| 15300/15300 [00:05<00:00, 2673.35it/s]


Processing fa_test.csv


100%|██████████| 800/800 [00:01<00:00, 621.55it/s]


### Split Train Dataset

In [7]:
# from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit

### En: split Train, Eval (Val), Test

In [17]:
splitter = GroupShuffleSplit(test_size=0.3, n_splits=2,  random_state = 42)
split = splitter.split(en_data, groups=en_data['Sent'])
en_train_inds, en_val_test_inds = next(split)

# En Train and Eval+Test
en_train = en_data.iloc[en_train_inds]
en_val_test = en_data.iloc[en_val_test_inds]

splitter = GroupShuffleSplit(test_size=0.33, n_splits=2,  random_state = 42)
split = splitter.split(en_val_test, groups=en_val_test['Sent'])
en_val_inds, en_test_inds = next(split)

# En Eval and Test
en_val = en_data.iloc[en_val_inds]
en_test = en_data.iloc[en_test_inds]


# Split X (Features), y (Label)
en_train_X = en_train[['Sent', 'Token']]
en_val_X =  en_val[['Sent', 'Token']]
en_test_X = en_test[['Sent', 'Token']]

en_train_y = en_train[['Tag']]
en_val_y = en_val[['Tag']]
en_test_y = en_test[['Tag']]

### Fa: split Train, Eval (Val), Test

In [28]:

splitter = GroupShuffleSplit(test_size=0.3, n_splits=2,  random_state = 42)
split = splitter.split(fa_data, groups=fa_data['Sent'])
fa_train_inds, fa_val_test_inds = next(split)

# Fa Train and Eval+Test
fa_train = fa_data.iloc[fa_train_inds]
fa_val_test = fa_data.iloc[fa_val_test_inds]

splitter = GroupShuffleSplit(test_size=0.33, n_splits=2,  random_state = 42)
split = splitter.split(fa_val_test, groups=fa_val_test['Sent'])
fa_val_inds, fa_test_inds = next(split)

# Fa Eval and Test
fa_val = fa_data.iloc[fa_val_inds]
fa_test = fa_data.iloc[fa_test_inds]


# Split X (Features), y (Label)
fa_train_X = fa_train[['Sent', 'Token']]
fa_val_X =  fa_val[['Sent', 'Token']]
fa_test_X = fa_test[['Sent', 'Token']]

fa_train_y = fa_train[['Tag']]
fa_val_y = fa_val[['Tag']]
fa_test_y = fa_test[['Tag']]

## Pretrained Roberta Model

In [23]:
! pip install transformers --quiet

[K     |████████████████████████████████| 3.5 MB 4.4 MB/s 
[K     |████████████████████████████████| 596 kB 33.3 MB/s 
[K     |████████████████████████████████| 6.8 MB 22.6 MB/s 
[K     |████████████████████████████████| 67 kB 4.4 MB/s 
[K     |████████████████████████████████| 895 kB 36.1 MB/s 
[?25h

In [24]:
import matplotlib.pyplot as plt
import numpy as np

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim

# Hugging Face
from transformers import RobertaForTokenClassification, RobertaTokenizer
from transformers import pipeline


## Enlgish NER

## Persian NER

In [25]:
model_name_or_path = "HooshvareLab/roberta-fa-zwnj-base-ner" 
tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path)
model = RobertaForTokenClassification.from_pretrained(model_name_or_path)

Downloading:   0%|          | 0.00/358 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/855k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/315 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.62M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/449M [00:00<?, ?B/s]

In [26]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [32]:
example = fa_train_X[fa_train_X.Sent == 418]['Token'].sort_index().tolist()
sent = "".join(example)
results = nlp(sent)
outputs = [(example[result['index']-1], result['entity']) for result in results]
print(sent)
print(results)
print(outputs)

اینان در مراسم عرض سپاه شاه طهماسب در اطراف هرات ، پانزده هزار سوار به میدان آوردند و در سرکوبی طوایف شورشی قزلباش ، از جمله طایفة استاجلو ، نقش مؤثری داشتند . 
[{'entity': 'B-PER', 'score': 0.8694672, 'index': 6, 'word': 'ĠØ´Ø§Ùĩ', 'start': None, 'end': None}, {'entity': 'I-PER', 'score': 0.7774506, 'index': 7, 'word': 'ĠØ·ÙĩÙħØ§Ø³Ø¨', 'start': None, 'end': None}, {'entity': 'B-LOC', 'score': 0.99991906, 'index': 10, 'word': 'ĠÙĩØ±Ø§Øª', 'start': None, 'end': None}, {'entity': 'B-PER', 'score': 0.54420865, 'index': 27, 'word': 'ĠØ·', 'start': None, 'end': None}, {'entity': 'I-PER', 'score': 0.43737274, 'index': 28, 'word': 'Ø§ÛĮÙģ', 'start': None, 'end': None}, {'entity': 'I-PER', 'score': 0.90173113, 'index': 30, 'word': 'ĠØ§Ø³Øª', 'start': None, 'end': None}, {'entity': 'I-PER', 'score': 0.5651455, 'index': 31, 'word': 'Ø§Ø¬', 'start': None, 'end': None}, {'entity': 'I-PER', 'score': 0.54423237, 'index': 32, 'word': 'ÙĦÙĪ', 'start': None, 'end': None}]
[('شاه ', 'B-PER'), ('طهماسب '

In [None]:
example