## Load Data

In [228]:
import pandas as pd

In [248]:
def load_data(name, test=False):
    print(f'Processing {name}')
    df = pd.read_csv(PATH + '/' + name)
    df.columns = ['index', 'Token', 'Tag']
    df.set_index('index', drop=True, inplace=True)
    mask = df.Token.str.startswith('# id')
    indices = list(map(lambda x:x+1, df.index[mask].tolist())) + [df.index[-1]]
    sentence = 0
    df['Sent'] = None
    for index in tqdm(range(len(indices)-1)):
        df.loc[indices[index]:indices[index+1], 'Sent'] = sentence
        sentence += 1
    df.drop(df.index[mask], inplace=True)
    df.set_index('Sent',append=True, inplace=True)
    if test:
      df.drop(columns='Tag', inplace=True)
    return df

In [249]:
PATH = 'data/'
PATH = PATH.rstrip('/')

# EN Train
en_train_data = load_data('en_train.csv')

# EN Test
en_test = load_data('en_test.csv', test=True)
# FA Train
fa_train_data = load_data('fa_train.csv')

# FA TEST
fa_test = load_data('fa_test.csv', test=True)

Processing en_train.csv


100%|██████████| 15300/15300 [00:01<00:00, 9189.92it/s]


Processing en_test.csv


100%|██████████| 800/800 [00:00<00:00, 1855.76it/s]


Processing fa_train.csv


100%|██████████| 15300/15300 [00:01<00:00, 8964.88it/s]


Processing fa_test.csv


100%|██████████| 800/800 [00:00<00:00, 1671.81it/s]


In [250]:
fa_train_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Token,Tag
index,Sent,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,مردی,B-CW
2,0,برای,I-CW
3,0,تمام,I-CW
4,0,فصول,I-CW
5,0,–,O
...,...,...,...
293541,15298,بلکسی,B-CORP
293542,15298,گرلز,I-CORP
293544,15299,ووپاک,B-CORP
293545,15299,مدیر,O


### Split Train Dataset

In [251]:
from sklearn.model_selection import train_test_split

In [238]:
en_train, en_eval, en_train_label, en_eval_label = train_test_split(
    en_train_data[['Token','Sent']], 
    en_train_data['Tag'], 
    test_size=0.2, 
    random_state=42)

In [239]:
fa_train, fa_eval, fa_train_label, fa_eval_label = train_test_split(
    fa_train_data[['Token','Sent']], 
    fa_train_data['Tag'], 
    test_size=0.2, 
    random_state=42)

In [157]:
fa_train

Unnamed: 0,Token,Sent
258979,در,14189
143298,۸,7858
1143,تلویزیونی,62
123925,سال,6790
149123,بنیان,8182
...,...,...
119879,دیوولور,6576
259178,ینا,14199
131932,۱۰۲۵,7240
146867,فرانسیسکو,8055


## Pretrained Roberta Model

In [7]:
! pip install transformers --quiet

In [8]:
import matplotlib.pyplot as plt
import numpy as np

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim

# Hugging Face
from transformers import RobertaForTokenClassification, RobertaTokenizer
from transformers import pipeline

# tqdm
from tqdm import tqdm

## Enlgish NER

## Persian NER

In [263]:
model_name_or_path = "HooshvareLab/roberta-fa-zwnj-base-ner" 
tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path)
model = RobertaForTokenClassification.from_pretrained(model_name_or_path)

In [264]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [303]:
example = fa_train[fa_train.Sent == 7]['Token'].sort_index().tolist()
sent = "".join(example)
results = nlp(sent)
outputs = [(example[result['index']-1], result['entity']) for result in results]
print(sent)
print(results)
print(outputs)

۱ آبان اهواز ) منتقد فیلم نویس و کارگردان اهل کشور ایران است . 
[{'entity': 'B-DAT', 'score': 0.46856514, 'index': 1, 'word': 'ĠÛ±', 'start': None, 'end': None}, {'entity': 'I-DAT', 'score': 0.9064486, 'index': 2, 'word': 'ĠØ¢Ø¨Ø§ÙĨ', 'start': None, 'end': None}, {'entity': 'B-LOC', 'score': 0.6031945, 'index': 3, 'word': 'ĠØ§ÙĩÙĪØ§Ø²', 'start': None, 'end': None}, {'entity': 'B-LOC', 'score': 0.6845396, 'index': 12, 'word': 'ĠØ§ÛĮØ±Ø§ÙĨ', 'start': None, 'end': None}]
[('۱ ', 'B-DAT'), ('آبان ', 'I-DAT'), ('اهواز ', 'B-LOC'), ('ایران ', 'B-LOC')]
