# Train spaCy model

In [4]:
# Download the xx_ent_wiki_sm pipeline package; a later cell loads it with spacy.load.
# NOTE(review): `!python` runs whichever interpreter the shell resolves, which may not be
# the kernel's environment — confirm the package lands where the kernel can import it.
!python -m spacy download xx_ent_wiki_sm

Collecting xx-ent-wiki-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.7.0/xx_ent_wiki_sm-3.7.0-py3-none-any.whl (11.1 MB)
     ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
     --------------------------------------- 0.0/11.1 MB 660.6 kB/s eta 0:00:17
      --------------------------------------- 0.1/11.1 MB 1.7 MB/s eta 0:00:07
     - -------------------------------------- 0.4/11.1 MB 2.9 MB/s eta 0:00:04
     --- ------------------------------------ 0.9/11.1 MB 5.2 MB/s eta 0:00:02
     ----- ---------------------------------- 1.6/11.1 MB 7.2 MB/s eta 0:00:02
     ------ --------------------------------- 1.9/11.1 MB 8.2 MB/s eta 0:00:02
     ------ --------------------------------- 1.9/11.1 MB 8.2 MB/s eta 0:00:02
     ------- -------------------------------- 2.1/11.1 MB 6.0 MB/s eta 0:00:02
     ------- -------------------------------- 2.1/11.1 MB 6.0 MB/s eta 0:00:02
     -------- --------------------------


[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Pretrained multilingual pipeline; later cells call nlp.make_doc on each training text.
nlp = spacy.load("xx_ent_wiki_sm") 
doc_bin = DocBin() # container that collects the annotated Docs and is later written to disk


## Prepare data

In [None]:
import json

# Load the annotated training examples.
# The encoding is passed explicitly: without it open() uses the platform's locale
# encoding, which is not UTF-8 on every system and can fail on (or garble) non-ASCII
# text. JSON files are expected to be UTF-8.
with open("./train/data/train_data.json", encoding="utf-8") as f:
    training_data = json.load(f)

In [45]:
# Convert every annotated example into a spaCy Doc and serialise the collection.
# A fresh DocBin is created in this cell so that re-running it does not append the
# same documents a second time to a DocBin left over in the kernel.
doc_bin = DocBin()

for training_example in tqdm(training_data['annotations']):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)  # tokenise only; no pipeline components are run
    ents = []
    for start, end, label in labels:
        # "contract" shrinks the character range to the tokens lying fully inside it;
        # char_span returns None when no token fits, and such annotations are dropped.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)

    doc.ents = ents
    doc_bin.add(doc)

# Save next to train_data.json: this is the path the `spacy train` command in the
# Training section passes as --paths.train / --paths.dev.
doc_bin.to_disk("./train/data/training_data.spacy")

 77%|███████▋  | 142/185 [00:00<00:00, 242.66it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


100%|██████████| 185/185 [00:00<00:00, 246.63it/s]

Skipping entity
Skipping entity





## Training

In [6]:
# Expand base_config.cfg into a complete config.cfg by auto-filling the remaining values.
!python -m spacy init fill-config base_config.cfg config.cfg

✔ Auto-filled config with all values
✔ Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Train with config.cfg and write the results to ./output (output/model-best is loaded
# in the Predict section).
# NOTE(review): --paths.dev points at the same file as --paths.train, so the ENTS_F /
# ENTS_P / ENTS_R scores are measured on the training data itself; use a held-out dev
# set to get a meaningful evaluation.
!python -m spacy train config.cfg --output ./output --paths.train ./train/data/training_data.spacy --paths.dev ./train/data/training_data.spacy 

^C


ℹ Saving to output directory: output
ℹ Using CPU
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    122.30    0.08    0.04    0.57    0.00
  1     200       2139.51   4107.62   75.64   86.45   67.24    0.76
  2     400         55.94    241.61   86.75   88.69   84.90    0.87
  3     600       1147.14    233.04   89.74   86.32   93.45    0.90
  4     800         69.92     90.63   91.25   91.91   90.60    0.91
  5    1000      40366.42    383.94   94.38   93.07   95.73    0.94
  6    1200         65.99     55.62   98.12   99.42   96.87    0.98
  7    1400         81.13     42.94   97.17   96.62   97.72    0.97
  8    1600        236.94     61.39   97.88   97.19   98.58    0.98
  9    1800         59.46     21.78   98.15   98.01   98.29    0.98
 10    2000         68.29     30.26   98.15

## Predict

In [10]:
from pypdf import PdfReader

def convert_pdf_to_text(file_path: str, collapse_whitespace: bool = False) -> str:
    """
    Extract the text of the first page of a PDF as a single-line string.

    Only the first page is read because the name and email are expected to
    appear there.

    Args:
        file_path: path of the PDF file.
        collapse_whitespace: when True, squeeze every run of whitespace into a
            single space and strip both ends. Defaults to False, which keeps
            the extractor's spacing (only line breaks are turned into spaces),
            i.e. exactly what this function returned before the option existed.

    Returns:
        The first page's text with line feeds and carriage returns replaced
        by spaces.
    """
    reader = PdfReader(file_path)

    # Indexing page 0 directly: a PDF without pages makes this lookup fail.
    first_page = reader.pages[0]
    text = first_page.extract_text()

    # Flatten the page onto one line.
    text = text.replace("\n", " ").replace("\r", " ")

    # The previous `" ".join(text.split(" "))` was an identity operation and has
    # been removed; real whitespace collapsing is opt-in because it shifts
    # character offsets and changes the input the trained model receives.
    if collapse_whitespace:
        text = " ".join(text.split())
    return text

In [29]:
my_ner_model = spacy.load("output/model-best") # load the model-best pipeline from the training output directory

In [30]:
# Extract the first page of a sample resume and display the resulting string.
# NOTE(review): the displayed text contains personal details from the resume
# (name, phone, email, address) — consider clearing this output before sharing.
text = convert_pdf_to_text("resumes/android_01.pdf")
text

'  Page 1 of 13   CURRICULUM VITAE     Personal detail   Name  Nguyen Thanh Xuan   Nationality  Vietnamese   Date of Birth  November 20, 1985   Sex Male   Marital status  Married   Phone  84-946858197   Home address  72/486I Phan Huy Ich Street , Go Vap District ,  Hochiminh City , Viet  Nam   Email  xuanusm@gmail.com     Employment history   Since 11.2014  Lazada Tech hub ( http://www.lazada.com ,  http://techhub.lazada.com/  )  Senio r Android Developer     Participated in developing android app product.   Since 06.2014  VNG  (http://www.vng.com.vn/ )  Supervisor   Participated in Mobile publishing department,  software development part.   Since 06.2011  Forix  (http://www.forixusa.com  )  Technical Android leader     Participated in developing mobile application,  especially on Android platform.   09.2010  – 05.2011  Codix ( http://www.codix.eu )  Senior Software developer   Worked in Vietnamese office of Codix.   Participated in developing iMX,  a unique software  solution providin

In [31]:
# Run the trained pipeline on the extracted resume text.
doc = my_ner_model(text) # input sample text

# Render the recognised entities inline in the notebook.
spacy.displacy.render(doc, style="ent", jupyter=True) # display in Jupyter

In [32]:
# Show the predicted entities as dicts holding start/end offsets and the label.
doc.to_json()['ents']

[{'start': 62, 'end': 79, 'label': 'Name'},
 {'start': 301, 'end': 318, 'label': 'Email'}]