<a href="https://colab.research.google.com/github/khalidhegazy/ITI_ALEX_NLP/blob/main/Spacy_ner_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!python -m spacy init config config.cfg --lang en --pipeline ner

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [16]:
import pandas as pd
df = pd.read_csv('ner.csv')
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_df, dev_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.to_csv("train.csv", index=False)
dev_df.to_csv("dev.csv", index=False)


In [21]:
import pandas as pd
import ast
import spacy
from spacy.tokens import DocBin

def convert_to_spacy(input_csv, output_spacy):
    df = pd.read_csv(input_csv)
    df['Tag'] = df['Tag'].apply(ast.literal_eval)

    nlp = spacy.blank("en")  # use built-in tokenizer
    doc_bin = DocBin()

    for _, row in df.iterrows():
        text = row['Sentence']
        tags = row['Tag']
        doc = nlp(text)

        # Skip if token count doesn't match tag count
        if len(doc) != len(tags):
            print(f"Skipping: token/tag mismatch — {text}")
            continue

        ents = []
        start = 0
        for token, tag in zip(doc, tags):
            if tag.startswith("B-"):
                start = token.idx
                end = token.idx + len(token)
                label = tag[2:]
                ents.append((start, end, label))
            elif tag.startswith("I-") and ents:
                ents[-1] = (ents[-1][0], token.idx + len(token), ents[-1][2])

        # Create doc.ents from spans
        doc.ents = [doc.char_span(start, end, label) for start, end, label in ents if doc.char_span(start, end, label)]
        doc_bin.add(doc)

    doc_bin.to_disk(output_spacy)
    print(f"Saved to {output_spacy}")

# Convert both train and dev
convert_to_spacy("train.csv", "train.spacy")
convert_to_spacy("dev.csv", "dev.spacy")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Skipping: token/tag mismatch — Detlev Mehlis says he believes the four - a top aide to pro-Syrian President Emile Lahoud and three former generals - were involved in planning the assassination .
Skipping: token/tag mismatch — Meanwhile , in Baghdad , the leader , Abdel Aziz al-Hakim of Iraq 's largest Shi'ite political party - Supreme Council of the Islamic Revolution in Iraq endorsed the draft constitution and urged Shi'ites to vote " yes " in next month 's national referendum .
Skipping: token/tag mismatch — Russian President Vladimir Putin has ordered his administration to study foreign and domestic criticism of a bill to strictly regulate non-governmental organizations .
Skipping: token/tag mismatch — Arabic language al-Jazeera television has broadcast what it says is a new videotape of kidnapped U.S. journalist Jill Carroll pleading for freedom for female Iraqi prisoners .
Skipping: token/tag mismatch — Afghan presid

In [22]:
from spacy.tokens import DocBin

doc_bin = DocBin().from_disk("./train.spacy")

nlp = spacy.blank("en")
docs = list(doc_bin.get_docs(nlp.vocab))[:3]

for i, doc in enumerate(docs):
    print(f"\nDocument {i+1}:")
    print("Text:", doc.text)
    print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])


Document 1:
Text: The Swiss star was upset Wednesday by German Tommy Haas in the opening match of the Kooyong Classic in Melbourne .
Entities: [('Swiss', 'gpe'), ('Wednesday', 'tim'), ('German', 'gpe'), ('Tommy Haas', 'per'), ('Kooyong Classic', 'eve'), ('Melbourne', 'geo')]

Document 2:
Text: Israeli soldiers have killed two Palestinians in the southern Gaza Strip .
Entities: [('Israeli', 'gpe'), ('Palestinians', 'gpe'), ('Gaza Strip', 'geo')]

Document 3:
Text: A strong earthquake has struck off the coast of Indonesia , but no tsunami alert was issued .
Entities: [('Indonesia', 'geo')]


In [23]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     48.97    0.16    0.33    0.10    0.00
  0     200        309.09   2968.14   61.26   68.19   55.61    0.61
  0     400        182.92   2016.10   70.42   74.11   67.07    0.70
  0     600        196.06   1822.70   73.12   74.96   71.36    0.73
  0     800        264.98   2137.90   76.59   77.05   76.13    0.77
  0    1000        326.91   2409.67   78.29   81.78   75.08    0.78
  0    1200        364.06   2748.78   78.98   79.55   78.42    0.79
  0    1400        460.94   3276.70   79.76   82.50   77.19    0.80
  0    1600        534.15   3921.73   82.16   83.20   81.16    0.82
  0    1800        842.00   4339.26   82.66   