In [409]:
import spacy
import pandas as pd

In [410]:
nlp = spacy.load("en_core_web_sm")

In [411]:
def spacy_extract_entity_types(article):
    doc = nlp(article)
    article_dict={}
    for ent in doc.ents:
        text = ent.text
        label = ent.label_
        article_dict[text] = label
    return article_dict

In [412]:
def agregate_statics(article_dict):
    article_frame = pd.DataFrame(article_dict.items(),columns = ['text', 'label'])
    ents_count = article_frame.groupby(by="label", as_index=False).agg({'text':pd.Series.count})
    return ents_count,article_frame

In [413]:
path= r"C:\Users\Leon\Desktop\10 articles\1.txt"
with open(path,"r",encoding='utf8') as in_file:
        text = in_file.read()
text_ents_dict = spacy_extract_entity_types(text)       
ents_count,article_frame = agregate_statics(text_ents_dict)

In [414]:
ents_count

Unnamed: 0,label,text
0,CARDINAL,18
1,DATE,56
2,EVENT,2
3,FAC,1
4,GPE,37
5,LOC,2
6,MONEY,30
7,NORP,13
8,ORDINAL,3
9,ORG,53


Extract file to csv for taging

In [415]:
article_frame.to_csv("arts_tag.csv", encoding='utf8')

Load csv file after tagging

In [416]:
path= r"C:\Users\Leon\Desktop\articles_tag_completed.csv"
with open(path,"r") as file:
    taging_results = pd.read_csv(file, index_col=0)

In [422]:
taging_results

Unnamed: 0,text,label,correct_incorrect,"""correct classification"""
0,Black Friday,EVENT,1,
1,Americans,NORP,1,
2,Friday,DATE,1,
3,Thanksgiv,GPE,0,EVENT
4,this year,DATE,1,
...,...,...,...,...
285,8.5,MONEY,1,
286,8.99,MONEY,1,
287,9endings,DATE,1,
288,Israeli,NORP,1,


Preparing data for representation - calculating errors and precision for each label

In [418]:
ents_count_ater_taging = taging_results.groupby(by="label", as_index=False).agg({'correct_incorrect':pd.Series.sum})
all_data = pd.concat([ents_count,ents_count_ater_taging.correct_incorrect], axis=1)
# calculating the Erros (incorect labels), and then add this calculation to main table
erros = ents_count.text - ents_count_ater_taging.correct_incorrect
erros = pd.DataFrame(erros)
erros.columns = ['number_of_incorrect']
all_data = pd.concat([all_data,erros], axis=1 )
# calculating the Precision for each table, and then add this calculation to main table
pr = (all_data.correct_incorrect/all_data.text)*100
pr = pr.round(1)
pr = pd.DataFrame(pr)
pr.columns = ['precision']
pr['precision'] = pr['precision'].apply(lambda x : str(x) + '%')
all_data_results= pd.concat([all_data,pr], axis=1 )

In [419]:
all_data_results

Unnamed: 0,label,text,correct_incorrect,number_of_incorrect,precision
0,CARDINAL,18,18,0,100.0%
1,DATE,56,53,3,94.6%
2,EVENT,2,2,0,100.0%
3,FAC,1,1,0,100.0%
4,GPE,37,35,2,94.6%
5,LOC,2,2,0,100.0%
6,MONEY,30,30,0,100.0%
7,NORP,13,11,2,84.6%
8,ORDINAL,3,3,0,100.0%
9,ORG,53,40,13,75.5%


Calculating the main precision of spacy vs manual review:

In [420]:
all_data.sum()

label                  CARDINALDATEEVENTFACGPELOCMONEYNORPORDINALORGP...
text                                                                 290
correct_incorrect                                                    265
number_of_incorrect                                                   25
dtype: object

In [421]:
print(f"The precision is {round(265/290*100,2)} %")

The precision is 91.38 %
