In [None]:
# Named Entity Recognition (NER) is the process of locating named entities in text
# and classifying them into pre-defined categories (persons, places, and so on).
# Part-of-speech (POS) tagging labels each word in a text with its part of speech.
# Here's how we perform NER and POS tagging using NLTK...

In [None]:
import nltk
from nltk.tokenize import PunktSentenceTokenizer

# Load the text used to train the Punkt sentence tokenizer.
# Newlines are replaced with a space (not an empty string) so the last word
# of one line is not glued to the first word of the next line.
# NOTE(review): assumes both text files are UTF-8 encoded -- confirm. The
# explicit encoding keeps non-ASCII characters from depending on the
# platform's default codec.
with open('training_text.txt', 'r', encoding='utf-8') as file:
    train_text = file.read().replace('\n', ' ')

# Load the text we want to perform NER and POS tagging on.
with open('tokenized.txt', 'r', encoding='utf-8') as file:
    sample_text = file.read().replace('\n', ' ')

# Train the sentence tokenizer: passing text to the constructor trains it.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# Split the sample text into a list of sentences; process_content() reads
# this list through the module-level name `tokenized`.
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content(sentences=None, start=5, binary=True):
    """POS-tag and named-entity-chunk sentences with NLTK, drawing each tree.

    Args:
        sentences: Sequence of sentence strings to process. When None, the
            module-level ``tokenized`` list is used, so the original
            zero-argument call keeps working.
        start: Index of the first sentence to process. Defaults to 5, the
            offset that used to be hard-coded in the slice.
        binary: Forwarded to ``nltk.ne_chunk``. Defaults to True, the value
            that used to be hard-coded.

    Returns:
        None. Each chunk tree is shown with ``draw()``. If any step raises,
        the failure is printed and processing stops at that sentence.
    """
    if sentences is None:
        sentences = tokenized

    # NOTE(review): word_tokenize / pos_tag / ne_chunk are called with their
    # default settings; this notebook also feeds tokenized.txt to French
    # spaCy models, so confirm the NLTK defaults suit the text's language.
    # Slicing the range (rather than the list) keeps the reported index equal
    # to the sentence's real position, for any value of `start`.
    for index in range(len(sentences))[start:]:
        try:
            words = nltk.word_tokenize(sentences[index])
            tagged = nltk.pos_tag(words)
            named_entities = nltk.ne_chunk(tagged, binary=binary)
            named_entities.draw()
        except Exception as e:
            # Best-effort, as before: report and stop rather than raise.
            # The exception type and sentence index are included because
            # str(e) alone is often empty or ambiguous.
            print("Sentence {}: {}: {}".format(index, type(e).__name__, e))
            break


# Run the NLTK tagging / chunking pipeline; with no arguments the function
# works on the module-level `tokenized` sentence list built above.
process_content()

In [None]:
# ...and with spaCy. This workflow (for NER only) is a bit simpler. Instructions on POS tagging with spaCy are below.

In [None]:
import spacy


# Load spaCy's medium French pipeline.
nlp = spacy.load("fr_core_news_md")

# The sample sentence is French so that it matches the language of the model;
# the previous English sentence was being analysed by a French pipeline.
doc = nlp("Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars")

# Print each detected entity with its character offsets and its label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
# We can perform part-of-speech tagging using the spacy_lefff library.
# From the library's GitHub repository:
# "This package allows to bring Lefff lemmatization and part-of-speech tagging to a spaCy custom pipeline.
# When POS tagging and lemmatization are combined inside a pipeline, it improves your text preprocessing for
# French compared to the built-in spaCy French processing."

In [None]:
import spacy
from spacy_lefff import LefffLemmatizer, POSTagger
import csv

# Load the French model by its package name -- the same one the NER cell
# uses -- instead of the bare 'fr' shortcut, which only resolves when a
# shortcut link has been created manually.
nlp = spacy.load('fr_core_news_md')

# Append the Lefff POS tagger and lemmatizer after the parser.
# NOTE(review): passing component instances to add_pipe is the spaCy v2 API;
# spaCy v3 expects registered factory names instead -- confirm which spaCy
# version this notebook targets.
pos = POSTagger()
french_lemmatizer = LefffLemmatizer(after_melt=True, default=True)
nlp.add_pipe(pos, name='pos', after='parser')
nlp.add_pipe(french_lemmatizer, name='lefff', after='pos')

# Read the text to tag. Newlines become spaces so that words on adjacent
# lines are not fused into a single token.
# NOTE(review): assumes tokenized.txt is UTF-8 encoded -- confirm.
with open('tokenized.txt', 'r', encoding='utf-8') as file:
    text_data = file.read().replace('\n', ' ')

doc = nlp(text_data)

# Collect the per-token attributes once; the same rows feed both the
# console listing and the CSV export.
fieldnames = ['text', 'pos', 'melt', 'lefff_lemma', 'tag', 'lemma']
rows = [
    {
        'text': token.text,
        'pos': token.pos_,
        'melt': token._.melt_tagger,
        'lefff_lemma': token._.lefff_lemma,
        'tag': token.tag_,
        'lemma': token.lemma_,
    }
    for token in doc
]

# Print one line per token, columns in `fieldnames` order.
for row in rows:
    print(*(row[name] for name in fieldnames))

# Write the tagged data to a CSV file. newline='' is what the csv module
# requires to avoid blank rows on Windows; the explicit encoding keeps
# non-ASCII characters from depending on the platform default.
with open('text_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)