In [None]:
# Named Entity Recognition is the process of locating and classifying named entities 
# in text into pre-defined categories (persons, places, and so on).
# Part-of-speech (POS) tagging labels each token in a text with its
# computationally identified part of speech.
# The next cell runs NER with spaCy; instructions on POS tagging are further below.

In [None]:
import spacy

# Read the whole input text into memory.
# The encoding is stated explicitly because the text is French: relying on the
# platform default can mis-decode accented characters.
# NOTE(review): assumes tokenized.txt is stored as UTF-8 -- confirm.
with open('tokenized.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

# Load the French spaCy model and run it over the text.
nlp = spacy.load("fr_core_news_md")
doc = nlp(text_data)

# Print every named entity found: its text, start/end character offsets
# within text_data, and its entity label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
# We can perform part of speech tagging using the spacy_lefff library.
# From the library's GitHub repository:
# "This package allows to bring Lefff lemmatization and part-of-speech tagging to a spaCy custom pipeline.
# When POS tagging and lemmatization are combined inside a pipeline, it improves your text preprocessing for
# French compared to the built-in spaCy French processing."

In [None]:
import spacy
from spacy_lefff import LefffLemmatizer, POSTagger
import csv

# Load the French spaCy model and extend its pipeline with the spacy_lefff
# POS tagger and lemmatizer. The tagger is inserted after the parser and the
# lemmatizer after the tagger (after_melt=True is passed to the lemmatizer).
# NOTE(review): passing component objects to add_pipe is the spaCy v2 calling
# style -- confirm the installed spaCy version supports it.
nlp = spacy.load("fr_core_news_md")
pos = POSTagger()
french_lemmatizer = LefffLemmatizer(after_melt=True, default=True)
nlp.add_pipe(pos, name='pos', after='parser')
nlp.add_pipe(french_lemmatizer, name='lefff', after='pos')

# Open our file.
# The encoding is stated explicitly so accented French characters are decoded
# the same way on every platform.
# NOTE(review): assumes tokenized.txt is stored as UTF-8 -- confirm.
with open('tokenized.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

# Specify the information we want: for every token, print its text, spaCy POS,
# MElt tag, Lefff lemma, spaCy fine-grained tag and spaCy lemma.
doc = nlp(text_data)
for d in doc:
    print(d.text, d.pos_, d._.melt_tagger, d._.lefff_lemma, d.tag_, d.lemma_)

# Output our tagged data into a CSV.
# newline='' is required by the csv module so that no blank rows are inserted
# between records on platforms that translate line endings; the explicit
# encoding keeps accented characters intact regardless of the platform default.
with open('text_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['text', 'pos', 'melt', 'lefff_lemma', 'tag', 'lemma']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()

    for d in doc:
        # Tokens tagged as SPACE are left out of the CSV (they are still
        # included in the printed listing above).
        if d.pos_ != "SPACE":
            writer.writerow({
                'text': d.text,
                'pos': d.pos_,
                'melt': d._.melt_tagger,
                'lefff_lemma': d._.lefff_lemma,
                'tag': d.tag_,
                'lemma': d.lemma_,
            })