In [1]:
#Step 1
#Import libraries
from io import StringIO
import os
from os.path import isfile, join
import fitz
import pandas as pd
import re
import spacy
from spacy import displacy
from dframcy import DframCy
from datetime import datetime

In [2]:
#Step 2
#Configuration: where the source pdfs live, where csv output is written,
#and the column layout of the raw-text table built in Step 3.
pdf_dir = '/your/pdf/directory/'
csv_dir = '/your/csv/directory/'
csv_name = 'repertoire.csv'

columns = [
    "TITLE",
    "DATE",
    "TEXT",
]

In [None]:
#Step 3
#Migrate PDF text to CSV

#Regular expressions that mark where the wanted text starts and where it ends.
#These are placeholders: finding the best expressions for a corpus takes refining.
start_pattern = r"place|regular|expressions|here"
end_pattern = r"place|regular|expressions|here"

#Iterate through the pdf directory, skipping hidden files and sub-directories
corpus = (f for f in os.listdir(pdf_dir) if not f.startswith('.') and isfile(join(pdf_dir, f)))
#Collect one [title, date, text] record per pdf. The dataframe is built once at
#the end: DataFrame.append in a loop is quadratic and was removed in pandas 2.0.
records = []
#Iterate through each pdf
for filename in corpus:
    #title is the filename minus the .pdf ending
    title = filename[:-4]
    #In my corpus, the date is embedded in the filename: it is the 10 characters
    #right before the extension, with underscores where the dashes should be.
    start = len(filename) - 14
    date = filename[start:-4].replace('_','-')
    #scrape OCR layer from pdf, page by page
    text_file = fitz.open(join(pdf_dir,filename))
    try:
        pages = []
        for page in text_file:
            #PyMuPDF renamed getText to get_text; use whichever this version offers
            extract_text = getattr(page, "get_text", None) or page.getText
            pages.append(extract_text("text"))
    finally:
        #release the file handle even when a page fails to parse
        text_file.close()
    #the text of all pages is saved in the doc object
    doc = "".join(pages)
    #Search for the first word or phrase that identifies the portion of text to extract
    first = re.search(start_pattern, doc)
    if first is None:
        #No start marker found: store the "None" sentinel so that Step 4 can
        #list this file as possibly missing an advertisement.
        line = "None"
    else:
        #Shorten the doc object to the text on and after the start marker
        doc_end = doc[first.start():]
        #Search for the word or phrase that identifies the end of the text to extract
        last = re.search(end_pattern, doc_end)
        if last is None:
            #No end marker: keep everything after the start marker
            line = doc_end
        else:
            line = doc_end[:last.start()]
        #Flatten the extracted text onto a single line
        line = line.replace('\n',' ')
    #Store the title, date and text for this file
    records.append([title, date, line])
#Build the dataframe from all records in one go
df = pd.DataFrame(records, columns=columns)
#I sort dataframe by date.
df = df.sort_values(by=['DATE'])
#I save dataframe as a csv file.
df.to_csv(join(csv_dir, csv_name))

In [136]:
#Step 4
#Save the dates that came back without any extracted text
#With this csv file, I can check that I didn't miss an advertisement
csv_name = 'repertoire.csv'
df = pd.read_csv(join(csv_dir,csv_name))
#Rows without text were written with the "None" sentinel, but read_csv can hand
#them back as NaN (empty cells always, and the word "None" itself on recent
#pandas versions), so both forms are treated as missing.
missing = df[df['TEXT'].isna() | (df['TEXT'] == "None")]
csv_name = "missing_repertoire.csv"
path = "/your/path/here/"

missing.to_csv(join(path,csv_name))

In [1]:
#Step 5
#Load NLP language model with spaCy
#The loaded pipeline is kept in the global `nlp`, which Step 7 hands to DframCy.

nlp = spacy.load("fr_core_news_sm")

In [2]:
#Step 6
#Run the random date function to prepare for the creation of a training set
def random_dates(series1, fraction=0.05, seed=230):
    """Return a reproducible random sample of the values in ``series1``.

    The values are sorted first so the outcome does not depend on their
    incoming order, then shuffled with a private, seeded generator, and the
    leading ``fraction`` of the shuffled list is returned.

    Parameters
    ----------
    series1 : iterable
        Sortable values to sample from (e.g. the unique dates of a dataframe).
        The input itself is not modified.
    fraction : float, optional
        Share of the values to return; the count is ``int(fraction * n)``,
        so small inputs can yield an empty list. Defaults to 0.05.
    seed : int, optional
        Seed for the shuffle. Defaults to 230.

    Returns
    -------
    list
        The selected values, in shuffled order.
    """
    import random

    # sorted() copies the input and fixes the order before shuffling
    dates = sorted(series1)
    # A dedicated generator yields the same shuffle as seeding the module-level
    # one, without resetting the random state used by the rest of the notebook.
    rng = random.Random(seed)
    rng.shuffle(dates)

    split_1 = int(fraction * len(dates))
    return dates[:split_1]

In [None]:
#Step 7
#Reload the csv file with raw data
csv_dir = "/your/csv/directory/"
csv_name = "repertoire.csv"

df = pd.read_csv(join(csv_dir,csv_name))

# This step picks a random selection of dates and writes one tagged csv file per date.
columns = ['Word','Tag','Date']
#Empty dataframe with the tagged-data layout; the loop below does not fill it
tagged_df = pd.DataFrame(columns = columns)
#Put in the path where you want training set csv files stored
training_path = '/your/training/path/here'

#Filter out any dates that don't have an advertisement. Missing text is either
#the "None" sentinel or NaN, depending on how read_csv parsed the file.
all_dates_df = df[df['TEXT'].notna() & (df['TEXT'] != "None")]
#generate a random selection of dates
dates = random_dates(all_dates_df['DATE'].unique())
#The DframCy wrapper does not depend on the date, so it is created once
dframcy = DframCy(nlp)
#iterate through dates
for day in dates:
    #The language model expects one string, not an array of values, so the
    #text of every row for this date is joined into a single string.
    text = " ".join(all_dates_df.loc[all_dates_df['DATE'] == day, 'TEXT'].astype(str))
    #transform text into a tagged NLP object with parts of speech and entity tags
    doc = dframcy.nlp(text)
    #transform doc into a dataframe with parts of speech and entity tags
    annotation_dataframe = dframcy.to_dataframe(doc)
    #set up name of csv training file
    f_name = "training_csv_{}.csv".format(day)
    #save dataframe into a csv training file
    annotation_dataframe.to_csv(join(training_path,f_name))

In [None]:
#step 8 - Manually tag the training CSV Files. Add entity tags under the "token_tag_" column

In [None]:
#step 9 - Run the training set Python script "semi_automate_model.py" - this script cannot run inside a Jupyter Notebook

In [None]:
#step 10 - Once you have run semi_automate_model.py. Reload spaCy's nlp model with trained data
#This rebinds the global `nlp`, so return_NER (step 11) uses the model loaded here.
nlp = spacy.load('/path/to/your/model/location/here')

In [None]:
#step 11 - Load NER function
def return_NER(sentence):
    """Run the global ``nlp`` pipeline on ``sentence`` and list its entities.

    Returns a list of ``(text, label)`` tuples, one for each entry of the
    processed document's ``ents``, in the order they appear there.
    """
    # Process the phrase and keep only its recognised entities
    entities = nlp(sentence).ents
    pairs = []
    for entity in entities:
        pairs.append((entity.text, entity.label_))
    return pairs

In [None]:
#step 12 - Generate CSV with Tagged Entities from Newspaper Advertisements
csv_dir = "/path/to/your/csv/files"
#here is the csv filename with your raw data
raw_csv = "raw_data.csv"
#here is the csv filename for your tagged dataframe
final_csv = "final_data.csv"
#reload the dataframe with your raw data
df = pd.read_csv(join(csv_dir,raw_csv))
#column layout of the tagged data
columns = ['Title','Word','Tag','Date']

#Collect one [title, word, tag, date] record per entity. The dataframe is built
#once at the end: DataFrame.append in a loop is quadratic and was removed in pandas 2.0.
rows = []
#Walk the rows directly instead of looking each one up again by its date, so
#several rows sharing a date are each tagged with their own title and text.
for title, day, text in zip(df['TITLE'], df['DATE'], df['TEXT']):
    #Skip rows without advertisement text: NaN would crash the language model
    #and the "None" sentinel carries nothing to tag.
    if pd.isna(text) or text == "None":
        continue
    #Tokenize the text - This generates a series of words and tags
    for word, tag in return_NER(text):
        rows.append([title, word, tag, day])
#build the tagged dataframe from all records
tagged_df = pd.DataFrame(rows, columns = columns)
#save tagged dataframe to final csv file
tagged_df.to_csv(join(csv_dir, final_csv))