In [34]:
import os
import pandas as pd
import logging
import numpy as np
import pickle
import json
from tqdm import tqdm
from unidecode import unidecode

In [35]:
###### Enable if debugging, to reflect changes in .py files without restarting kernel #####
# %reload_ext autoreload
# %autoreload 2
from data.docxparser import getCCD, getAnnotations
from data.ccdParser import parseDocument, parseParagraphs
from data.data_processing import clean_text
from data.dateparser import datetime_parsing
from utils.data_utils import write_json_dump,write_dict,write,load_config
from data.vectorise_docs import vectorize_docs

from extract_5w1h.extract_5w1h import run_5w1h_extract
from extract_5w1h.vectorise_5w1h import vectorise_5w1h

2023-02-28 20:33:22.911335: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-28 20:33:23.493839: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-02-28 20:33:23.493913: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64


In [135]:
# Optional overrides for the default configuration; the dict is handed to
# load_config through its `overrides` argument.
# Example: to set the source/target directories
# config_override = dict(
#     source_dir="/nfs/jup/sensitivity_classifier/threading/e2e_2023/92Folder",
#     target_dir="/nfs/jup/sensitivity_classifier/threading/e2e_data/",
# )
config_override = dict()

# Resolve the configuration used by every later cell.
config = load_config(overrides=config_override)

In [136]:
# Display the resolved configuration namespace so the paths can be checked before running the pipeline.
config

namespace(source_dir='/app/92Folder',
          target_dir='/app/DocumentSets',
          nltk_dir='/app/nltk_data',
          transformer_dir='/app/transformer_models',
          emb_dir='/app/DocumentSets/embeddings',
          data5w1h_dir='/app/DocumentSets/5w1h',
          threads_dir='/app/DocumentSets/threads',
          seqint_dir='/app/DocumentSets/seqint',
          hint_dir='/app/DocumentSets/hint',
          dpp_dir='/app/DocumentSets/dpp',
          thread_sample_dir='/app/DocumentSets/sampling',
          passage_dict_file='/app/DocumentSets/passages.jsonl.gz',
          passage_docids_file='/app/DocumentSets/passage_doc_ids.txt.gz',
          passage_parts_file='/app/DocumentSets/passages_parts.txt',
          date_dict_file='/app/DocumentSets/date_date.txt.gz',
          data5w1h_dict_file='/app/DocumentSets/5w1h/data_5w1h.json.gz',
          entity_dict_file='/app/DocumentSets/5w1h/who_where_entities.json.gz',
          date_features_file='/app/DocumentSets/5w1h/dt_fea

## Parse Documents

In [127]:
class DataProcessor:
    """Turn a folder of ``#wc.docx`` documents into a passage-level DataFrame.

    The pipeline has three stages:

    1. ``read_docs``     - convert every ``#wc.docx`` file into a CCD (.xml) and a JSON file.
    2. ``get_passages``  - parse the converted files into paragraphs, filter them by word
       count and collect them in a DataFrame.
    3. ``process``       - run both stages, write the passage/date/parts files named in the
       configuration and return the DataFrame.
    """

    def __init__(self,config=None,default_date=None,auto_extract_dates=False) -> None:
        """
        Args:
            config: configuration namespace; ``load_config()`` is called when omitted.
            default_date: creation date used for documents without a date field
                (defaults to 2000-01-01 00:00:00).
            auto_extract_dates: when True, try to extract the creation date from the
                document text before falling back to ``default_date``.
        """
        if config is None:
            config = load_config()

        self.config = config
        self.log = logging.getLogger(__name__)
        # Can be used in case there is no date field
        self.default_date = pd.to_datetime('2000-01-01 00:00:00') if default_date is None else default_date
        self.auto_extract_dates = auto_extract_dates

    #READ DOCS
    def read_docs(self,pathCCD,pathJSON):
        """Convert every ``#wc.docx`` file below ``config.source_dir`` to CCD and JSON.

        Args:
            pathCCD: existing directory that receives one ``<stem>.xml`` per document.
            pathJSON: existing directory that receives one ``<stem>.json`` per document.

        Returns:
            ``(successful_files, failed_files)``: two lists of source file paths.
        """
        config = self.config

        # Collect all #wc.docx files in the source folder and its subfolders.
        # Meta files ("...#wcmeta.docx") do not end with "#wc.docx", so they are skipped.
        files_to_process = []
        for subdir, _dirs, files in os.walk(config.source_dir):
            for file in files:
                filepath = os.path.join(subdir, file)
                if filepath.endswith("#wc.docx"):
                    files_to_process.append(filepath)

        failed_files = []
        successful_files = []

        self.log.warning(f"Reading {len(files_to_process)} files")
        for filepath in tqdm(files_to_process):
            stem = os.path.splitext(os.path.basename(filepath))[0]

            ccd = getCCD(filepath)
            jsn = getAnnotations(filepath)

            # TO DO: make ccdParser return 2 for json or ccd if either of those parsing processes fail for a doc

            # 2 is the failure sentinel. The check must look at the parser results
            # (ccd, jsn); comparing against the `json` module could never match.
            if 2 in (ccd, jsn):
                failed_files.append(filepath)
                continue

            # Parsing succeeded: save ccd and json side by side under the same stem.
            with open(os.path.join(pathCCD, stem + ".xml"), 'w') as f:
                f.write(ccd)
            with open(os.path.join(pathJSON, stem + ".json"), 'w') as f:
                f.write(json.dumps(jsn))
            successful_files.append(filepath)

        return successful_files, failed_files

    def get_passages(self,successful_files,min_len=10,max_len=500):
        """Parse the converted documents into paragraphs and filter them by length.

        Args:
            successful_files: source file paths as returned by ``read_docs``.
            min_len: minimum number of whitespace-separated words (inclusive).
            max_len: maximum number of whitespace-separated words (inclusive).

        Returns:
            DataFrame with columns ``doc_id``, ``text``, ``label``, ``created``, sorted by
            ``created`` then ``doc_id``. ``doc_id`` is ``<file stem>_<paragraph index>``,
            where the index counts all paragraphs of the document, filtered or not.
        """
        config = self.config
        paraTitles = []
        paraCollection = []
        paraLabels = []
        paraCreated = []

        totPassages = 0

        self.log.warning(f"Parsing passages from {len(successful_files)} files")
        for filepath in tqdm(successful_files):
            # File name without directory and extension; same derivation as in read_docs,
            # so it works regardless of the platform's path separator.
            fileN = os.path.splitext(os.path.basename(filepath))[0]

            ####### CCD Document Parser #########
            # call paragraph parser which opens ccd and json on its own
            temp_paragarphs, temp_labelsP, doc_created = parseParagraphs(config.target_dir, fileN)
            ####################################

            # Assigning a dummy date or extract date from text in case the documents do not have create-datetime field
            if doc_created is None:
                candidates = []
                if self.auto_extract_dates:
                    for p in temp_paragarphs:
                        candidates, _ = datetime_parsing(p, self.default_date)
                        if len(candidates):  # Break after the first paragraph with dates
                            break
                # Select the first date object, else fall back to the default date
                doc_created = candidates[0] if len(candidates) else self.default_date

            # Assigning dummy labels in case the documents do not have sensitivity ground-truth
            if temp_labelsP is None:
                temp_labelsP = np.ones(len(temp_paragarphs), dtype=int).tolist()

            totPassages += len(temp_paragarphs)

            # Filter passages based on number of words; para_id keeps the paragraph's
            # position in the unfiltered document.
            kept_before = len(paraCollection)
            for para_id, (p, l) in enumerate(zip(temp_paragarphs, temp_labelsP)):
                if min_len <= len(p.split()) <= max_len:
                    paraTitles.append(fileN + "_" + str(para_id))
                    paraCollection.append(clean_text(p))
                    paraLabels.append(l)

            # Every kept paragraph carries the creation date of its document.
            paraCreated.extend([doc_created] * (len(paraCollection) - kept_before))

        self.log.warning(f"Total Passages: {totPassages}. Filtered Passages (length between [{min_len},{max_len}]): {len(paraCollection)}")

        # dataframe has [doc_id, text, label, created]
        paragraphDF = pd.DataFrame({'doc_id': paraTitles, 'text': paraCollection, 'label': paraLabels, 'created': paraCreated}).sort_values(["created", "doc_id"])

        return paragraphDF

    def _write_passage_files(self, paragraphDF):
        """Write the doc-id, passage and date files; return the frame with ``created`` as datetimes."""
        config = self.config

        # Dates are written as strings; None values are passed through untouched.
        as_text = paragraphDF.assign(
            created=paragraphDF["created"].apply(lambda x: str(x) if x is not None else x)
        )
        write(config.passage_docids_file, as_text["doc_id"].values, mode="wt", compress=True)
        write_json_dump(config.passage_dict_file, as_text.to_dict(orient="records"))
        write_dict(config.date_dict_file, as_text.set_index("doc_id")["created"].to_dict(), mode="wt", compress=True)

        return paragraphDF.assign(created=pd.to_datetime(as_text["created"]))

    def _mock_passages(self, paragraphDF):
        """Keep the first three real passages and append synthetic ones built from ``aug.p``."""
        config = self.config

        self.log.warning("\n::::MOCKING DATA::::\n")
        # SECURITY NOTE: pickle can execute arbitrary code while loading; only use an
        # aug.p file from a trusted source.
        with open(os.path.join(config.source_dir, "aug.p"), "rb") as f:
            mock_passages = pickle.load(f)

        paragraphDF = paragraphDF[:3]
        # Continue the index after the kept rows (start at 0 when nothing was kept).
        st_ind = np.max(paragraphDF.index.values) + 1 if len(paragraphDF) else 0

        doc_id = 0
        df_data = []
        for i in range(4):
            st = i * 300
            # Each mock document consists of three passages sharing one random date.
            for passages in zip(mock_passages[st:st+100], mock_passages[st+100:st+200], mock_passages[st+200:st+300]):
                # NOTE(review): randint's upper bound is exclusive, so month 12 and day 28 are never drawn.
                y, m, d = np.random.randint(1995, 2000), np.random.randint(1, 12), np.random.randint(1, 28)
                for k, passage in enumerate(passages):
                    df_data.append([f"doc{doc_id}#wc_{k}", passage, np.random.choice([0,1],1)[0], "{}-{:02}-{:02}".format(y, m, d)])
                doc_id += 1

        temp = pd.DataFrame(df_data, columns=paragraphDF.columns, index=range(st_ind, st_ind + len(df_data)))
        temp["created"] = pd.to_datetime(temp["created"])
        return pd.concat([paragraphDF, temp])

    def process(self,min_len=10, max_len=500,collection_split_size=40000,mockup=False):
        """Run the full pipeline and write the passage, date and parts files.

        Args:
            min_len: minimum passage length in words (inclusive).
            max_len: maximum passage length in words (inclusive).
            collection_split_size: maximum number of passages per collection part.
            mockup: when True, replace most of the data with mock passages loaded from
                ``aug.p`` in ``config.source_dir`` and write the files again.

        Returns:
            The passage DataFrame (columns ``doc_id``, ``text``, ``label``, ``created``).

        Raises:
            ValueError: if ``collection_split_size`` is not positive.
        """
        if collection_split_size <= 0:
            # A non-positive split size would never advance through the collection.
            raise ValueError(f"collection_split_size must be positive, got {collection_split_size}")

        config = self.config

        #######CCD#########
        pathCCD = os.path.join(config.target_dir, "ccd")
        pathJSON = os.path.join(config.target_dir, "json")

        # makedirs also creates a missing target_dir and tolerates existing folders.
        os.makedirs(pathCCD, exist_ok=True)
        os.makedirs(pathJSON, exist_ok=True)

        successful_files, failed_files = self.read_docs(pathCCD, pathJSON)
        #######CCD#########

        paragraphDF = self.get_passages(successful_files, min_len=min_len, max_len=max_len)
        paragraphDF = self._write_passage_files(paragraphDF)

        if mockup:
            paragraphDF = self._mock_passages(paragraphDF)
            paragraphDF = self._write_passage_files(paragraphDF)

        # Identifying Collection Splits, i.e., Parts: part index -> (start, end) row
        # positions, end exclusive and clipped to the number of passages.
        total = paragraphDF.shape[0]
        parts = {
            pidx: (st, min(st + collection_split_size, total))
            for pidx, st in enumerate(range(0, total, collection_split_size))
        }

        write_dict(config.passage_parts_file, parts)
        self.log.warning(f"Identified {len(parts)} parts (i.e., splits) of the collection.")

        return paragraphDF

### Read Documents

In [134]:
# Additional DataProcessor parameters:
#   default_date = pd.to_datetime('2000-01-01')  # used as document create date in case there is no create_date attribute
#   auto_extract_dates = True                    # extract the date from the document text in case there is no create_date attribute
data_processor = DataProcessor(config=config)

In [132]:
# If there are more passages than this number, the threading process is split into batches.
collection_split_size = 40_000

paragraphDF = data_processor.process(
    collection_split_size=collection_split_size,
)

print(f"Total Passage: {paragraphDF.shape[0]}")
paragraphDF.head()

Reading 10 files
100%|██████████| 10/10 [00:00<00:00, 27.30it/s]
Parsing passages from 10 files
100%|██████████| 10/10 [00:00<00:00, 466.22it/s]
Total Passages: 70. Filtered Passages (length between [10,500]): 30
Identified 1 parts (i.e., splits) of the collection.


Total Passage: 30


Unnamed: 0,doc_id,text,label,created
0,doca#wc_3,1. This is to confirm (belatedly) that a new ...,1,1995-02-06
1,doca#wc_4,2. You should also be aware that the distribu...,1,1995-02-06
2,doca#wc_5,3. I would be grateful if you could pass a co...,1,1995-02-06
3,docb#wc_3,1. This is to confirm (belatedly) that a new ...,1,1995-02-06
4,docb#wc_4,2. You should also be aware that the distribu...,1,1995-02-06


### Vectorise Documents

In [15]:
# Vectorise the passages: TF-IDF and MiniLM are switched on, RoBERTa is switched off.
vectorize_docs(
    config,
    vect_tfidf=True,
    vect_minilm=True,
    vect_roberta=False,
    sbert_batch=10000,
    use_gpu=True,
)

TFIDF Vectorise
	TFIDF Vectors Saved at: /nfs/jup/sensitivity_classifier/threading/e2e_data/embeddings/psg_tfidf_emb.npz
	TFIDF-LSA Vectors Saved at: /nfs/jup/sensitivity_classifier/threading/e2e_data/embeddings/psg_tfidflsa_emb.npz
	TFIDF Features Saved at: /nfs/jup/sensitivity_classifier/threading/e2e_data/embeddings/psg_tfidf_features.txt.gz
MiniLM Vectorise
	Running on GPU
100%|██████████| 1203/1203 [00:01<00:00, 644.72it/s]
	MiniLM Vectors Saved at: /nfs/jup/sensitivity_classifier/threading/e2e_data/embeddings/psg_minilm_emb.npz


## 5W1H Extraction

In [8]:
# Empty tqdm's registry of progress-bar instances before the extraction cell is run.
# NOTE(review): `_instances` is a private tqdm attribute; presumably this clears stale bars
# left behind by an interrupted run. Confirm it is still needed with the installed tqdm version.
from tqdm import tqdm
tqdm._instances.clear()

In [7]:
# Extract 5w1h
# The cell can resume from the last completed batch. Note that force=True (as passed
# below) deletes all cached results, so set force=False to actually resume.
#
# Additional parameters:
#   force=True                 -> delete all cached results and perform extraction from all files
#   skip_where=True            -> skip evaluation of location (where) from geopy
#   skip_errors=False          -> report any error and kill the process
#   show_errors=True           -> report full errors
#   threaded_extraction=False  -> extract 5w1h sequentially for each document

nlp_model = "en_core_web_sm"  # Spacy Model, or use "coreNLP"
n_processes = 4               # Number of CPU processes (adjusted to the maximum available CPUs if n_processes>cpu_count())
use_gpu = True                # Only for spaCy pipeline, ignored if torch.cuda.is_available() returns False

run_5w1h_extract(
    config,
    n_processes=n_processes,
    nlp_model=nlp_model,
    use_gpu=use_gpu,
    skip_where=True,
    force=True,
)

Using GPU for Constituency parsing.
Running spaCy 4-way parallel (Total CPUs: 32)
  0%|          | 0/1203 [00:00<?, ?it/s]Could not find corpus for WordNet, will now try to download the corpus.
[nltk_data] Downloading package wordnet to /app/nltk_data/...
[nltk_data]   Package wordnet is already up-to-date!
Batch: 1/2:   0%|          | 0/1203 [00:04<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Batch 1/2 . Errors: 0:  13%|█▎        | 157/1203 [00:14<00:47, 22.22it/s]Traceback (most recent call last):
  File "/nfs/jup/sensitivity_classifier/threading/e2e_2023/sourceCode/extract_5w1h/extract_5w1h.py", line 131, in annotate_extract_spacy
    doc = extractor.parse(doc)
  File "/nfs/jup/sensitivity_classifier/threading/e2e_2023/sourceCode/extract_5w1h/extractor.py", line 114, in parse
    raise e
  File "/n

### Vectorise 5W1H Pseudo Passages

In [5]:
# Vectorise the 5W1H pseudo passages: TF-IDF (LSA dimension 200) and MiniLM on, RoBERTa off.
vectorise_5w1h(
    config,
    lsa_dim=200,
    vect_tfidf=True,
    vect_minilm=True,
    vect_roberta=False,
    sbert_batch=10000,
    use_gpu=True,
)

Creating data dictionaries
100%|██████████| 4/4 [00:00<00:00, 55738.26it/s]
TFIDF Vectorise
100%|██████████| 1200/1200 [00:01<00:00, 812.83it/s]
	TFIDF Vectors Saved at: /nfs/jup/sensitivity_classifier/threading/e2e_data/embeddings/tfidf_5w1h_emb.npz
	TFIDF-LSA Vectors Saved at: /nfs/jup/sensitivity_classifier/threading/e2e_data/embeddings/tfidflsa_5w1h_emb.npz
	TFIDF Features Saved at: /nfs/jup/sensitivity_classifier/threading/e2e_data/embeddings/tfidf_5w1h_features.txt.gz
	Tokenised Collection Saved at: /nfs/jup/sensitivity_classifier/threading/e2e_data/5w1h/tk_5w1h.json.gz
MiniLM Vectorise
	Running on GPU
100%|██████████| 1200/1200 [00:01<00:00, 630.20it/s]
	MiniLM Vectors Saved at: /nfs/jup/sensitivity_classifier/threading/e2e_data/embeddings/minilm_5w1h_emb.npz


## Date Extraction Example

In [122]:
# Sample passage mixing absolute dates, bare years, plain numbers and relative
# date expressions; used as input for the date extraction example.
text = (
    "5-Jan-2012 17:00 \n2011 Haiti Earthquake Anniversary. As of 2010 (see 1500 photos here), the following major earthquakes "
    "have been recorded in Haiti. The first great earthquake mentioned in histories of Haiti occurred in "
    "1564 in what was still the Spanish colony. It destroyed Concepción de la Vega. On January 12, 2010, "
    "a massive earthquake struck the nation of Haiti, causing catastrophic damage inside and around the "
    "capital city of Port-au-Prince. On the first anniversary of the earthquake, 12 January 2011, "
    "Haitian Prime Minister Jean-Max Bellerive said the death toll from the quake in 2010 was more "
    "than 316,000, raising the figures in 2010 from previous estimates. I immediately flashed back to the afternoon "
    "of 11th Feb, 1975 when, on my car radio, I first heard the news. On Sunday morning of the following week..."
)

In [123]:
from datetime import datetime

# The current time is passed to datetime_parsing as the reference date, so the
# output of this cell changes from run to run.
base_date = datetime.now()
print(f"Reference date: {base_date}")
# datetime_parsing returns two sequences that are iterated in parallel below:
# the parsed datetime objects and the text spans they were extracted from.
datetime_objects,text_spans = datetime_parsing(text,base_date)
for d,s in zip(datetime_objects,text_spans):
    print(f"\tDatetime: \"{d}\" extracted from text span: \"{s}\"")

Reference date: 2023-02-28 20:57:42.948026
	Datetime: "2012-01-05 17:00:00" extracted from text span: "5-Jan-2012 17:00"
	Datetime: "2011-01-12 00:00:00" extracted from text span: "12 January 2011"
	Datetime: "1975-02-11 00:00:00" extracted from text span: "11th Feb, 1975"
	Datetime: "2010-01-12 00:00:00" extracted from text span: "January 12, 2010"
	Datetime: "2023-03-07 00:00:00" extracted from text span: "following week"
	Datetime: "2023-03-05 00:00:00" extracted from text span: "Sunday"
