# Preprocessors

## Imports

In [None]:
import os
import re
import logging
import unicodedata
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Global variables

## Maximum processes for multiprocessing

In [None]:
# Maximum number of worker processes handed to the multiprocessing Pool.
# The original author found 17 to be fine as long as no other notebook is
# competing for the machine at the same time.
MAX_PROCESSES = 17

## Paths

In [None]:
# Relative path of the per-language data directory.
# NOTE(review): not referenced anywhere in the visible code - presumably used
# by another cell or notebook; confirm before removing.
CLEAN_DOCS_PATH = "../../data/languages"

## Init spacy and nltk

In [None]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists and the
# 'punkt' sentence tokenizer models (presumably the models backing
# sent_tokenize - confirm against the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

# Most recently returned spaCy pipeline (kept for code that reads the global).
sp = None

# Accepted language identifiers (ISO code or full name) -> spaCy pipeline name.
_SPACY_MODEL_NAMES = {
    "fr": "fr_core_news_sm",
    "french": "fr_core_news_sm",
    "en": "en_core_web_sm",
    "english": "en_core_web_sm",
}

# Pipelines already loaded in this process, keyed by pipeline name, so that
# asking for a second language no longer returns the first one's model.
_SPACY_MODELS = {}


def get_spacy_model(language):
    """
    Return the spaCy pipeline for a language, loading it at most once per process.

    :param language: "fr"/"french" or "en"/"english".

    :return:    The loaded spaCy pipeline, or None (after printing a message)
                when the language is not supported.
    """
    global sp
    model_name = _SPACY_MODEL_NAMES.get(language)
    if model_name is None:
        print("Unknown spacy language %s" % language)
        return None
    if model_name not in _SPACY_MODELS:
        # Imported lazily: spaCy is heavy and only needed once a model is requested.
        import spacy
        _SPACY_MODELS[model_name] = spacy.load(model_name)
    sp = _SPACY_MODELS[model_name]
    return sp

## Preprocessors pipeline

### Pipeblock

FIXME: define the multiprocessing pipeline

* Generalize for all dataset a pipeline of processes.
* Easier to multiprocess.

=> Base "abstract" class

In [None]:
from abc import ABC, abstractmethod  # abstract base class


class PipeBlock(ABC):
    """
    Interface of a pipeline step.

    A step is configured once through its constructor and then called on one
    document at a time; the value it returns is handed to the next step.
    Subclasses must implement both __init__ and __call__.
    """

    @abstractmethod
    def __init__(self):
        ...

    @abstractmethod
    def __call__(self, doc):
        """Transform one document and return the result."""
        ...

### Multiprocessing the pipeline

In [None]:
from multiprocessing import Pool

def _biased_vocab_wrapper(doc_url, biased_vocab, pipeline):
    for name, transform in pipeline.items():
        if name == "vectorizer":
            break
        for tag, voc in biased_vocab.items():
            biased_vocab[tag] = transform(voc)
    return (doc_url, biased_vocab)

def _doc_wrapper(doc_url, doc, pipeline):
    for name, transform in pipeline.items():
        doc = transform(doc)
        # Remove document that are too short.
        if (name == "tokenizer" or name == "cleaner") and len(doc) < transform.min_doc_len:
            return (doc_url, None)
    return (doc_url, doc)

def run_pipeline(corpus, wrapper, pipeline=None, verbose=0):
    """
    Apply a pipeline to every document of a corpus, using a pool of processes.

    :param corpus:   Dictionary mapping a doc_key to its content.
    :param wrapper:  Function called as wrapper(doc_key, content, pipeline) that
                     adapts the pipeline blocks to the dataset shape/structure.
                     Must return a (doc_key, result) pair; a result of None
                     means the document is dropped.
    :param pipeline: The actions to perform on each document, passed untouched
                     to the wrapper. The wrappers of this module iterate it with
                     .items(), so leaving the default None makes them fail.
    :param verbose:  Currently unused; kept for backward compatibility.

    :return:    A new dictionary mapping the doc_key of every kept document to
                its transformed content.
    """
    jobs = [(doc_url, doc_sents, pipeline) for doc_url, doc_sents in corpus.items()]
    with Pool(MAX_PROCESSES) as ps:
        corpus_res = ps.starmap(wrapper, jobs)
    # Identity test on purpose: "!= None" is an equality comparison, which
    # array-like documents evaluate elementwise instead of answering yes/no.
    return {doc_url: doc for doc_url, doc in corpus_res if doc is not None}


def multip_for(func, it_dict, args):
    """
    Call a function on every item of a dictionary, using a pool of processes.

    :param func:     Function called as func(key, value, *args) for each item.
    :param it_dict:  Dictionary whose items are processed.
    :param args:     Extra positional arguments appended to every call.

    :return:    List of the values returned by func, one per item. Callers that
                need a dictionary build it from this list themselves.
    """
    jobs = [(key, value, *args) for key, value in it_dict.items()]
    with Pool(MAX_PROCESSES) as ps:
        return ps.starmap(func, jobs)

## Text normalisation

Define an interface in order to multiprocess the full pipeline.

## Tokenization

In [None]:
def generic_cleaner(sentence):
    """
    Reduce a sentence to lowercase ASCII letters and digits separated by single spaces.

    Diacritics are stripped (NFKD decomposition, non-ASCII code points dropped),
    every remaining non-alphanumeric character becomes a space, and runs of
    whitespace are collapsed.
    """
    decomposed = unicodedata.normalize('NFKD', sentence)
    ascii_only = decomposed.encode('ASCII', 'ignore').decode("utf-8")
    alnum_only = re.sub(r"[^a-zA-Z0-9]", " ", ascii_only)
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    return single_spaced.lower().strip()

def tokenizer_cleaner(doc, language='french'):
    """
    Split a document into sentences with nltk and clean each of them.

    :param doc:      The document, as a single string.
    :param language: Language name handed to nltk's sent_tokenize.

    :return:    List of cleaned sentences. Sentences left with fewer than two
                words after cleaning are discarded.
    """
    cleaned_sentences = (generic_cleaner(raw) for raw in sent_tokenize(doc, language=language))
    return [sentence for sentence in cleaned_sentences if len(sentence.split()) > 1]

def brutal_tokenizer(doc, n):
    """
    Cut a cleaned document into consecutive "sentences" of exactly n words.

    :param doc: The document, as a single string.
    :param n:   Number of words per sentence.

    :return:    List of sentences. Trailing words that do not fill a complete
                block of n words are left out.
    """
    # split() without argument returns [] for an empty cleaned document, where
    # split(" ") returned [""] and produced a bogus empty sentence when n == 1.
    toks = generic_cleaner(doc).split()
    return [" ".join(toks[i * n:(i + 1) * n]) for i in range(len(toks) // n)]

def overlap_tokenizer(doc, block_size, over_window):
    """
    Cut a cleaned document into overlapping "sentences" with a sliding window.

    :param doc:         The document, as a single string.
    :param block_size:  Number of words in each sentence.
    :param over_window: Number of words the window advances between two
                        consecutive sentences.

    :return:    List of sentences of block_size words each; empty when the
                document has fewer than block_size words.
    """
    # split() without argument returns [] for an empty cleaned document.
    toks = generic_cleaner(doc).split()
    if len(toks) < block_size:
        return []
    nb_blocks = (len(toks) - block_size) // over_window + 1
    # The slice used to end at start + block_size + 1: every window but the
    # last held one word too many, contradicting the window count above.
    return [" ".join(toks[i * over_window:i * over_window + block_size])
            for i in range(nb_blocks)]


def spacy_tokenizer(doc, language):
    """
    Run the spaCy model of the given language on a document.

    :param doc:      The document, as a single string.
    :param language: Language identifier handed to get_spacy_model.

    :return:    The object produced by the spaCy model, returned as is: no
                sentence splitting or cleaning is applied here.
    """
    nlp = get_spacy_model(language)
    return nlp(doc)

class Tokenizer(PipeBlock):
    """Pipeline step turning a raw document into tokenized sentences."""

    # nltk expects full language names; accept the ISO codes used elsewhere too.
    _NLTK_LANGUAGES = {"fr": "french", "en": "english"}

    def __init__(self, language, method, len_sen=10, over=4, min_doc_len=0):
        """
        Init a document tokenizer.

        :param language:    Language of the documents. Used by the 'nltk' and
                            'spacy' methods.

        :param method:      String. Reference a tokenize method.
                            'nltk'    -> sentence split with nltk, then cleaning.
                            'brutal'  -> consecutive blocks of len_sen words.
                            'overlap' -> sliding window of len_sen words.
                            'spacy'   -> output of the spaCy model.

        :param len_sen:     Int. Number of words in a sentence.
                            Used by the 'brutal' and 'overlap' tokenizer.

        :param over:        Int. Number of words the window advances between
                            two sentences. Used by the 'overlap' tokenizer.

        :param min_doc_len: Int. Minimum number of sentences a document must
                            contain. Not enforced here: it is read by the
                            caller running the pipeline.
        """
        self.method = method
        self.language = language
        self.len_sen = len_sen
        self.over = over
        self.min_doc_len = min_doc_len

    def __call__(self, doc):
        """
        Tokenize one document with the configured method.

        :raises ValueError: when the configured method is unknown.
        """
        if self.method == 'nltk':
            # The language used to be ignored here, so every document was split
            # with the French model. Fall back to it when no language is set.
            nltk_language = self._NLTK_LANGUAGES.get(self.language, self.language) or 'french'
            return tokenizer_cleaner(doc, language=nltk_language)
        if self.method == 'brutal':
            return brutal_tokenizer(doc, self.len_sen)
        if self.method == 'overlap':
            return overlap_tokenizer(doc, self.len_sen, self.over)
        if self.method == 'spacy':
            return spacy_tokenizer(doc, self.language)
        # Raise instead of exit(1): SystemExit inside a multiprocessing worker
        # kills the worker without reporting anything to the parent process.
        raise ValueError("Tokenizer method not accepted: %s" % self.method)

## Stop Words

In [None]:
class RemoveStopWords():
    """Pipeline step removing the stop words from every sentence of a document."""

    def __init__(self, language, method):
        """
        :param language: Language of the documents. For 'spacy', "fr"/"french"
                         selects French and anything else English; otherwise it
                         is the language name handed to nltk.
        :param method:   'spacy' to rely on spaCy, anything else to rely on nltk.
        """
        self.method = method
        if method == 'spacy':
            # Kept from the original code: makes sure the shared model is loaded.
            get_spacy_model(language)
            # Import the stop-word module explicitly; the "spacy" name itself
            # was never imported by this file.
            if language in ("french", "fr"):
                from spacy.lang.fr.stop_words import STOP_WORDS
            else:
                from spacy.lang.en.stop_words import STOP_WORDS
            self.stop_words = STOP_WORDS
        else:
            self.stop_words = nltk.corpus.stopwords.words(language)

    def __call__(self, doc):
        """Return the document's sentences (list of strings) without stop words."""
        if self.method == "spacy":
            return self.spacy_stop_w(doc)
        return self.nltk_stop_w(doc)

    def spacy_stop_w(self, doc):
        """
        Drop the tokens spaCy flags as stop words.

        :param doc: A spaCy document, or any iterable of token sequences.
        """
        # Iterating a spaCy document yields tokens, not sentences: go through
        # its .sents when available.
        sentences = doc.sents if hasattr(doc, "sents") else doc
        # Token.text replaces Token.string, which no longer exists in spaCy 3.
        return [" ".join(w.text for w in sent if not w.is_stop) for sent in sentences]

    def nltk_stop_w(self, doc):
        """
        Drop the words found in self.stop_words.

        :param doc: Iterable of sentences (strings of space-separated words).
        """
        # Build the set once per document: membership tests on the nltk list
        # are linear in the number of stop words.
        stop_set = set(self.stop_words)
        return [" ".join(w for w in sent.split() if w not in stop_set) for sent in doc]

## Corpus preparation

In [None]:
def prepare_corpus(language, gen, docs, gold_sum_dict, biased_vocab=None):
    """
    Tokenize the documents and the gold summaries of a corpus.

    :param language:      Language handed to the tokenizers.
    :param gen:           Positional arguments of the document Tokenizer that
                          follow the language (the method first).
    :param docs:          Dictionary mapping a doc_key to its content.
    :param gold_sum_dict: Dictionary mapping a doc_key to its gold summary.
    :param biased_vocab:  Currently unused (see the disabled step below).

    :return:    Tuple (docs, gold_sum_dict) after preprocessing. Documents left
                with fewer than 3 sentences, and summaries left with fewer
                than 1, are removed.
    """
    logging.debug("[Preprocessors][PREP CORPUS] Preprocessing pipeline")

    # Documents.
    logging.debug("[Preprocessors][PREP CORPUS] docs")
    doc_tokenizer = Tokenizer(language, *gen, min_doc_len=3)
    docs = run_pipeline(docs, _doc_wrapper, {"tokenizer": doc_tokenizer})

    # Biased vocabulary preprocessing is disabled. It used to be:
    #     if biased_vocab is not None:
    #         logging.debug("[Preprocessors][PREP CORPUS] Biased vocab")
    #         biased_vocab = run_pipeline(biased_vocab, _biased_vocab_wrapper, normalisation_pipeline)

    # Gold summaries: always tokenized with the 'nltk' method.
    logging.debug("[Preprocessors][PREP CORPUS] gold sum")
    summary_cleaner = Tokenizer(language, 'nltk', min_doc_len=1)
    gold_sum_dict = run_pipeline(gold_sum_dict, _doc_wrapper, {"cleaner": summary_cleaner})

    logging.debug("[Preprocessors][PREP CORPUS] Done")
    return docs, gold_sum_dict

## Bias on vector representation

In [None]:
def _build_vocab_doc_bias(doc_url, tag_map, bias_weight, vocab):
    doc_vocab_bias = defaultdict(lambda:0)
    for tag, tokens in tag_map.items():
        if (len(tokens) == 0):
            continue
        tokens = tokenizer_cleaner(tokens)
        tokens = set(" ".join(tokens).split())
        for word in tokens:
            if word not in vocab.keys():
                continue
            doc_vocab_bias[vocab[word]] += bias_weight[tag]
    return (doc_url, dict(doc_vocab_bias))

def build_vocab_bias(vocab, doc_bias_terms, bias_weight, vocab_bias_file = None):
    """
    Transform the string bias (doc -> tag -> words) into an item bias
    (doc -> word_id -> weight).

    :param vocab:           Dictionary mapping a word to its feature index.
    :param doc_bias_terms:  Dictionary mapping a document key to a dictionary
                            of bias elements (e.g. HTML tags of interest), each
                            mapping to the words found under it.
    :param bias_weight:     Dictionary mapping a bias element to its weight.
    :param vocab_bias_file: Optional path of a pickle cache. When the file
                            exists it is loaded instead of recomputing; when it
                            does not, the result is written to it.

    :return:    Dictionary mapping a document key to its {word_id: weight} bias.
    """
    # Local imports: neither name is in scope where this function is defined.
    import pickle
    import time

    if vocab_bias_file is not None and os.path.exists(vocab_bias_file):
        # NOTE(security): unpickling runs arbitrary code - only point this at
        # cache files produced by this function.
        with open(vocab_bias_file, 'rb') as handle:
            # dict() also normalises caches written as a list of pairs, so a
            # cache hit returns the same type as a fresh computation.
            return dict(pickle.load(handle))

    logging.debug("[Preprocessors][Bias] Building vocab bias for each document")
    s = time.time()
    vocab_bias = dict(multip_for(_build_vocab_doc_bias, doc_bias_terms, (bias_weight, vocab)))
    e = time.time()
    logging.debug("[Preprocessors][Bias] Done, Time :" + "{:.2f}".format(e-s))

    if vocab_bias_file is not None:
        with open(vocab_bias_file, 'wb') as handle:
            pickle.dump(vocab_bias, handle)

    return vocab_bias
 
def apply_bias(doc, doc_bias):
    """
    Add the bias weights to the vector representation of a document, in place.

    For each biased word, the weight is added to every sentence (row) in which
    the word is present (non-zero entry); other rows are left untouched.

    :param doc:      Matrix with one row per sentence and one column per word.
                     NOTE(review): the in-place update assumes doc[:, word_id]
                     is a writable view of shape (n, 1), as with numpy.matrix -
                     confirm the type produced by the vectorizer.
    :param doc_bias: Dictionary mapping a word_id (column index) to its weight.

    :return:    The same doc object.
    """
    # Local import: "np" is not imported anywhere in the visible file.
    import numpy as np

    for word_id, weight in doc_bias.items():
        doc_col = doc[:, word_id]
        weight_vec = [weight if k != 0 else 0 for k in doc_col]
        try:
            doc_col += np.array(weight_vec).reshape(len(weight_vec), 1)
        except Exception:
            # logging takes a format string plus arguments; the previous
            # print-style calls lost the message to a formatting error.
            logging.error(
                "process id: %s ERROR - APPLY BIAS, doc.shape = %s, doc type %s, "
                "DOC COL SHAPE %s, WORD_ID %s",
                os.getpid(), doc.shape, type(doc), doc_col.shape, word_id)
            raise

    return doc
    
def _apply_vocab_doc_bias(doc_url, doc, vocab_bias):
    """
    Add to the vector representation of one document the bias registered for it.

    :param doc_url:    Key of the document; must be present in vocab_bias.
    :param doc:        Vector representation of the document (each sentence is
                       a vector of the vocabulary size).
    :param vocab_bias: Dictionary mapping a document key to its
                       {word_id: weight} bias.

    :return:    Tuple (doc_url, biased doc). No normalisation is applied.
    """
    biased_doc = apply_bias(doc, vocab_bias[doc_url])
    return doc_url, biased_doc

def apply_vocab_bias(docs, vocab_bias):
    """
    Apply to every document its word bias, using a pool of processes.

    :param docs:       Dictionary mapping a document key to its vector
                       representation.
    :param vocab_bias: Dictionary mapping a document key to its
                       {word_id: weight} bias.

    :return:    A new dictionary mapping each document key to its biased
                representation.
    """
    biased_pairs = multip_for(_apply_vocab_doc_bias, docs, [vocab_bias])
    return dict(biased_pairs)

# Serialization by language

In [None]:
import os
import re
import json
import pickle
from itertools import islice
from langdetect import detect

def chunks(data, SIZE=50):
    """
    Yield successive sub-dictionaries of a dictionary.

    :param data: Dictionary to split.
    :param SIZE: Maximum number of items per sub-dictionary.

    :return:    Generator of dictionaries, in the iteration order of data.
    """
    key_iter = iter(data)
    for _ in range(0, len(data), SIZE):
        batch = {}
        # The shared iterator makes each batch resume where the previous stopped.
        for key in islice(key_iter, SIZE):
            batch[key] = data[key]
        yield batch

def serialize_by_lang(dic, biased_vocab, corpus):
    """
    Group documents by detected language and pickle them in chunks.

    Files are written to DATA_PATH/languages/<corpus>/<lang>/<lang>-part-NNNN,
    each holding a dictionary mapping a document key to
    {"docs": content, "biased_vocab": bias terms}.

    :param dic:          Dictionary mapping a document key to its text.
    :param biased_vocab: Dictionary mapping a document key to its bias terms;
                         must contain every key whose language is detected.
    :param corpus:       Name of the corpus, used as sub-directory name.
    """
    DIR_PATH = os.path.join(DATA_PATH, "languages/" + corpus)

    by_lang = {}
    for key, value in dic.items():
        try:
            lang = detect(value)
        except Exception:
            # Best effort: skip documents whose language cannot be detected.
            # Unlike a bare except, this lets KeyboardInterrupt through.
            continue
        by_lang.setdefault(lang, {})[key] = {"docs": value, "biased_vocab": biased_vocab[key]}

    for lang, entries in by_lang.items():
        DIR_NAME = os.path.join(DIR_PATH, lang)
        os.makedirs(DIR_NAME, exist_ok=True)

        # The loop variable used to be called "json", shadowing the module.
        for i, chunk in enumerate(chunks(entries)):
            FILE_NAME = lang + '-part-{0:04}'.format(i)
            with open(os.path.join(DIR_NAME, FILE_NAME), 'wb') as handle:
                pickle.dump(chunk, handle)

def deserialize(lang, corpus, sampling):
    """
    Load a fraction of the pickled chunks of one language of a corpus.

    :param lang:     Language code, i.e. name of the sub-directory to read.
    :param corpus:   Name of the corpus.
    :param sampling: Fraction (0 to 1) of the part files to load; the number of
                     files is rounded down.

    :return:    Tuple (docs, biased_vocab), two dictionaries keyed by document.
    """
    DIR_PATH = os.path.join(DATA_PATH, "languages/" + corpus + "/" + lang)

    # Accept the historical two-character prefix as well as the exact language
    # code, so that longer codes (e.g. "zh-cn") are found too.
    part_pattern = re.compile(r'(?:..|' + re.escape(lang) + r')-part-[0-9]+')
    # os.listdir order is arbitrary: sort so that a given sampling value always
    # selects the same files.
    files = sorted(name for name in os.listdir(DIR_PATH) if part_pattern.match(name))
    nb_files = int(sampling * len(files))

    dic = {}
    for file in files[:nb_files]:
        with open(os.path.join(DIR_PATH, file), 'rb') as handle:
            # NOTE(security): unpickling runs arbitrary code - only read files
            # written by serialize_by_lang.
            dic.update(pickle.load(handle))

    docs, biased_vocab = {}, {}
    for key, value in dic.items():
        docs[key] = value["docs"]
        biased_vocab[key] = value["biased_vocab"]
    return docs, biased_vocab

In [None]:
import json

# Root data directory (absolute path on the author's machine). serialize_by_lang
# and deserialize build their paths from it.
DATA_PATH = "/home/pfee/data"
docs = {'www.marseille.archi.fr/ecole/bibliotheque/': "École nationale supérieure d'architecture de Marseille 184, avenue de Luminy - case 924 13288 Marseille cedex 9 Tél : 04 91 82 71 00 Fax : 04 91 82 71 80", 'www.parisseveille.info/quelles-sont-les-meilleures-marques-dautoradio/': '/ /Quelles sont les meilleures marques d’autoradio ?', 'www.magicien-animateur.ch/contact/': "Vous souhaitez en savoir plus ? Renvoyez-moi ce formulaire avec les infos de votre événement. Vous n'êtes pas obligés de tout remplir, mais indiquez svp au moins votre nom et votre e-mail. Et plus vous m'en dites, plus l'offre que je vous enverrai sera précise ;-) Une proposition vous parviendra par e-mail dans les meilleurs délais ! Si vous préférez, vous pouvez aussi me contacter directement par e-mail sur : Illusion.ch - Magic Management - Patrick Waltrick Chemin de la Malice 36 - CH - 1228 Plan-les-Ouates / Genève Tel ++41.22 910 16 80 -", 'ivoirtv.net/modules.html': "Nicaise Blé et Brigitte Yodé ''LaFabuleuse SBY'' se sont dit « OUI » depuis le 28 avril 20... Le design et les couleurs chatoyantes interpellent les passants. Et quand on a accès à l'i... À proposLet'swelcome Ivory Coast Reggae Superstar RAMSES DE KIMON THE PHARAOH. His African... Zaena Morisho est une jeune chanteuse congolaise (RDC). Elle vit à Houston, la capitale du... A l'Etat civil, elle se nomme Opportune Aka et dans le milieu artistique elle répond au ps..."}
bias = {'www.marseille.archi.fr/ecole/bibliotheque/': {'h1': '', 'title': '', 'bold': '', 'b': '', 'i': '', 'em': '', 'mark': ''}, 'www.parisseveille.info/quelles-sont-les-meilleures-marques-dautoradio/': {'h1': '', 'title': '', 'bold': '', 'b': '', 'i': '', 'em': '', 'mark': ''}, 'www.magicien-animateur.ch/contact/': {'h1': '', 'title': '', 'bold': '', 'b': '', 'i': '', 'em': '', 'mark': ''}, 'ivoirtv.net/modules.html': {'h1': '', 'title': '', 'bold': '', 'b': '', 'i': '', 'em': '', 'mark': ''}}

# Disabled manual check of serialize_by_lang, kept as an inert string literal.
"""test = serialize_by_lang(docs, bias)
print(json.dumps(test, indent=4, sort_keys=True))"""

# Load 10% of the French part files of the "dmoz-html" corpus; this rebinds
# the sample "docs" defined above.
docs, biased_vocab = deserialize("fr", "dmoz-html", sampling = 0.1)