<a href="https://colab.research.google.com/github/jbpolle/divers/blob/main/email_signature_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install sentence_transformers



In [2]:
import regex
import pandas as pd
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance

In [3]:
# Dummy example email that is used throughout this notebook.
# A real model should of course be trained on a full dataset of tagged examples.
text="""Bonjour Vincent,
Merci de m’avoir rappelé hier.
Seriez vous disponible pour un rendez vous la semaine prochaine?
Merci,
Jean-Baptiste
514 442 3670
jbpolle@hotmail.com"""

In [4]:
def f_split_text_by_lines(text, position_offset=0):
    """
    Cut a text into lines / sentences and report where each piece is located.

    Pieces whose stripped length is 1 or less are discarded.

    :param text: text that should be split
    :param position_offset: value added to every reported start / end position
    :return: list containing for each kept piece: [position start, position end, sentence]
    """
    sentence_pattern = "[^>\n]((.*?([!?.>] ){1,})|.*(?=\n|$))"
    return [
        [found.start() + position_offset, found.end() + position_offset, found.group()]
        for found in regex.finditer(sentence_pattern, text)
        if len(found.group().strip()) > 1
    ]

In [5]:
# Split the sample email; each entry is [position start, position end, line text].
list_lines = f_split_text_by_lines(text)
list_lines

[[0, 16, 'Bonjour Vincent,'],
 [17, 47, 'Merci de m’avoir rappelé hier.'],
 [48, 112, 'Seriez vous disponible pour un rendez vous la semaine prochaine?'],
 [113, 119, 'Merci,'],
 [120, 133, 'Jean-Baptiste'],
 [134, 146, '514 442 3670'],
 [147, 166, 'jbpolle@hotmail.com']]

In [6]:
# We first create a list of all the entities that can be retrieved using regex
# (email, telephone number, postal code, ...)

# Function to retrieve different entities in text using regex 
def f_find_regex_pattern(text, type_, pattern):
    """ Collect every case-insensitive occurrence of a pattern in a text.

    Args:
        text:  the text to be analyzed
        type_:  the entity type (copied as-is into every result row)
        pattern: regex pattern to be found

    Returns:
        A list with one row per match:
        [type_, matched value, position start, position end, 1].
        Newlines inside the matched value are replaced by spaces and the value
        is stripped; the positions refer to the untouched match in `text`.

    """
    def _match_to_row(found):
        # Normalise the matched text only; start / end keep the raw offsets.
        cleaned_value = found.group().replace("\n", " ").strip()
        return [type_, cleaned_value, found.start(), found.end(), 1]

    return [_match_to_row(found)
            for found in regex.finditer(pattern, text, flags=regex.IGNORECASE)]

# Regex patterns, keyed by the entity type they detect.
# f_find_regex_pattern applies them case-insensitively.
dict_pattern = dict(
    # local part, "@", then a domain made of dot-separated parts
    EMAIL=r'[\p{L}\p{M}\-\d._]{1,}@[\p{L}\p{M}\d\-_]{1,}(\.[\p{L}\p{M}]{1,}){1,}',
    # optional 1-2 digit prefix, then 3-3-4 digits with optional separators
    TEL=r'(?<!\d)(\+?\d{1,2}[ -]?)?\(?\d{3}\)?[ .-]?\d{3}[ .-]?\d{4}(?!\d|\p{P}\d)',
    # letter-digit-letter, optionally followed by digit-letter-digit.
    # NOTE: the letter class is [A-Za-z]; the former [A-z] range also covered the
    # characters [ \ ] ^ _ ` and therefore tagged text such as "a[1]b" as POST.
    POST=r'\b([A-Za-z][0-9][A-Za-z][ -]?[0-9][A-Za-z][0-9]|[A-Za-z][0-9][A-Za-z])\b',
)

# Run every pattern on the sample text and gather all matches in a single list.
list_result = []
for type_, pattern in dict_pattern.items():
    list_result.extend(f_find_regex_pattern(text, type_, pattern))
list_result


[['EMAIL', 'jbpolle@hotmail.com', 147, 166, 1],
 ['TEL', '514 442 3670', 134, 146, 1]]

In [7]:
# Retrieve additional entities in the text with an NLP model (for French)
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
results = nlp(text)

# Only these entity groups are kept from the model predictions.
ner_entity_groups = ["PER", "LOC", "ORG"]
ner_rows = [
    [result["entity_group"],
     result["word"],
     result["start"],
     result["end"],
     result["score"]]
    for result in results
    if result["word"] != "" and result["entity_group"] in ner_entity_groups
]

# Rebuild list_result instead of appending to it: rows of the NER groups left by
# a previous execution of this cell are dropped first, so running the cell
# several times does not duplicate the model entities.
list_result = [row for row in list_result if row[0] not in ner_entity_groups] + ner_rows
list_result

[['EMAIL', 'jbpolle@hotmail.com', 147, 166, 1],
 ['TEL', '514 442 3670', 134, 146, 1],
 ['PER', 'Vincent', 7, 15, 0.7680237],
 ['PER', 'Jean-Baptiste', 119, 133, 0.8631055]]

In [8]:
# Turn the list of detected entities into a dataframe (one row per entity)
df_ner = pd.DataFrame(
    data=list_result,
    columns=["entity", "value", "start", "end", "score"],
)
df_ner

Unnamed: 0,entity,value,start,end,score
0,EMAIL,jbpolle@hotmail.com,147,166,1.0
1,TEL,514 442 3670,134,146,1.0
2,PER,Vincent,7,15,0.768024
3,PER,Jean-Baptiste,119,133,0.863105


In [9]:
# Sentence embedding model, created once here and reused by
# f_create_embedding_inv_dist_feature on every call.
embedder_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")

def f_retrieve_entities_for_line(df_ner, start=0, end=1e12):
    """Select, in the previously computed entity dataframe, the entities of one line.

    An entity is kept when it lies entirely between start and end, or when it
    covers the whole [start, end] span. Nothing (None) is returned when the
    dataframe has no rows."""
    if len(df_ner) == 0:
        return None

    entity_start = df_ner["start"]
    entity_end = df_ner["end"]
    inside_line = (entity_start >= start) & (entity_end <= end)
    covers_line = (entity_start <= start) & (entity_end >= end)
    return df_ner[inside_line | covers_line]

def f_create_embedding_inv_dist_feature(text1, text2):
    """Return the inverse of the cosine distance between the embeddings of two texts.

    Both texts are encoded with the module-level embedder_model. 0.01 is added
    to the distance before inverting, which keeps the division defined when the
    distance is 0."""
    vector_1, vector_2 = (embedder_model.encode(sentence) for sentence in (text1, text2))
    cosine_gap = distance.cosine(vector_1, vector_2)
    return 1 / (cosine_gap + 0.01)

def f_create_line_features(list_lines, line_number, df_ner):
    """Build the feature vector that describes one line of the email.

    Args:
        list_lines: list of [position start, position end, text] entries, one per line
        line_number: index in list_lines of the line to describe
        df_ner: dataframe of the detected entities

    Returns:
        A list holding, in this order: line number, line text, start, end,
        number of PER / ORG / LOC / TEL / EMAIL entities on the line, word count,
        inverse embedding distance to "merci", whether the line starts with "ps:",
        relative position of the line in the email, special character count and
        number of characters between this line and the previous one.
    """
    entry = list_lines[line_number]
    line_start, line_end, line_text = entry[0], entry[1], entry[2]

    # Entity counts, one per tracked entity type (all 0 when no entity table is available)
    tracked_entities = ["PER", "ORG", "LOC", "TEL", "EMAIL"]
    entities_on_line = f_retrieve_entities_for_line(df_ner=df_ner, start=line_start, end=line_end)
    if entities_on_line is None:
        entity_counts = [0 for entity in tracked_entities]
    else:
        entity_counts = [len(entities_on_line.query(f"entity=='{entity}'"))
                         for entity in tracked_entities]

    word_count = len(line_text.split())

    # Closeness to the greeting word "merci" (larger means closer)
    inv_distance_to_merci = f_create_embedding_inv_dist_feature("merci", line_text.lower())

    # Does the line start with "ps:" (case-insensitive, leading whitespace allowed)?
    starts_with_ps = regex.match(r"\s*ps *:", line_text, flags=regex.IGNORECASE) is not None

    # Relative position of the line in the email, in (0, 1]
    position_in_email = (line_number + 1) / len(list_lines)

    # Characters that are neither letters, digits, spaces, dots, commas nor newlines
    special_char_count = len(regex.findall(r"[^\p{L}0-9 .,\n]", line_text))

    # Characters skipped between the end of the previous line and the start of this one
    if line_number == 0:
        empty_chars_with_prev_line = 0
    else:
        empty_chars_with_prev_line = line_start - list_lines[line_number - 1][1]

    return [
        line_number, line_text, line_start, line_end,
        *entity_counts,
        word_count,
        inv_distance_to_merci,
        starts_with_ps,
        position_in_email,
        special_char_count,
        empty_chars_with_prev_line,
    ]


In [10]:
# Column names, in the same order as the values returned by f_create_line_features
list_name_columns_features = [
    "line_number", "line", "start", "end",
    "PER", "ORG", "LOC", "TEL", "EMAIL",
    "word_count",
    "inv_distance_to_merci",
    "starts_with_ps",
    "position_line",
    "special_characters_count",
    "empty_chars_with_prev_line",
]

# One feature vector per line of the email
list_features_vectors = []
for line_number in range(len(list_lines)):
    line_features = f_create_line_features(list_lines, line_number, df_ner)
    list_features_vectors.append(line_features)

df_features = pd.DataFrame(data=list_features_vectors, columns=list_name_columns_features)
df_features

Unnamed: 0,line_number,line,start,end,PER,ORG,LOC,TEL,EMAIL,word_count,inv_distance_to_merci,starts_with_ps,position_line,special_characters_count,empty_chars_with_prev_line
0,0,"Bonjour Vincent,",0,16,1,0,0,0,0,2,1.711611,False,0.142857,0,0
1,1,Merci de m’avoir rappelé hier.,17,47,0,0,0,0,0,5,2.101053,False,0.285714,1,1
2,2,Seriez vous disponible pour un rendez vous la ...,48,112,0,0,0,0,0,10,1.010082,False,0.428571,1,1
3,3,"Merci,",113,119,0,0,0,0,0,1,21.642599,False,0.571429,0,1
4,4,Jean-Baptiste,120,133,1,0,0,0,0,1,1.336501,False,0.714286,1,1
5,5,514 442 3670,134,146,0,0,0,1,0,3,1.210111,False,0.857143,0,1
6,6,jbpolle@hotmail.com,147,166,0,0,0,0,1,1,1.390052,False,1.0,1,1
