<a href="https://colab.research.google.com/github/jbpolle/divers/blob/main/email_signature_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install sentence_transformers



In [2]:
import regex
import pandas as pd
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance
from tensorflow import keras
import numpy as np

In [3]:
# In all the following steps, I adapted the code to run on this one example. In order to really work, 
# the code needs to be modified to work on batches of emails and their known prediction for each lines.
text = """Bonjour Vincent,
Merci de m’avoir rappelé hier.
Seriez vous disponible pour un rendez vous la semaine prochaine?
Merci,
Jean-Baptiste
514 442 3670
jbpolle@hotmail.com"""



In [4]:
# We first create a list of all the entities that can be retrieved in the text using regex 
# ( Email, Tel, Web,...)

# Function to retrieve different entities in text using regex 
def f_find_regex_pattern(text, type_, pattern):
    """ Find all occurences of a pattern in a text and return a list of results
    Args:
        text:  the text to be analyzed
        type_:  the entity type (value is added in result)
        pattern: regex pattern to be found

    Returns:
        A list containing type, matched value, position start and end of each result

    """
    list_result = []
    results = regex.finditer(pattern, text, flags=regex.IGNORECASE)
    for match in results:
        value = match.string[match.start(): match.end()].replace("\n", " ").strip()
        list_result.append([type_,
                            value,
                            match.start(),
                            match.end(),
                            1])
    return list_result

dict_pattern = dict(EMAIL=r'[\p{L}\p{M}\-\d._]{1,}@[\p{L}\p{M}\d\-_]{1,}(\.[\p{L}\p{M}]{1,}){1,}',
                          TEL=r'(?<!\d)(\+?\d{1,2}[ -]?)?\(?\d{3}\)?[ .-]?\d{3}[ .-]?\d{4}(?!\d|\p{P}\d)',
                          POST=r'\b([A-z][0-9][A-z][ -]?[0-9][A-z][0-9]|[A-z][0-9][A-z])\b')
list_result = []
for type_, pattern in dict_pattern.items():
    result = f_find_regex_pattern(text, type_, pattern,)
    list_result += result
list_result


[['EMAIL', 'jbpolle@hotmail.com', 147, 166, 1],
 ['TEL', '514 442 3670', 134, 146, 1]]

In [5]:
# Then we create a list of all the entities that can be retrieved in the text 
# using hugging face model for french NER (PER, ORG, LOC)

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
model_nlp = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")

nlp = pipeline('ner', model=model_nlp, tokenizer=tokenizer, aggregation_strategy="simple")
results = nlp(text)

for result in results:
    if result["word"] != "" and result['entity_group'] in ["PER", "LOC", "ORG"]:
      list_result.append(
                    [result["entity_group"],
                     result["word"],
                     result["start"],
                     result["end"],
                     result["score"]])
list_result

[['EMAIL', 'jbpolle@hotmail.com', 147, 166, 1],
 ['TEL', '514 442 3670', 134, 146, 1],
 ['PER', 'Vincent', 7, 15, 0.7680237],
 ['PER', 'Jean-Baptiste', 119, 133, 0.8631055]]

In [6]:
# We convert features list to dataframe
df_ner = pd.DataFrame(list_result, 
                      columns=["entity", "value", "start", "end", "score"])
df_ner

Unnamed: 0,entity,value,start,end,score
0,EMAIL,jbpolle@hotmail.com,147,166,1.0
1,TEL,514 442 3670,134,146,1.0
2,PER,Vincent,7,15,0.768024
3,PER,Jean-Baptiste,119,133,0.863105


In [7]:
# We split the text by lines
def f_split_text_by_lines(text):
    """
    :param text: text that should be split
    :return: list containing for each line:  [position start, position end, sentence]
    """
    results = []
    iter_lines = regex.finditer("[^>\n]((.*?([!?.>] ){1,})|.*(?=\n|$))", text)
    for line_match in iter_lines:
        start_line = line_match.start()
        end_line = line_match.end()
        line = line_match.group()
        if len(line.strip()) > 1:
            results.append([start_line, end_line, line])
    return results

list_lines = f_split_text_by_lines(text)
list_lines

[[0, 16, 'Bonjour Vincent,'],
 [17, 47, 'Merci de m’avoir rappelé hier.'],
 [48, 112, 'Seriez vous disponible pour un rendez vous la semaine prochaine?'],
 [113, 119, 'Merci,'],
 [120, 133, 'Jean-Baptiste'],
 [134, 146, '514 442 3670'],
 [147, 166, 'jbpolle@hotmail.com']]

In [8]:
# For each line of the text, we compute corresponding features (word count, 
# number of persons, number of locations,...)
# Here we defined the fonctions

# Model used to compute embeddings of sentences
embedder_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")

def f_retrieve_entities_for_line(df_ner, start=0, end=1e12):
  """Retrieve all entities in the previously computed dataframe  for a specific line

  Args:
        df_ner:  dataframe containing found entities
        start:  start position of the line in original text
        end: end position of the line in original text
        
        """

  if len(df_ner) > 0:
    df = df_ner.query(f"""(start>= {start}  and end <= {end}) or (start<={start}  and end>={end})""")
    return df

def f_create_embedding_inv_dist_feature(text1, text2):
  """ Computing distance between two texts based on their embedding 
  provided by the SentenceTransformer above"""
  embedding_merci = embedder_model.encode(text1)
  embedding_line = embedder_model.encode(text2)
  dist = distance.cosine(embedding_merci, embedding_line)
  return 1 / (dist + 0.01)

def f_create_line_features(list_lines, line_number, df_ner):
  """ Return the values of all the features for a specific line"""
  current_line = list_lines[line_number]
  total_lines = len(list_lines)
  features_vector = [line_number, current_line[2], current_line[0], current_line[1]]
  df_ner_line = f_retrieve_entities_for_line(df_ner=df_ner, start=current_line[0], end=current_line[1])
  # Adding entity to feature vector
  for entity in ["PER", "ORG", "LOC", "TEL", "EMAIL"]:
      value = len(df_ner_line.query(f"entity=='{entity}'")) if df_ner_line is not None else 0
      features_vector.append(value)

  # Adding word count
  features_vector.append(len(current_line[2].split()))

  # distance to greeting word "merci"
  features_vector.append(f_create_embedding_inv_dist_feature("merci", current_line[2].lower()))

  # Line start with ps:
  features_vector.append(regex.match(r"\s*ps *:", current_line[2],  flags=regex.IGNORECASE ) is not None)

  # Adding position line in email
  position_in_email = (line_number + 1) / total_lines
  features_vector.append(position_in_email)
  # Adding special character count
  special_char_count = len(regex.findall(r"[^\p{L}0-9 .,\n]", current_line[2]))
  features_vector.append(special_char_count)
  # Number of empty chars with previous line
  empty_chars_with_prev_line = 0 if line_number == 0 else current_line[0] - list_lines[line_number - 1][1]
  features_vector.append(empty_chars_with_prev_line)
  return features_vector


In [9]:
# Here we execute the code on each lines and put the results in df_features
list_features_vectors = []
for line_number in range(0, len(list_lines)):
    list_features_vectors.append(f_create_line_features(list_lines, line_number, df_ner))

list_columns_header = ["line_number", "line", "start", "end"]
list_name_columns_features = ["PER", "ORG", "LOC", "TEL", "EMAIL", 
                              "word_count",
                              "inv_distance_to_merci",
                              "starts_with_ps", 
                              "position_line",
                              "special_characters_count", 
                              "empty_chars_with_prev_line"]
df_features = pd.DataFrame(list_features_vectors, columns=list_columns_header + list_name_columns_features)
df_features

Unnamed: 0,line_number,line,start,end,PER,ORG,LOC,TEL,EMAIL,word_count,inv_distance_to_merci,starts_with_ps,position_line,special_characters_count,empty_chars_with_prev_line
0,0,"Bonjour Vincent,",0,16,1,0,0,0,0,2,1.711611,False,0.142857,0,0
1,1,Merci de m’avoir rappelé hier.,17,47,0,0,0,0,0,5,2.101053,False,0.285714,1,1
2,2,Seriez vous disponible pour un rendez vous la ...,48,112,0,0,0,0,0,10,1.010082,False,0.428571,1,1
3,3,"Merci,",113,119,0,0,0,0,0,1,21.642599,False,0.571429,0,1
4,4,Jean-Baptiste,120,133,1,0,0,0,0,1,1.336501,False,0.714286,1,1
5,5,514 442 3670,134,146,0,0,0,1,0,3,1.210111,False,0.857143,0,1
6,6,jbpolle@hotmail.com,147,166,0,0,0,0,1,1,1.390052,False,1.0,1,1


In [10]:
#  We scale all features. In this example, for simplicity, we use MinMaxScaler for all features
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()
df_features.loc[:, list_name_columns_features] = minmax_scaler.fit_transform(df_features[list_name_columns_features])
df_features

Unnamed: 0,line_number,line,start,end,PER,ORG,LOC,TEL,EMAIL,word_count,inv_distance_to_merci,starts_with_ps,position_line,special_characters_count,empty_chars_with_prev_line
0,0,"Bonjour Vincent,",0,16,1.0,0.0,0.0,0.0,0.0,0.111111,0.034001,0.0,0.0,0.0,0.0
1,1,Merci de m’avoir rappelé hier.,17,47,0.0,0.0,0.0,0.0,0.0,0.444444,0.052876,0.0,0.166667,1.0,1.0
2,2,Seriez vous disponible pour un rendez vous la ...,48,112,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.333333,1.0,1.0
3,3,"Merci,",113,119,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.5,0.0,1.0
4,4,Jean-Baptiste,120,133,1.0,0.0,0.0,0.0,0.0,0.0,0.015821,0.0,0.666667,1.0,1.0
5,5,514 442 3670,134,146,0.0,0.0,0.0,1.0,0.0,0.222222,0.009695,0.0,0.833333,0.0,1.0
6,6,jbpolle@hotmail.com,147,166,0.0,0.0,0.0,0.0,1.0,0.0,0.018416,0.0,1.0,1.0,1.0


In [11]:
# We create x variable. 
# In this case it's quite easy because we work on only one email with 7 lines, and therefore we create a batch of shape (7 x 11) (7 lines x 11 features)
# However in real case we need to batch together many emails with different number of lines.
# Therefore we need to do some padding based on the max number of lines of the longest email in the batch. We obtain a batch of shape: n emails x max number of lines x 11 features
# In order to limit the size of the model, it might also be a good idea to limit to a maximum number of lines in case where you have very long emails in the dataset (and anyway for a very long email, 
# you probably don't need the first lines to accurately predict the signature lines) 
x = df_features[list_name_columns_features].to_numpy()
# Here we need to add extra dimension as we only have 1 email
x = np.expand_dims(x, axis=0)
x.shape

(1, 7, 11)

In [12]:
# Here we manually input expected result for the 7 lines of our email
y = np.array([0, 0, 0, 0, 1, 1, 1])
# Same thing for y, we add extra dimension. It will have a shape of 1 email x 7 lines.
y = np.expand_dims(y, axis=0)
y.shape

(1, 7)

In [13]:
# Creating Keras model.
# In reality I used 2 layers of Bidirectional with 10 and 20 units (slighlty better results like that) 
model = keras.Sequential()
model.add(keras.layers.Bidirectional(
              layer=keras.layers.LSTM(
                                      units=10,
                                      return_sequences=True,
                                      dropout=0.15
                            )
    ))

model.add(keras.layers.Dense(1, activation='sigmoid'))
opt = keras.optimizers.Adam()
model.compile(loss="binary_crossentropy",
              optimizer=opt,
              metrics=[keras.metrics.Precision(name="precision"),
                        keras.metrics.Recall(name="recall")])
model.build(x.shape)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (1, 7, 20)               1760      
 l)                                                              
                                                                 
 dense (Dense)               (1, 7, 1)                 21        
                                                                 
Total params: 1,781
Trainable params: 1,781
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Train model
# Again this is a very simple version for training. In reality I would recommand to use
# Early stopping, reduce learning rate on plateau or a learning rate scheduler 
history = model.fit(x,
                    y,
                    # validation_split=0.05,
                    epochs=1,
                    verbose=1)



In [15]:
# And then prediction could be obtained using:
y_predict = model.predict(x)>0.5
y_predict

array([[[False],
        [ True],
        [ True],
        [False],
        [ True],
        [ True],
        [ True]]])