Module for E-mail Summarization
*****************************************************************************
Input Parameters:
    emails: A list of strings containing the emails
Returns:
    summary: A list of strings containing the summaries.
*****************************************************************************


In [None]:
import numpy as np
from talon.signature.bruteforce import extract_signature
from langdetect import detect
from nltk.tokenize import sent_tokenize
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

In [None]:
def preprocess(email):
    """
    Clean a raw e-mail body before it is split into sentences.

    Steps:
        1. Drop the signature block detected by ``extract_signature``
           (only English emails are supported).
        2. Trim every line, discard the lines left empty and glue the
           remaining ones together with single spaces, so that no new
           line characters survive.

    Returns the cleaned e-mail as a single string.
    """
    body, _signature = extract_signature(email)

    trimmed_lines = (raw_line.strip() for raw_line in body.split('\n'))
    return ' '.join(piece for piece in trimmed_lines if piece)

In [None]:
def split_sentences(email):
    """
    Splits the email into individual sentences.

    Parameters
    ----------
    email: the e-mail text as a single string

    Returns
    -------
    list of str: the sentences produced by ``sent_tokenize``, each stripped
    of surrounding whitespace. Sentences that are empty after stripping are
    dropped.
    """
    # Filter on the *stripped* text: testing the raw sentence would keep
    # whitespace-only fragments in the result as empty strings.
    stripped = (sentence.strip() for sentence in sent_tokenize(email))
    return [sentence for sentence in stripped if sentence]

In [None]:
def Encoding(email, max_length=128):
    """
    Tokenizes each sentence of an email with the 'bert-base-uncased' vocabulary.

    NOTE: the returned values are BERT token ids, not sentence embeddings --
    no model forward pass is performed here, only tokenization.

    Parameters
    ----------
    email: a sentence, or a list of sentences, to tokenize
    max_length: number of token ids per sentence; longer sentences are
        truncated and shorter ones are padded up to this length

    Returns
    -------
    numpy array holding the ``input_ids`` produced by the tokenizer
    (one row of ``max_length`` ids per sentence when a list is given)
    """
    # Imported inside the function so the rest of the module can be used
    # without loading transformers. The 'pt' tensors requested below still
    # require PyTorch to be installed; transformers imports it on demand.
    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    inputs = tokenizer(
        email,
        return_tensors='pt',
        max_length=max_length,
        truncation=True,
        padding='max_length',
    )

    return inputs['input_ids'].numpy()

In [None]:
def summarization(processed_email_text:list = None, encoded_email_text:np.ndarray = None):
    '''
        Email summarization

        Clusters the encoded sentences with K-Means (ceil(sqrt(n)) clusters),
        picks for every cluster centre the closest encoded sentence, and joins
        the picked sentences ordered by the average position of each cluster's
        members in the email.

        Parameters
        ----------
        processed_email_text: list with sentences composing the email
        encoded_email_text: array-like with one numeric row per sentence, in
            the same order as processed_email_text

        Returns
        -------
        str with the selected sentences joined by single spaces; an empty
        string when there are no sentences to summarize

        Raises
        ------
        TypeError: if either argument is missing (None)
    '''
    if processed_email_text is None or encoded_email_text is None:
        raise TypeError('processed_email_text and encoded_email_text are both required')

    n_sentences = len(encoded_email_text)
    if n_sentences == 0:
        # Nothing to cluster: KMeans cannot be fitted with zero clusters.
        return ''

    n_clusters = int(np.ceil(n_sentences**0.5))
    print('Number of clusters: ', n_clusters)

    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans = kmeans.fit(encoded_email_text)
    print('Kmeans trained')

    # Mean sentence index of each cluster, used to keep the summary in
    # roughly the same order as the original email.
    avg = [np.mean(np.where(kmeans.labels_ == j)[0]) for j in range(n_clusters)]

    # Index of the sentence nearest to each cluster centre.
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, encoded_email_text)
    ordering = sorted(range(n_clusters), key=lambda k: avg[k])

    return ' '.join([processed_email_text[closest[idx]] for idx in ordering])

# Example 1: cover_letter.txt

In [None]:
# Read the example file into email_text and print its raw contents.
with open('Email_examples/cover_letter.txt') as f:
    email_text = f.read()
print(email_text)

In [31]:
# Full pipeline on the e-mail held in email_text:
# clean -> split into sentences -> encode -> summarize.
print('Email pre-processing...')
# Remove the signature and collapse the e-mail onto a single line.
processed_email_text = preprocess(email_text)

print('Splitting into sentences...')
# From here on processed_email_text is a list of sentences, not a string.
processed_email_text = split_sentences(processed_email_text)
print('Number of sentences: ', len(processed_email_text))

print('Encoding process...')
encoded_email_text = Encoding(processed_email_text)    

# Cluster the encoded sentences and join one representative per cluster.
summary = summarization(processed_email_text, encoded_email_text)

print('\nSummary:\n', summary)


Email pre-processing...
Splitting into sentences...
Number of sentences:  3
Encoding process...
Number of clusters:  2
Kmeans trained
Summary:
 Good evening Mrs. Yoo, I'm reaching out on behalf of LettuceEat to thank you for your review of our restaurant on ReviewIt. We really appreciate your kind words and recommending our restaurant to others on the platform.


# Example 2: appreciating_the_customer.txt

In [28]:
# Read the example file into email_text and print its raw contents.
with open('Email_examples/appreciating_the_customer.txt') as f:
    email_text = f.read()
print(email_text)

Good evening Mrs. Yoo,

I'm reaching out on behalf of LettuceEat to thank you for your review of our restaurant on ReviewIt. 
We really appreciate your kind words and recommending our restaurant to others on the platform. 

LettuceEat is so happy you enjoyed our vegan options and your experience with us. 


Please come back soon!

Best regards,
Sarah Gibbs


In [29]:
# Full pipeline on the e-mail held in email_text:
# clean -> split into sentences -> encode -> summarize.
print('Email pre-processing...')
# Remove the signature and collapse the e-mail onto a single line.
processed_email_text = preprocess(email_text)

print('Splitting into sentences...')
# From here on processed_email_text is a list of sentences, not a string.
processed_email_text = split_sentences(processed_email_text)
print('Number of sentences: ', len(processed_email_text))

print('Encoding process...')
encoded_email_text = Encoding(processed_email_text)    

# Cluster the encoded sentences and join one representative per cluster.
summary = summarization(processed_email_text, encoded_email_text)

print('\nSummary:\n', summary)

Email pre-processing...
Splitting into sentences...
Number of sentences:  3
Encoding process...
Number of clusters:  2
Kmeans trained


"Good evening Mrs. Yoo, I'm reaching out on behalf of LettuceEat to thank you for your review of our restaurant on ReviewIt. We really appreciate your kind words and recommending our restaurant to others on the platform."

# Example 3: cover_letter.txt (same input as Example 1)

In [None]:
# Read the example file into email_text and print its raw contents.
with open('Email_examples/cover_letter.txt') as f:
    email_text = f.read()
print(email_text)

In [None]:
# Full pipeline on the e-mail held in email_text:
# clean -> split into sentences -> encode -> summarize.
print('Email pre-processing...')
# Remove the signature and collapse the e-mail onto a single line.
processed_email_text = preprocess(email_text)

print('Splitting into sentences...')
# From here on processed_email_text is a list of sentences, not a string.
processed_email_text = split_sentences(processed_email_text)
print('Number of sentences: ', len(processed_email_text))

print('Encoding process...')
encoded_email_text = Encoding(processed_email_text)    

# Cluster the encoded sentences and join one representative per cluster.
summary = summarization(processed_email_text, encoded_email_text)

print('\nSummary:\n', summary)