In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/enron-email-dataset/emails.csv


In [12]:
## Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import email
import warnings 
import re
warnings.filterwarnings('ignore')

## **A walk through of the data**

In [3]:
df = pd.read_csv("/kaggle/input/enron-email-dataset/emails.csv")

In [4]:
df.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [5]:
df.shape

(517401, 2)

In [6]:
df.columns

Index(['file', 'message'], dtype='object')

In [7]:
# get message value
df.loc[0]['message']

"Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>\nDate: Mon, 14 May 2001 16:39:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: tim.belden@enron.com\nSubject: \nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: Tim Belden <Tim Belden/Enron@EnronXGate>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\nHere is our forecast\n\n "

In [8]:
# get file value
df.loc[0]['file']

'allen-p/_sent_mail/1.'

In [9]:
# transform the email into correct form
message = df.loc[1]['message']
emails = email.message_from_string(message)
emails.items()

[('Message-ID', '<15464986.1075855378456.JavaMail.evans@thyme>'),
 ('Date', 'Fri, 4 May 2001 13:51:00 -0700 (PDT)'),
 ('From', 'phillip.allen@enron.com'),
 ('To', 'john.lavorato@enron.com'),
 ('Subject', 'Re:'),
 ('Mime-Version', '1.0'),
 ('Content-Type', 'text/plain; charset=us-ascii'),
 ('Content-Transfer-Encoding', '7bit'),
 ('X-From', 'Phillip K Allen'),
 ('X-To', 'John J Lavorato <John J Lavorato/ENRON@enronXgate@ENRON>'),
 ('X-cc', ''),
 ('X-bcc', ''),
 ('X-Folder', "\\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail"),
 ('X-Origin', 'Allen-P'),
 ('X-FileName', 'pallen (Non-Privileged).pst')]

In [10]:
# get email body
emails.get_payload()

"Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.\n\nAs far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.  \n\nMy suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time.\n"

In [11]:
emails.get("Date")

'Fri, 4 May 2001 13:51:00 -0700 (PDT)'

## **Tasks**

**Task1**

In [15]:
# Preprocessing function to clean the email
def clean_email(email):
    
    if isinstance(email, str):
        # Remove unnecessary parts (e.g., signatures, metadata)
        email_body = email.split('--')[0]  # Split on signature separator
        
        # Remove headers and retain the body of the email
        email_body = re.sub(r'^.*?\n\n', '', email_body, flags=re.DOTALL)  # Remove headers
        
        return email_body.strip()
    return ''  # Return empty string for non-string inputs

In [13]:
# Apply cleaning to the email messages
df['cleaned_message'] = df['message'].apply(clean_email)


In [14]:
df.head()

Unnamed: 0,file,message,cleaned_message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,Here is our forecast
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,test successful. way to go!!!
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,Let's shoot for Tuesday at 11:45.


In [16]:
df.shape

(517401, 3)

we can see that the dataset is huge and it will take a lot of time to process it, so let's create a subset of the dataset that have common topics

In [17]:
# Define keywords for common topics
common_keywords = ['meeting', 'update', 'project', 'schedule', 'confirm', 'discussion']

# Function to identify long threads and common topics
def is_valid_email(email):
    # Check if it contains common keywords
    if any(keyword in email.lower() for keyword in common_keywords):
        return True
    return False

In [18]:
# Filter emails
subset_df = df[df['cleaned_message'].apply(is_valid_email)]

In [19]:
subset_df.shape

(114561, 3)

**now we have a subset of data to work on**

In [21]:
subset_df[['file', 'cleaned_message']].head()

Unnamed: 0,file,cleaned_message
1,allen-p/_sent_mail/10.,Traveling to have a business meeting takes the...
3,allen-p/_sent_mail/1000.,"Randy,\n\n Can you send me a schedule of the s..."
6,allen-p/_sent_mail/1003.,Please cc the following distribution list with...
28,allen-p/_sent_mail/12.,"Reagan,\n\nJust wanted to give you an update. ..."
29,allen-p/_sent_mail/120.,Nymex expiration is during this time frame. P...


**Task2**

In [22]:
import pandas as pd
from transformers import pipeline

# Load the summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [23]:
## Since the subset is still huge, will take just a fraction of the dataset to do the remaining operation

subset_df = subset_df.head(100)

In [24]:
subset_df

Unnamed: 0,file,message,cleaned_message
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s..."
6,allen-p/_sent_mail/1003.,Message-ID: <16254169.1075863688286.JavaMail.e...,Please cc the following distribution list with...
28,allen-p/_sent_mail/12.,Message-ID: <8572706.1075855378498.JavaMail.ev...,"Reagan,\n\nJust wanted to give you an update. ..."
29,allen-p/_sent_mail/120.,Message-ID: <29665600.1075855687895.JavaMail.e...,Nymex expiration is during this time frame. P...
...,...,...,...
891,allen-p/all_documents/364.,Message-ID: <30386365.1075855693824.JavaMail.e...,"Due to some problems with my email yesterday, ..."
899,allen-p/all_documents/371.,Message-ID: <24807826.1075855693979.JavaMail.e...,Traveling to have a business meeting takes the...
901,allen-p/all_documents/373.,Message-ID: <10184994.1075855694024.JavaMail.e...,"Reagan,\n\nJust wanted to give you an update. ..."
902,allen-p/all_documents/374.,Message-ID: <18045271.1075855694046.JavaMail.e...,"Jim,\n\nIs there going to be a conference call..."


**Below is a defintion of a function to summarize the emails, taking care to concatenate multiple emails if necessary**

In [25]:
# Summarize function for email threads
def summarize_emails(email_list):
    
    combined_email = " ".join(email_list)
    
    # Tokenize and check length
    if len(combined_email) > 1024:  # Adjusted the threshold based on the model's limit, as I was getting some error
        combined_email = combined_email[:1024]  # Truncate to the first 1024 characters

    # Calling the Summarize
    summary = summarizer(combined_email, max_length=80, min_length=30, do_sample=False)
    return summary[0]['summary_text'] if summary else ""

In [26]:
# Apply the summarization to each email in the subset
subset_df['summary'] = subset_df['cleaned_message'].apply(lambda x: summarize_emails([x]))

Your max_length is set to 80, but your input_length is only 54. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 80, but your input_length is only 18. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=9)
Your max_length is set to 80, but your input_length is only 42. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Your max_length is set to 80, but your input_length is only 45. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
Your max_

In [27]:
# Display the summaries
subset_df[['file', 'cleaned_message', 'summary']].head()

Unnamed: 0,file,cleaned_message,summary
1,allen-p/_sent_mail/10.,Traveling to have a business meeting takes the...,Traveling to have a business meeting takes the...
3,allen-p/_sent_mail/1000.,"Randy,\n\n Can you send me a schedule of the s...","Randy, can you send me a schedule of the salar..."
6,allen-p/_sent_mail/1003.,Please cc the following distribution list with...,Phillip Allen (pallen@enron.com) and Mike Grig...
28,allen-p/_sent_mail/12.,"Reagan,\n\nJust wanted to give you an update. ...",Reagan: I have changed the unit mix to include...
29,allen-p/_sent_mail/120.,Nymex expiration is during this time frame. P...,Nymex expiration is during this time frame. Pl...


In [28]:
# A basic evaluation function
def evaluate_summary(original, summary):
    # Check if the summary includes keywords related to decisions or actions
    keywords = ['decide', 'action', 'confirm', 'schedule', 'meeting']
    return any(keyword in summary.lower() for keyword in keywords)

In [29]:
# Apply evaluation
subset_df['summary_evaluation'] = subset_df.apply(lambda row: evaluate_summary(row['cleaned_message'], row['summary']), axis=1)

In [30]:
# Display evaluations
subset_df[['file', 'summary', 'summary_evaluation']].head()

Unnamed: 0,file,summary,summary_evaluation
1,allen-p/_sent_mail/10.,Traveling to have a business meeting takes the...,True
3,allen-p/_sent_mail/1000.,"Randy, can you send me a schedule of the salar...",True
6,allen-p/_sent_mail/1003.,Phillip Allen (pallen@enron.com) and Mike Grig...,False
28,allen-p/_sent_mail/12.,Reagan: I have changed the unit mix to include...,False
29,allen-p/_sent_mail/120.,Nymex expiration is during this time frame. Pl...,True


**Task3**

In [31]:
from transformers import pipeline, BartTokenizer

# Load the response generation model using BART
response_generator = pipeline('text2text-generation', model='facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [32]:
# Maximum token length for BART
MAX_INPUT_LENGTH = 1024

def generate_response(email_content):
    prompt = f"Reply to the following email:\n\n{email_content}\n\nResponse:"
    
    # Have to truncate the prompt as I was getting error
    inputs = tokenizer(prompt, truncation=True, max_length=MAX_INPUT_LENGTH)
    truncated_prompt = tokenizer.decode(inputs['input_ids'], skip_special_tokens=True)
    
    # Generate a response using the model
    response = response_generator(truncated_prompt, max_length=150, num_return_sequences=1)

    # Format the response
    response_text = response[0]['generated_text'].strip()

    return response_text

In [33]:
# Generate responses for the cleaned messages
subset_df['response'] = subset_df['cleaned_message'].apply(generate_response)

In [34]:
# Evaluate responses for relevance
def evaluate_response(original, response):
    # Check if the response contains key phrases relevant to the email content
    keywords = ['confirm', 'thanks', 'update', 'schedule', 'details']
    return any(keyword in response.lower() for keyword in keywords)

In [35]:
# Apply evaluation
subset_df['response_evaluation'] = subset_df.apply(lambda row: evaluate_response(row['cleaned_message'], row['response']), axis=1)

In [36]:
# Display results
subset_df[['file', 'cleaned_message', 'response', 'response_evaluation']].head()

Unnamed: 0,file,cleaned_message,response,response_evaluation
1,allen-p/_sent_mail/10.,Traveling to have a business meeting takes the...,Traveling to have a business meeting takes the...,False
3,allen-p/_sent_mail/1000.,"Randy,\n\n Can you send me a schedule of the s...","Randy, can you send me a schedule of the salar...",True
6,allen-p/_sent_mail/1003.,Please cc the following distribution list with...,Phillip Allen (pallen@enron.com) and Mike Grig...,True
28,allen-p/_sent_mail/12.,"Reagan,\n\nJust wanted to give you an update. ...",Just wanted to give you an update. I have chan...,True
29,allen-p/_sent_mail/120.,Nymex expiration is during this time frame. P...,Nymex expiration is during this time frame. Pl...,True


In [37]:
subset_df.head()

Unnamed: 0,file,message,cleaned_message,summary,summary_evaluation,response,response_evaluation
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,Traveling to have a business meeting takes the...,Traveling to have a business meeting takes the...,True,Traveling to have a business meeting takes the...,False
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,"Randy,\n\n Can you send me a schedule of the s...","Randy, can you send me a schedule of the salar...",True,"Randy, can you send me a schedule of the salar...",True
6,allen-p/_sent_mail/1003.,Message-ID: <16254169.1075863688286.JavaMail.e...,Please cc the following distribution list with...,Phillip Allen (pallen@enron.com) and Mike Grig...,False,Phillip Allen (pallen@enron.com) and Mike Grig...,True
28,allen-p/_sent_mail/12.,Message-ID: <8572706.1075855378498.JavaMail.ev...,"Reagan,\n\nJust wanted to give you an update. ...",Reagan: I have changed the unit mix to include...,False,Just wanted to give you an update. I have chan...,True
29,allen-p/_sent_mail/120.,Message-ID: <29665600.1075855687895.JavaMail.e...,Nymex expiration is during this time frame. P...,Nymex expiration is during this time frame. Pl...,True,Nymex expiration is during this time frame. Pl...,True


**Task4**

In [38]:
!pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=531404654519eff423ca346371f1ce3f8910577568c739aa2c99e1a405dae4c1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [40]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [41]:
import evaluate

# Load ROUGE metric
rouge = evaluate.load("rouge")

# Assuming 'subset_df' contains the original emails and generated summaries
reference_summaries = subset_df['cleaned_message'].tolist()  # Original emails can be references
generated_summaries = subset_df['summary'].tolist()

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [42]:
# Function to evaluate using ROUGE
def compute_rouge(references, predictions):
    rouge_output = rouge.compute(predictions=predictions, references=references)
    return rouge_output

In [43]:
# Compute ROUGE scores
rouge_scores = compute_rouge(reference_summaries, generated_summaries)

In [44]:
# Manual evaluation for key phrases
def evaluate_summary(summary):
    keywords = ['decide', 'action', 'confirm', 'schedule', 'meeting']
    return any(keyword in summary.lower() for keyword in keywords)

In [45]:
# Apply manual evaluation for summaries
subset_df['summary_evaluation'] = subset_df['summary'].apply(evaluate_summary)

In [48]:
# Display evaluations
subset_df[['file', 'summary', 'summary_evaluation']].head()

Unnamed: 0,file,summary,summary_evaluation
1,allen-p/_sent_mail/10.,Traveling to have a business meeting takes the...,True
3,allen-p/_sent_mail/1000.,"Randy, can you send me a schedule of the salar...",True
6,allen-p/_sent_mail/1003.,Phillip Allen (pallen@enron.com) and Mike Grig...,False
28,allen-p/_sent_mail/12.,Reagan: I have changed the unit mix to include...,False
29,allen-p/_sent_mail/120.,Nymex expiration is during this time frame. Pl...,True


In [47]:
print("ROUGE Scores:", rouge_scores)

ROUGE Scores: {'rouge1': 0.5650295642213841, 'rouge2': 0.526536946379182, 'rougeL': 0.5323693616951192, 'rougeLsum': 0.5622868769015515}


In [50]:
# Load BLEU metric
bleu = evaluate.load("bleu")

# Tokenize the cleaned messages (references) and responses (predictions)
def tokenize_text(text):
    return text.split()  # Simple tokenization by splitting on spaces

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [57]:
reference_responses = subset_df['cleaned_message'].tolist()  # Original emails (references)
generated_responses = subset_df['response'].tolist()  # Generated responses (predictions)

# BLEU expects references to be a list of lists. We need to wrap each reference in a list:
reference_responses = [[ref] for ref in reference_responses]  # Wrap each reference in a list

In [58]:
# Function to compute BLEU score
def compute_bleu(references, predictions):
    bleu_output = bleu.compute(predictions=predictions, references=references)
    return bleu_output

In [59]:
# Compute BLEU scores
bleu_scores = compute_bleu(reference_responses, generated_responses)

# Output the BLEU scores
print("BLEU Scores:", bleu_scores)

BLEU Scores: {'bleu': 0.14454689978765864, 'precisions': [0.7772126144455748, 0.7240427733701277, 0.6837486837486837, 0.6486245087531262], 'brevity_penalty': 0.20450799972071068, 'length_ratio': 0.386525984664788, 'translation_length': 5898, 'reference_length': 15259}
