In [1]:
import re
def clean_body(body):
    """
    Nettoie le contenu de l'e-mail et conserve uniquement le 'Subject:' le plus récent.
    """
    if not isinstance(body, str):
        return None

    # Étape 1 : Supprimer les blocs transférés "Forwarded by"
    body = re.sub(r"(?s)---------------------- Forwarded by .*?---------------------------\n", "", body, flags=re.IGNORECASE)

    # Étape 2 : Supprimer les messages précédents à partir de "-----Original Message-----"
    body = re.split(r"-----Original Message-----", body, flags=re.IGNORECASE)[0]

    # Étape 3 : Supprimer les lignes contenant "To:", "Cc:", "From:"
    body = re.sub(r"(?i)
                  ^\s*(To:|Cc:|From:).*?$", "", body, flags=re.MULTILINE)

    # Étape 4 : Supprimer les lignes contenant une adresse e-mail
    body = re.sub(r"(?m)^.*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}.*$", "", body)

    # Étape 5 : Retenir uniquement le premier 'Subject:'
    match = re.search(r"(?i)^Subject:.*?$", body, flags=re.MULTILINE)
    if match:
        subject = match.group(0)
        # Supprimer tous les autres 'Subject:'
        body = subject + "\n" + re.sub(r"(?i)^Subject:.*?$", "", body, flags=re.MULTILINE)

    # Étape 6 : Supprimer les lignes contenant uniquement des espaces ou des tabulations
    body = re.sub(r"^\s+$", "", body, flags=re.MULTILINE)

    # Étape 7 : Nettoyer les espaces et sauts de ligne multiples
    body = body.strip()
    body = re.sub(r"\n{2,}", "\n", body)  # Réduit plusieurs sauts de ligne consécutifs à un seul

    return body


In [2]:
import pandas as pd
parsed_df = pd.read_csv('parsed_emails.csv')

In [3]:
parsed_df

Unnamed: 0,file,message,parsed_email,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,Body
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,{'Message-ID': '<18782981.1075855378110.JavaMa...,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,\nHere is our forecast\n\n
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,{'Message-ID': '<15464986.1075855378456.JavaMa...,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,\nTraveling to have a business meeting takes t...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,{'Message-ID': '<24216240.1075855687451.JavaMa...,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,\ntest successful. way to go!!!
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,{'Message-ID': '<13505866.1075863688222.JavaMa...,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"\nRandy,\n\n Can you send me a schedule of the..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,{'Message-ID': '<30922949.1075863688243.JavaMa...,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,\nLet's shoot for Tuesday at 11:45.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517396,zufferli-j/sent_items/95.,Message-ID: <26807948.1075842029936.JavaMail.e...,{'Message-ID': '<26807948.1075842029936.JavaMa...,<26807948.1075842029936.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 13:30:11 -0800 (PST)",john.zufferli@enron.com,kori.loibl@enron.com,Trade with John Lavorato,1.0,text/plain; charset=us-ascii,7bit,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Loibl, Kori </O=ENRON/OU=NA/CN=RECIPIENTS/CN=K...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\nThis is a trade with OIL-SPEC-HEDGE-NG (John...
517397,zufferli-j/sent_items/96.,Message-ID: <25835861.1075842029959.JavaMail.e...,{'Message-ID': '<25835861.1075842029959.JavaMa...,<25835861.1075842029959.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 12:47:48 -0800 (PST)",john.zufferli@enron.com,john.lavorato@enron.com,Gas Hedges,1.0,text/plain; charset=us-ascii,7bit,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Lavorato, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\nSome of my position is with the Alberta Term...
517398,zufferli-j/sent_items/97.,Message-ID: <28979867.1075842029988.JavaMail.e...,{'Message-ID': '<28979867.1075842029988.JavaMa...,<28979867.1075842029988.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 07:20:00 -0800 (PST)",john.zufferli@enron.com,dawn.doucet@enron.com,RE: CONFIDENTIAL,1.0,text/plain; charset=us-ascii,7bit,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Doucet, Dawn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\n2\n\n -----Original Message-----\nFrom: \tDo...
517399,zufferli-j/sent_items/98.,Message-ID: <22052556.1075842030013.JavaMail.e...,{'Message-ID': '<22052556.1075842030013.JavaMa...,<22052556.1075842030013.JavaMail.evans@thyme>,"Tue, 27 Nov 2001 11:52:45 -0800 (PST)",john.zufferli@enron.com,jeanie.slone@enron.com,Calgary Analyst/Associate,1.0,text/plain; charset=us-ascii,7bit,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Slone, Jeanie </O=ENRON/OU=NA/CN=RECIPIENTS/CN...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\nAnalyst\t\t\t\t\tRank\n\nStephane Brodeur\t\...


In [4]:
# Appliquer la fonction sur la colonne 'body'
parsed_df['cleaned_body'] = parsed_df['Body'].apply(clean_body)

In [5]:
parsed_df

Unnamed: 0,file,message,parsed_email,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,Body,cleaned_body
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,{'Message-ID': '<18782981.1075855378110.JavaMa...,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,\nHere is our forecast\n\n,Here is our forecast
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,{'Message-ID': '<15464986.1075855378456.JavaMa...,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,\nTraveling to have a business meeting takes t...,Traveling to have a business meeting takes the...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,{'Message-ID': '<24216240.1075855687451.JavaMa...,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,\ntest successful. way to go!!!,test successful. way to go!!!
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,{'Message-ID': '<13505866.1075863688222.JavaMa...,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"\nRandy,\n\n Can you send me a schedule of the...","Randy,\n Can you send me a schedule of the sal..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,{'Message-ID': '<30922949.1075863688243.JavaMa...,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,\nLet's shoot for Tuesday at 11:45.,Let's shoot for Tuesday at 11:45.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517396,zufferli-j/sent_items/95.,Message-ID: <26807948.1075842029936.JavaMail.e...,{'Message-ID': '<26807948.1075842029936.JavaMa...,<26807948.1075842029936.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 13:30:11 -0800 (PST)",john.zufferli@enron.com,kori.loibl@enron.com,Trade with John Lavorato,1.0,text/plain; charset=us-ascii,7bit,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Loibl, Kori </O=ENRON/OU=NA/CN=RECIPIENTS/CN=K...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\nThis is a trade with OIL-SPEC-HEDGE-NG (John...,This is a trade with OIL-SPEC-HEDGE-NG (John L...
517397,zufferli-j/sent_items/96.,Message-ID: <25835861.1075842029959.JavaMail.e...,{'Message-ID': '<25835861.1075842029959.JavaMa...,<25835861.1075842029959.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 12:47:48 -0800 (PST)",john.zufferli@enron.com,john.lavorato@enron.com,Gas Hedges,1.0,text/plain; charset=us-ascii,7bit,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Lavorato, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\nSome of my position is with the Alberta Term...,Some of my position is with the Alberta Term b...
517398,zufferli-j/sent_items/97.,Message-ID: <28979867.1075842029988.JavaMail.e...,{'Message-ID': '<28979867.1075842029988.JavaMa...,<28979867.1075842029988.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 07:20:00 -0800 (PST)",john.zufferli@enron.com,dawn.doucet@enron.com,RE: CONFIDENTIAL,1.0,text/plain; charset=us-ascii,7bit,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Doucet, Dawn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\n2\n\n -----Original Message-----\nFrom: \tDo...,2
517399,zufferli-j/sent_items/98.,Message-ID: <22052556.1075842030013.JavaMail.e...,{'Message-ID': '<22052556.1075842030013.JavaMa...,<22052556.1075842030013.JavaMail.evans@thyme>,"Tue, 27 Nov 2001 11:52:45 -0800 (PST)",john.zufferli@enron.com,jeanie.slone@enron.com,Calgary Analyst/Associate,1.0,text/plain; charset=us-ascii,7bit,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Slone, Jeanie </O=ENRON/OU=NA/CN=RECIPIENTS/CN...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\nAnalyst\t\t\t\t\tRank\n\nStephane Brodeur\t\...,Analyst\t\t\t\t\tRank\nStephane Brodeur\t\t\t1...


In [6]:
target_id = "<25857288.1075843363920.JavaMail.evans@thyme>"

In [7]:
parsed_df[parsed_df['Message-ID'] == target_id].iloc[0]['Body']

'\nThe original message was received at Fri, 16 Feb 2001 14:54:24 GMT\nfrom mailman.enron.com [192.168.189.66]\n\n   ----- The following addresses had permanent fatal errors -----\n<hgovenar@mail.acom2.com>\n\n   ----- Transcript of session follows -----\n... while talking to mail.acom2.com:\n>>> RCPT To:<hgovenar@mail.acom2.com>\n<<< 550 <hgovenar@mail.acom2.com>... Relaying denied, Momo.\n550 <hgovenar@mail.acom2.com>... User unknown\n\n   ----- Original message follows -----\n\nReturn-Path: <Jeff.Dasovich@enron.com>\nReceived: from mailman.enron.com (mailman.enron.com [192.168.189.66])\n by postmaster.enron.com (8.8.8/8.8.8/postmaster-1.00) with ESMTP id OAA18632\n for <hgovenar@mail.acom2.com>; Fri, 16 Feb 2001 14:54:24 GMT\nFrom: Jeff.Dasovich@enron.com\nReceived: from nahou-msmsw01px.corp.enron.com ([172.28.10.37])\n by mailman.enron.com (8.10.1/8.10.1/corp-1.05) with ESMTP id f1GEsNa21936\n for <hgovenar@mail.acom2.com>; Fri, 16 Feb 2001 08:54:23 -0600 (CST)\nReceived: from ene-

In [8]:
parsed_df[parsed_df['Message-ID'] == target_id].iloc[0]['cleaned_body']

'Subject: Ken Lay\'s email to Sen. Brulte\nThe original message was received at Fri, 16 Feb 2001 14:54:24 GMT\nfrom mailman.enron.com [192.168.189.66]\n   ----- The following addresses had permanent fatal errors -----\n   ----- Transcript of session follows -----\n... while talking to mail.acom2.com:\n   ----- Original message follows -----\nReceived: from mailman.enron.com (mailman.enron.com [192.168.189.66])\n by postmaster.enron.com (8.8.8/8.8.8/postmaster-1.00) with ESMTP id OAA18632\nReceived: from nahou-msmsw01px.corp.enron.com ([172.28.10.37])\n by mailman.enron.com (8.10.1/8.10.1/corp-1.05) with ESMTP id f1GEsNa21936\nReceived: from ene-mta01.enron.com (unverified) by \nnahou-msmsw01px.corp.enron.com\n (Content Technologies SMTPRS 4.1.5) with ESMTP id \n Fri, 16 Feb 2001 08:54:12 -0600\nX-Mailer: Lotus Notes Release 5.0.3  March 21, 2000\nDate: Fri, 16 Feb 2001 08:49:14 -0600\nX-MIMETrack: Serialize by Router on ENE-MTA01/Enron(Release 5.0.6 |December \n14, 2000) at\n 02/16/200

In [9]:
parsed_df.iloc[24000]['Body']


'\nItinerary for Sally Beck\nLondon\nJanuary 14-19, 2001\n\n\nTues, Jan 14  Continental Flt. CO 4 (Seat 01K)  \n    6:55 pm Depart Houston-IAH\n    9:55 am Arrive London-Gatwick\n\nHotel:   47 Park Street Hotel\n    47 Park Street\n    Mayfair, London, W1Y 4EB\n    United Kingdom\n    Phone:  011-44-207-491-7282\n    Fax:  011-44-207-491-7281\n    \nContact:   Niki Scott\n    Enron House\n    40 Grosvenor Place\nLondon SWIX 7EN\nPhone:  011-44-207-783-7533\nFax:  011-44-207-783-8025\n\nFri, Jan 19  Continental Flt. 35 (Seat 01E)\n    9:40 am Depart London-Gatwick\n    2:10 pm Arrive Houston-IAH\n\nTime Difference in London (+6 hours from Houston)'

In [10]:
parsed_df.iloc[24000]['cleaned_body']

'Itinerary for Sally Beck\nLondon\nJanuary 14-19, 2001\nTues, Jan 14  Continental Flt. CO 4 (Seat 01K)  \n    6:55 pm Depart Houston-IAH\n    9:55 am Arrive London-Gatwick\nHotel:   47 Park Street Hotel\n    47 Park Street\n    Mayfair, London, W1Y 4EB\n    United Kingdom\n    Phone:  011-44-207-491-7282\n    Fax:  011-44-207-491-7281\nContact:   Niki Scott\n    Enron House\n    40 Grosvenor Place\nLondon SWIX 7EN\nPhone:  011-44-207-783-7533\nFax:  011-44-207-783-8025\nFri, Jan 19  Continental Flt. 35 (Seat 01E)\n    9:40 am Depart London-Gatwick\n    2:10 pm Arrive Houston-IAH\nTime Difference in London (+6 hours from Houston)'

In [11]:
# Recherche avec des motifs supplémentaires
contains_attachments = parsed_df[parsed_df['cleaned_body'].str.contains(r'<< File:|Attachment|\.pdf|\.docx|Enclosed|Attached', na=False, case=False)]

# Affichage des emails contenant des pièces jointes
print(contains_attachments[['cleaned_body']])


                                             cleaned_body
18      Subject: Westgate\nWestgate\nEnclosed are demo...
23      Liane,\n As we discussed yesterday, I am conce...
24      Liane,\n As we discussed yesterday, I am conce...
41      Subject: Westgate Proforma-Phillip Allen.xls\n...
47      Lucy,\nYou wrote fewer checks this month.  Spe...
...                                                   ...
516994  John,\nAttached is a spreadsheet that outlines...
516999  Further to our meeting today, please find atta...
517003  Hi John,\nThe attached are the pictures of the...
517038  Subject: Trades for 3/1/02\n <<Trades for 3/1/...
517148  The import/export units are fine, here is the ...

[90421 rows x 1 columns]


In [36]:
contains_attachments.iloc[5]['cleaned_body']

'Subject: Contact list for mid market\nMichael Etringer\n09/11/2000 02:32 PM\nPhillip,\nAttached is the list.  Have your people fill in the columns highlighted in \nyellow.  As best can we will try not to overlap on accounts. \nThanks, Mike'

In [12]:
# Identifier les emails contenant des pièces jointes
attachment_keywords = r"(See attached file:|Content-Type:|Content-Disposition:|Content-Transfer-Encoding:)"
mails_with_attachments = parsed_df[parsed_df['cleaned_body'].str.contains(attachment_keywords, na=False, case=False)]

# Afficher les mails identifiés
print(mails_with_attachments[['cleaned_body']])


  mails_with_attachments = parsed_df[parsed_df['cleaned_body'].str.contains(attachment_keywords, na=False, case=False)]


                                             cleaned_body
349     Subject: Nondeliverable mail\nCary,\nHere is t...
350     Subject: Nondeliverable mail\n------Transcript...
382     Subject: Sagewood M/F\n(See attached file: out...
397     Subject: Sagewood Phase II\nAndrew M Ozuna\n03...
873     Subject: RADIANT HEATING\nDate: Tue, 4 Jan 200...
...                                                   ...
516832  Andy & Mike -\nAttached are our revisions to t...
516837  Attached is the document that will serve as th...
516908  (See attached file: Powder-Room_optimized.jpg)...
516910  The guy in some of the shots is the vendor.  T...
517038  Subject: Trades for 3/1/02\n <<Trades for 3/1/...

[9812 rows x 1 columns]


In [13]:
mails_with_attachments.iloc[27]['cleaned_body']

"Subject: \nI had Devon put together a resume for you to get an idea of his experience to \ndate. He's not our typical MBA Associate, but I feel he has great potential \nas a junior trader.  I'll call you later to discuss. Per\n\tEnron Capital & Trade Resources Corp.\nPM\nhope this is better.\nits best if i speak to who ever i need to outside of the office when i can \nspeak a little more freely ie after 5:30 i stuck my mobile # on it\n(See attached file: DKB CV.doc)\n - DKB CV.doc"

In [14]:
import re

def clean_email_keep_body(body):
    """
    Nettoie les pièces jointes d'un email tout en conservant le texte principal.
    - Supprime les références textuelles aux fichiers attachés (e.g., (See attached file: ...)).
    - Supprime les métadonnées des pièces jointes (Content-Type, Content-Disposition, etc.).
    - Supprime le contenu encodé en base64.
    """
    if not isinstance(body, str):  # Vérifie que le contenu est une chaîne
        return body

    # Étape 1 : Supprimer les références textuelles aux fichiers attachés
    body = re.sub(r"\(See attached file: .*?\)", "", body)

    # Étape 2 : Supprimer les métadonnées liées aux pièces jointes (Content-Type, Content-Disposition, etc.)
    body = re.sub(r"(?is)Content-Type:.*?boundary=.*?\n|Content-Disposition:.*?\n|Content-Transfer-Encoding:.*?\n", "", body)

    # Étape 3 : Supprimer les sections encodées en base64 (longues séquences de caractères alphanumériques)
    body = re.sub(r"(?s)(?:[A-Za-z0-9+/]{60,}\n)+", "", body)

    # Étape 4 : Nettoyer les lignes vides et réduire les multiples sauts de ligne
    body = re.sub(r"\n{2,}", "\n", body).strip()

    return body


In [15]:
parsed_df['without_attachement'] = parsed_df['cleaned_body'].apply(clean_email_keep_body)

In [16]:
target_id = "<25857288.1075843363920.JavaMail.evans@thyme>"

In [17]:
parsed_df[parsed_df['Message-ID'] == target_id].iloc[0]['without_attachement']

'Subject: Ken Lay\'s email to Sen. Brulte\nThe original message was received at Fri, 16 Feb 2001 14:54:24 GMT\nfrom mailman.enron.com [192.168.189.66]\n   ----- The following addresses had permanent fatal errors -----\n   ----- Transcript of session follows -----\n... while talking to mail.acom2.com:\n   ----- Original message follows -----\nReceived: from mailman.enron.com (mailman.enron.com [192.168.189.66])\n by postmaster.enron.com (8.8.8/8.8.8/postmaster-1.00) with ESMTP id OAA18632\nReceived: from nahou-msmsw01px.corp.enron.com ([172.28.10.37])\n by mailman.enron.com (8.10.1/8.10.1/corp-1.05) with ESMTP id f1GEsNa21936\nReceived: from ene-mta01.enron.com (unverified) by \nnahou-msmsw01px.corp.enron.com\n (Content Technologies SMTPRS 4.1.5) with ESMTP id \n Fri, 16 Feb 2001 08:54:12 -0600\nX-Mailer: Lotus Notes Release 5.0.3  March 21, 2000\nDate: Fri, 16 Feb 2001 08:49:14 -0600\nX-MIMETrack: Serialize by Router on ENE-MTA01/Enron(Release 5.0.6 |December \n14, 2000) at\n 02/16/200

In [18]:
# Identifier les emails contenant des pièces jointes
attachment_keywords = r"(See attached file:|Content-Type:|Content-Disposition:|Content-Transfer-Encoding:)"
mails_with_attachments_1 = parsed_df[parsed_df['without_attachement'].str.contains(attachment_keywords, na=False, case=False)]

# Afficher les mails identifiés
print(mails_with_attachments_1[['without_attachement']])

  mails_with_attachments_1 = parsed_df[parsed_df['without_attachement'].str.contains(attachment_keywords, na=False, case=False)]


                                      without_attachement
873     Subject: RADIANT HEATING\nDate: Tue, 4 Jan 200...
1313    Subject:\tRevised High Level Design\nPhillip/T...
1364    Subject:\tRevised High Level Design-Sign-off f...
1784    Subject: RADIANT HEATING\nDate: Tue, 4 Jan 200...
3032    Subject: RADIANT HEATING\nDate: Tue, 4 Jan 200...
...                                                   ...
516216  Hi Andy,\nI'm not sure if Andy Lewis explained...
516832  Andy & Mike -\nAttached are our revisions to t...
516908  (See attached file:\nLaundry_optimized.jpg)(Se...
516910  The guy in some of the shots is the vendor.  T...
517038  Subject: Trades for 3/1/02\n <<Trades for 3/1/...

[3002 rows x 1 columns]


In [26]:
mails_with_attachments_1.iloc[302]['without_attachement']

'Lots on CA; EPSA\'s views; and much from some Enron rep named "Sue Marrow."\n----- Forwarded by Susan J Mara/NA/Enron on 10/31/2000 09:56 AM -----\n\t10/29/2000 06:17 PM\n\tPlease respond to season\n\t\t Subject: Restructuring Today, Monday October 30, 2000\n(see attached file:  rt001030.pdf)\nThank you,\nSeason Hawksley\nUS Publishing\n1-800-486-8201\nwww.restructuringtoday.com\n - rt001030.pdf'

In [29]:
# Recherche avec des motifs supplémentaires
contains_attachments = parsed_df[parsed_df['without_attachement'].str.contains(r'<< File:|\.pdf|\.docx|Enclosed', na=False, case=False)]

# Affichage des emails contenant des pièces jointes
print(contains_attachments[['without_attachement']])

                                      without_attachement
18      Subject: Westgate\nWestgate\nEnclosed are demo...
41      Subject: Westgate Proforma-Phillip Allen.xls\n...
51      Subject: Westgate Proforma-Phillip Allen.xls\n...
57      Subject: Westgate Proforma-Phillip Allen.xls\n...
68      Subject: FYI\n\tEnron North America Corp.\n--\...
...                                                   ...
515883  The enclosed spreadsheet shows the net online ...
515960  USCAA Track & Field Championships, June 9, 200...
516834  <<EDSCube_NY_Feb2.ppt>>\nGreg,\nEnclosed is th...
516964  Transmission Expansion and Systems in Transiti...
517148  The import/export units are fine, here is the ...

[10835 rows x 1 columns]


In [30]:
contains_attachments.iloc[302]['without_attachement']

"Hi Don,\nI've passed along your e-mail address to be added to our customer\ndistribution list.  I've also attached the Spring 2001 issue of our customer\nnewsletter, the AP Transmission Report.  It contains some information on the\nNext-hour Market product that you may find to be useful.\n <<TM Newsletter #3 (Spring 2001).pdf>>\nThank you,\nCara\nCara Gigliotti\nTransmission Marketing\nAllegheny Power\n724-838-6944\n - TM Newsletter #3 (Spring 2001).pdf"

In [32]:
import re

def clean_email_remove__new_kind_attachments(body):
    """
    Cleans email content by removing references to attachments while retaining the main email body.
    - Removes inline attachment references (e.g., <<filename>>).
    - Removes plain text references (e.g., - filename.pdf).
    """
    if not isinstance(body, str):  # Ensure input is a string
        return body

    # Step 1: Remove inline attachment references (e.g., <<filename.pdf>>)
    body = re.sub(r"<<.*?>>", "", body)

    # Step 2: Remove plain text references to attachments (e.g., - filename.pdf)
    body = re.sub(r"(?m)^\s*-\s+.*\.(pdf|docx?|xlsx?|jpg|png|txt|csv)\s*$", "", body)

    # Step 3: Normalize whitespace and remove excessive empty lines
    body = re.sub(r"\n{2,}", "\n", body).strip()

    return body


In [33]:
parsed_df['without_attachement'] = parsed_df['without_attachement'].apply(clean_email_remove__new_kind_attachments)

In [34]:
# Recherche avec des motifs supplémentaires
contains_attachments = parsed_df[parsed_df['without_attachement'].str.contains(r'<< File:|\.pdf|\.docx|Enclosed', na=False, case=False)]

# Affichage des emails contenant des pièces jointes
print(contains_attachments[['without_attachement']])

                                      without_attachement
18      Subject: Westgate\nWestgate\nEnclosed are demo...
41      Subject: Westgate Proforma-Phillip Allen.xls\n...
51      Subject: Westgate Proforma-Phillip Allen.xls\n...
57      Subject: Westgate Proforma-Phillip Allen.xls\n...
158     Subject: gas storage model\nZimin Lu\n06/14/20...
...                                                   ...
515509  eSource presents Free Global Energy Markets Pr...
515883  The enclosed spreadsheet shows the net online ...
515960  USCAA Track & Field Championships, June 9, 200...
516834  Greg,\nEnclosed is the presentation that Jean-...
516964  Transmission Expansion and Systems in Transiti...

[6689 rows x 1 columns]


In [36]:
contains_attachments.iloc[312]['without_attachement']

'In late November, you were asked to participate in the =01&Excellence Throu=\ngh=20\nCommunications=018 survey.  We want to thank you for your outstanding respo=\nnse.=20\n48% of ENA employees spent time giving us valuable feedback regarding Visio=\nn=20\nand Values. This was our best response to date.   You may view the specific=\n=20\ndata by clicking on this link.  =20\nhttp://home.ena.enron.com/content/mi/files/etc2000_ena.pdf\nOverall, your feedback tells us that efforts in several key areas are havin=\ng=20\nan impact.  They include:\n? Access to information\n? Access to and utilization of people resources\n? Performance Management Process\nSome areas need further understanding and alignment to create an environmen=\nt=20\nwhere you can achieve your personal best. They are:\n?  Communications, especially across departments and regarding strategic=20\ndirection=20\n?  IT support\n?  Teamwork/ Work Process=20\n?  Support for expressing opinions and challenging the status quo\n?  L

In [38]:
parsed_df.to_csv("without_attachement.csv", index=False)

In [39]:
clean = pd.read_csv('without_attachement.csv')

In [40]:
clean

Unnamed: 0,file,message,parsed_email,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,...,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,Body,cleaned_body,without_attachement
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,{'Message-ID': '<18782981.1075855378110.JavaMa...,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,1.0,text/plain; charset=us-ascii,...,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,\nHere is our forecast\n\n,Here is our forecast,Here is our forecast
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,{'Message-ID': '<15464986.1075855378456.JavaMa...,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:,1.0,text/plain; charset=us-ascii,...,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,\nTraveling to have a business meeting takes t...,Traveling to have a business meeting takes the...,Traveling to have a business meeting takes the...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,{'Message-ID': '<24216240.1075855687451.JavaMa...,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,1.0,text/plain; charset=us-ascii,...,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,\ntest successful. way to go!!!,test successful. way to go!!!,test successful. way to go!!!
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,{'Message-ID': '<13505866.1075863688222.JavaMa...,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,1.0,text/plain; charset=us-ascii,...,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"\nRandy,\n\n Can you send me a schedule of the...","Randy,\n Can you send me a schedule of the sal...","Randy,\n Can you send me a schedule of the sal..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,{'Message-ID': '<30922949.1075863688243.JavaMa...,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,1.0,text/plain; charset=us-ascii,...,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,\nLet's shoot for Tuesday at 11:45.,Let's shoot for Tuesday at 11:45.,Let's shoot for Tuesday at 11:45.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517396,zufferli-j/sent_items/95.,Message-ID: <26807948.1075842029936.JavaMail.e...,{'Message-ID': '<26807948.1075842029936.JavaMa...,<26807948.1075842029936.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 13:30:11 -0800 (PST)",john.zufferli@enron.com,kori.loibl@enron.com,Trade with John Lavorato,1.0,text/plain; charset=us-ascii,...,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Loibl, Kori </O=ENRON/OU=NA/CN=RECIPIENTS/CN=K...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\nThis is a trade with OIL-SPEC-HEDGE-NG (John...,This is a trade with OIL-SPEC-HEDGE-NG (John L...,This is a trade with OIL-SPEC-HEDGE-NG (John L...
517397,zufferli-j/sent_items/96.,Message-ID: <25835861.1075842029959.JavaMail.e...,{'Message-ID': '<25835861.1075842029959.JavaMa...,<25835861.1075842029959.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 12:47:48 -0800 (PST)",john.zufferli@enron.com,john.lavorato@enron.com,Gas Hedges,1.0,text/plain; charset=us-ascii,...,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Lavorato, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\nSome of my position is with the Alberta Term...,Some of my position is with the Alberta Term b...,Some of my position is with the Alberta Term b...
517398,zufferli-j/sent_items/97.,Message-ID: <28979867.1075842029988.JavaMail.e...,{'Message-ID': '<28979867.1075842029988.JavaMa...,<28979867.1075842029988.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 07:20:00 -0800 (PST)",john.zufferli@enron.com,dawn.doucet@enron.com,RE: CONFIDENTIAL,1.0,text/plain; charset=us-ascii,...,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Doucet, Dawn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\n2\n\n -----Original Message-----\nFrom: \tDo...,2,2
517399,zufferli-j/sent_items/98.,Message-ID: <22052556.1075842030013.JavaMail.e...,{'Message-ID': '<22052556.1075842030013.JavaMa...,<22052556.1075842030013.JavaMail.evans@thyme>,"Tue, 27 Nov 2001 11:52:45 -0800 (PST)",john.zufferli@enron.com,jeanie.slone@enron.com,Calgary Analyst/Associate,1.0,text/plain; charset=us-ascii,...,"Zufferli, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Slone, Jeanie </O=ENRON/OU=NA/CN=RECIPIENTS/CN...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\nAnalyst\t\t\t\t\tRank\n\nStephane Brodeur\t\...,Analyst\t\t\t\t\tRank\nStephane Brodeur\t\t\t1...,Analyst\t\t\t\t\tRank\nStephane Brodeur\t\t\t1...


In [45]:
clean.iloc[201]['without_attachement']

'Beth,\nHere are our addresses for DSL lines:\nHunter Shively\n10545 Gawain\nHouston, TX 77024\n713 461-4130\nPhillip Allen\n8855 Merlin Ct\nHouston, TX 77055\n713 463-8626\nMike Grigsby\n6201 Meadow Lake\nHouston, TX 77057\n713 780-1022\nThanks\nPhillip'

In [47]:
clean[clean['Message-ID'] == target_id].iloc[0]['without_attachement']

'Subject: Ken Lay\'s email to Sen. Brulte\nThe original message was received at Fri, 16 Feb 2001 14:54:24 GMT\nfrom mailman.enron.com [192.168.189.66]\n   ----- The following addresses had permanent fatal errors -----\n   ----- Transcript of session follows -----\n... while talking to mail.acom2.com:\n   ----- Original message follows -----\nReceived: from mailman.enron.com (mailman.enron.com [192.168.189.66])\n by postmaster.enron.com (8.8.8/8.8.8/postmaster-1.00) with ESMTP id OAA18632\nReceived: from nahou-msmsw01px.corp.enron.com ([172.28.10.37])\n by mailman.enron.com (8.10.1/8.10.1/corp-1.05) with ESMTP id f1GEsNa21936\nReceived: from ene-mta01.enron.com (unverified) by \nnahou-msmsw01px.corp.enron.com\n (Content Technologies SMTPRS 4.1.5) with ESMTP id \n Fri, 16 Feb 2001 08:54:12 -0600\nX-Mailer: Lotus Notes Release 5.0.3  March 21, 2000\nDate: Fri, 16 Feb 2001 08:49:14 -0600\nX-MIMETrack: Serialize by Router on ENE-MTA01/Enron(Release 5.0.6 |December \n14, 2000) at\n 02/16/200

In [52]:
nb_nan = clean['without_attachement'].isna().sum()
print(f"Il y a {nb_nan} NaN dans la colonne 'Body'.")


Il y a 13062 NaN dans la colonne 'Body'.


In [None]:
clean['without_attachement'] = clean['without_attachement'].fillna("")  # Remplir les NaN avec ""
