In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
from tqdm import tqdm

tokenizer = AutoTokenizer.from_pretrained("dima806/email-spam-detection-roberta")
model = AutoModelForSequenceClassification.from_pretrained("dima806/email-spam-detection-roberta")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('/Users/armandbidault/Downloads/df_final_Ali_gte-base-en-v1.5.csv')

In [4]:
data["final_body"] = data["final_body"].fillna("").astype(str)
emails = data["final_body"].tolist()


In [5]:
inputs = tokenizer(emails, padding=True, truncation=True, max_length=512, return_tensors="pt")

In [6]:
# Configurer le modèle pour le GPU si disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Diviser les emails en lots pour un traitement efficace
batch_size = 16
predictions = []

for i in tqdm(range(0, len(emails), batch_size), desc="Processing emails"):
    batch_emails = emails[i:i + batch_size]
    
    # Tokenisation avec troncation explicite
    inputs = tokenizer(batch_emails, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        batch_predictions = torch.argmax(logits, dim=1).cpu().numpy()
        predictions.extend(batch_predictions)

# Ajouter les prédictions au DataFrame
data["spam"] = predictions

# Sauvegarder le résultat
data.to_csv("emails_with_predictions.csv", index=False)
print("Prédictions enregistrées dans 'emails_with_predictions.csv'")

Processing emails:   0%|          | 0/32338 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing emails: 100%|██████████| 32338/32338 [49:36:34<00:00,  5.52s/it]      


Prédictions enregistrées dans 'emails_with_predictions.csv'


In [8]:
data['spam'].value_counts()

spam
0    434019
1     83382
Name: count, dtype: int64

In [9]:
data_clean = data[data['spam'] == 0]

In [10]:
data_clean.to_csv("NoSpam_emails.csv", index=False)

In [2]:
test = pd.read_csv('NoSpam_emails.csv')
test

Unnamed: 0,file,message,parsed_email,Message-ID,X-FileName,Body,final_body,embedding,spam
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,{'Message-ID': '<18782981.1075855378110.JavaMa...,<18782981.1075855378110.JavaMail.evans@thyme>,pallen (Non-Privileged).pst,\nHere is our forecast\n\n,forecast,"[-0.2228, -0.1683, 0.414, -0.1859, 0.7666, -0....",0
1,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,{'Message-ID': '<13505866.1075863688222.JavaMa...,<13505866.1075863688222.JavaMail.evans@thyme>,pallen.nsf,"\nRandy,\n\n Can you send me a schedule of the...","Randy, send schedule salary level everyone sch...","[0.4868, 0.3755, -0.836, -0.723, 1.365, -0.114...",0
2,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,{'Message-ID': '<30922949.1075863688243.JavaMa...,<30922949.1075863688243.JavaMail.evans@thyme>,pallen.nsf,\nLet's shoot for Tuesday at 11:45.,Let's shoot Tuesday 11:45.,"[-0.03079, 0.02507, -0.4038, -0.3984, 1.139, 0...",0
3,allen-p/_sent_mail/1002.,Message-ID: <30965995.1075863688265.JavaMail.e...,{'Message-ID': '<30965995.1075863688265.JavaMa...,<30965995.1075863688265.JavaMail.evans@thyme>,pallen.nsf,"\nGreg,\n\n How about either next Tuesday or T...","Greg, either next Tuesday Thursday? Phillip","[-0.4834, 0.2832, -1.531, -0.6226, 0.0867, 0.4...",0
4,allen-p/_sent_mail/1003.,Message-ID: <16254169.1075863688286.JavaMail.e...,{'Message-ID': '<16254169.1075863688286.JavaMa...,<16254169.1075863688286.JavaMail.evans@thyme>,pallen.nsf,\nPlease cc the following distribution list wi...,Please cc following distribution list updates:...,"[0.2007, -0.7554, -0.322, -0.1583, 0.454, -0.4...",0
...,...,...,...,...,...,...,...,...,...
434014,zufferli-j/sent_items/89.,Message-ID: <24358278.1075842029773.JavaMail.e...,{'Message-ID': '<24358278.1075842029773.JavaMa...,<24358278.1075842029773.JavaMail.evans@thyme>,john zufferli 6-26-02.PST,\n\nEnron is willing to perform the operation ...,Enron willing perform operation question (1) (...,"[0.00992, -0.2651, -0.605, 0.1359, 1.509, 0.04...",0
434015,zufferli-j/sent_items/91.,Message-ID: <23829224.1075842029820.JavaMail.e...,{'Message-ID': '<23829224.1075842029820.JavaMa...,<23829224.1075842029820.JavaMail.evans@thyme>,john zufferli 6-26-02.PST,"\nNicole and everyone, I think the directory i...","Nicole everyone, think directory i:\canadian\e...","[1.218, -0.943, 0.1938, 1.032, 0.4238, -0.3433...",0
434016,zufferli-j/sent_items/95.,Message-ID: <26807948.1075842029936.JavaMail.e...,{'Message-ID': '<26807948.1075842029936.JavaMa...,<26807948.1075842029936.JavaMail.evans@thyme>,john zufferli 6-26-02.PST,\nThis is a trade with OIL-SPEC-HEDGE-NG (John...,trade OIL-SPEC-HEDGE-NG (John Lavorato's book)...,"[0.2764, -0.03079, -0.6875, 0.05423, -1.289, 0...",0
434017,zufferli-j/sent_items/96.,Message-ID: <25835861.1075842029959.JavaMail.e...,{'Message-ID': '<25835861.1075842029959.JavaMa...,<25835861.1075842029959.JavaMail.evans@thyme>,john zufferli 6-26-02.PST,\nSome of my position is with the Alberta Term...,"position Alberta Term book, send positions dir...","[0.7695, -0.5005, -1.423, -0.1415, -0.4844, 0....",0


In [5]:
name = pd.read_csv('clean_emails_without_attachement.csv')
name

Unnamed: 0,file,message,parsed_email,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,...,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,Body,cleaned_body,without_attachement,final_body
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,{'Message-ID': '<18782981.1075855378110.JavaMa...,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,1.0,text/plain; charset=us-ascii,...,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,\nHere is our forecast\n\n,Here is our forecast,Here is our forecast,forecast
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,{'Message-ID': '<15464986.1075855378456.JavaMa...,<15464986.1075855378456.JavaMail.evans@thyme>,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:,1.0,text/plain; charset=us-ascii,...,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,\nTraveling to have a business meeting takes t...,Traveling to have a business meeting takes the...,Traveling to have a business meeting takes the...,Traveling business meeting takes fun trip. Esp...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,{'Message-ID': '<24216240.1075855687451.JavaMa...,<24216240.1075855687451.JavaMail.evans@thyme>,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,1.0,text/plain; charset=us-ascii,...,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,\ntest successful. way to go!!!,test successful. way to go!!!,test successful. way to go!!!,test successful. way go!!!
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,{'Message-ID': '<13505866.1075863688222.JavaMa...,<13505866.1075863688222.JavaMail.evans@thyme>,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,1.0,text/plain; charset=us-ascii,...,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"\nRandy,\n\n Can you send me a schedule of the...","Randy,\n Can you send me a schedule of the sal...","Randy,\n Can you send me a schedule of the sal...","Randy, send schedule salary level everyone sch..."
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,{'Message-ID': '<30922949.1075863688243.JavaMa...,<30922949.1075863688243.JavaMail.evans@thyme>,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,1.0,text/plain; charset=us-ascii,...,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,\nLet's shoot for Tuesday at 11:45.,Let's shoot for Tuesday at 11:45.,Let's shoot for Tuesday at 11:45.,Let's shoot Tuesday 11:45.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517396,zufferli-j/sent_items/95.,Message-ID: <26807948.1075842029936.JavaMail.e...,{'Message-ID': '<26807948.1075842029936.JavaMa...,<26807948.1075842029936.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 13:30:11 -0800 (PST)",john.zufferli@enron.com,kori.loibl@enron.com,Trade with John Lavorato,1.0,text/plain; charset=us-ascii,...,"Loibl, Kori </O=ENRON/OU=NA/CN=RECIPIENTS/CN=K...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\nThis is a trade with OIL-SPEC-HEDGE-NG (John...,This is a trade with OIL-SPEC-HEDGE-NG (John L...,This is a trade with OIL-SPEC-HEDGE-NG (John L...,trade OIL-SPEC-HEDGE-NG (John Lavorato's book)...
517397,zufferli-j/sent_items/96.,Message-ID: <25835861.1075842029959.JavaMail.e...,{'Message-ID': '<25835861.1075842029959.JavaMa...,<25835861.1075842029959.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 12:47:48 -0800 (PST)",john.zufferli@enron.com,john.lavorato@enron.com,Gas Hedges,1.0,text/plain; charset=us-ascii,...,"Lavorato, John </O=ENRON/OU=NA/CN=RECIPIENTS/C...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\nSome of my position is with the Alberta Term...,Some of my position is with the Alberta Term b...,Some of my position is with the Alberta Term b...,"position Alberta Term book, send positions dir..."
517398,zufferli-j/sent_items/97.,Message-ID: <28979867.1075842029988.JavaMail.e...,{'Message-ID': '<28979867.1075842029988.JavaMa...,<28979867.1075842029988.JavaMail.evans@thyme>,"Wed, 28 Nov 2001 07:20:00 -0800 (PST)",john.zufferli@enron.com,dawn.doucet@enron.com,RE: CONFIDENTIAL,1.0,text/plain; charset=us-ascii,...,"Doucet, Dawn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\n2\n\n -----Original Message-----\nFrom: \tDo...,2,2,2
517399,zufferli-j/sent_items/98.,Message-ID: <22052556.1075842030013.JavaMail.e...,{'Message-ID': '<22052556.1075842030013.JavaMa...,<22052556.1075842030013.JavaMail.evans@thyme>,"Tue, 27 Nov 2001 11:52:45 -0800 (PST)",john.zufferli@enron.com,jeanie.slone@enron.com,Calgary Analyst/Associate,1.0,text/plain; charset=us-ascii,...,"Slone, Jeanie </O=ENRON/OU=NA/CN=RECIPIENTS/CN...",,,"\ExMerge - Zufferli, John\Sent Items",ZUFFERLI-J,john zufferli 6-26-02.PST,\nAnalyst\t\t\t\t\tRank\n\nStephane Brodeur\t\...,Analyst\t\t\t\t\tRank\nStephane Brodeur\t\t\t1...,Analyst\t\t\t\t\tRank\nStephane Brodeur\t\t\t1...,Analyst Rank Stephane Brodeur 1 Chad Clark 1 I...


In [6]:
test_merged_from = pd.merge(
    test,
    name[['Message-ID', 'From']],  # on se limite aux colonnes utiles
    on='Message-ID',               # clé de jointure
    how='left'                     # type de jointure (left, right, inner, outer)
)

test_merged_from

Unnamed: 0,file,message,parsed_email,Message-ID,X-FileName,Body,final_body,embedding,spam,From
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,{'Message-ID': '<18782981.1075855378110.JavaMa...,<18782981.1075855378110.JavaMail.evans@thyme>,pallen (Non-Privileged).pst,\nHere is our forecast\n\n,forecast,"[-0.2228, -0.1683, 0.414, -0.1859, 0.7666, -0....",0,phillip.allen@enron.com
1,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,{'Message-ID': '<13505866.1075863688222.JavaMa...,<13505866.1075863688222.JavaMail.evans@thyme>,pallen.nsf,"\nRandy,\n\n Can you send me a schedule of the...","Randy, send schedule salary level everyone sch...","[0.4868, 0.3755, -0.836, -0.723, 1.365, -0.114...",0,phillip.allen@enron.com
2,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,{'Message-ID': '<30922949.1075863688243.JavaMa...,<30922949.1075863688243.JavaMail.evans@thyme>,pallen.nsf,\nLet's shoot for Tuesday at 11:45.,Let's shoot Tuesday 11:45.,"[-0.03079, 0.02507, -0.4038, -0.3984, 1.139, 0...",0,phillip.allen@enron.com
3,allen-p/_sent_mail/1002.,Message-ID: <30965995.1075863688265.JavaMail.e...,{'Message-ID': '<30965995.1075863688265.JavaMa...,<30965995.1075863688265.JavaMail.evans@thyme>,pallen.nsf,"\nGreg,\n\n How about either next Tuesday or T...","Greg, either next Tuesday Thursday? Phillip","[-0.4834, 0.2832, -1.531, -0.6226, 0.0867, 0.4...",0,phillip.allen@enron.com
4,allen-p/_sent_mail/1003.,Message-ID: <16254169.1075863688286.JavaMail.e...,{'Message-ID': '<16254169.1075863688286.JavaMa...,<16254169.1075863688286.JavaMail.evans@thyme>,pallen.nsf,\nPlease cc the following distribution list wi...,Please cc following distribution list updates:...,"[0.2007, -0.7554, -0.322, -0.1583, 0.454, -0.4...",0,phillip.allen@enron.com
...,...,...,...,...,...,...,...,...,...,...
434014,zufferli-j/sent_items/89.,Message-ID: <24358278.1075842029773.JavaMail.e...,{'Message-ID': '<24358278.1075842029773.JavaMa...,<24358278.1075842029773.JavaMail.evans@thyme>,john zufferli 6-26-02.PST,\n\nEnron is willing to perform the operation ...,Enron willing perform operation question (1) (...,"[0.00992, -0.2651, -0.605, 0.1359, 1.509, 0.04...",0,john.zufferli@enron.com
434015,zufferli-j/sent_items/91.,Message-ID: <23829224.1075842029820.JavaMail.e...,{'Message-ID': '<23829224.1075842029820.JavaMa...,<23829224.1075842029820.JavaMail.evans@thyme>,john zufferli 6-26-02.PST,"\nNicole and everyone, I think the directory i...","Nicole everyone, think directory i:\canadian\e...","[1.218, -0.943, 0.1938, 1.032, 0.4238, -0.3433...",0,john.zufferli@enron.com
434016,zufferli-j/sent_items/95.,Message-ID: <26807948.1075842029936.JavaMail.e...,{'Message-ID': '<26807948.1075842029936.JavaMa...,<26807948.1075842029936.JavaMail.evans@thyme>,john zufferli 6-26-02.PST,\nThis is a trade with OIL-SPEC-HEDGE-NG (John...,trade OIL-SPEC-HEDGE-NG (John Lavorato's book)...,"[0.2764, -0.03079, -0.6875, 0.05423, -1.289, 0...",0,john.zufferli@enron.com
434017,zufferli-j/sent_items/96.,Message-ID: <25835861.1075842029959.JavaMail.e...,{'Message-ID': '<25835861.1075842029959.JavaMa...,<25835861.1075842029959.JavaMail.evans@thyme>,john zufferli 6-26-02.PST,\nSome of my position is with the Alberta Term...,"position Alberta Term book, send positions dir...","[0.7695, -0.5005, -1.423, -0.1415, -0.4844, 0....",0,john.zufferli@enron.com


In [11]:
from typing import List
import pickle
def keep_valid_senders(unique_senders: List[str]) -> List[str]:
    """Keep only valid senders

    Params:
        senders (List[str]): list of senders
    
    Returns:
        List[str]: list of valid senders
    """

    enron_senders = [sender for sender in unique_senders if 'enron' in sender.lower()]
    news_senders = [sender for sender in unique_senders if 'news' in sender.lower()]
    no_reply_senders = [sender for sender in enron_senders if 'no-reply' in sender.lower()]
    with open('not_enron_correct_mails.pkl', 'rb') as f:
        not_enron_correct_mails = pickle.load(f)
    concat_good_senders = enron_senders + not_enron_correct_mails
    # drop the news_senders and no_reply_senders from the concat_good_senders
    # step 1 : from the concat_goo_senders, drop the news senders
    concat_good_senders = [sender for sender in concat_good_senders if sender not in news_senders]
    # step 2 : from the concat_good_senders, drop the no_reply senders
    concat_good_senders = [sender for sender in concat_good_senders if sender not in no_reply_senders]
    # step 3 : from the concat_good_senders, drop the annoncement senders
    concat_good_senders = [sender for sender in concat_good_senders if "announcement" not in sender.lower()]

    return concat_good_senders

In [12]:
keep_valid_senders(test_merged_from['From'].unique())

['phillip.allen@enron.com',
 'ina.rangel@enron.com',
 'critical.notice@enron.com',
 'rebecca.cantrell@enron.com',
 'paul.kaufman@enron.com',
 'public.relations@enron.com',
 'stephanie.miller@enron.com',
 'tracy.arthur@enron.com',
 'sarah.novosel@enron.com',
 'tim.heizenrader@enron.com',
 'frank.hayden@enron.com',
 'kim.ward@enron.com',
 'perfmgmt@enron.com',
 'alyse.herasimchuk@enron.com',
 'lisa.jacobson@enron.com',
 'christi.nicolay@enron.com',
 'richard.shapiro@enron.com',
 'tiffany.miller@enron.com',
 'philip.polsky@enron.com',
 'mark.whitt@enron.com',
 'arsystem@mailman.enron.com',
 'tim.belden@enron.com',
 'outlook-migration-team@enron.com',
 'no.address@enron.com',
 'ray.alvarez@enron.com',
 'w..cantrell@enron.com',
 'savita.puthigai@enron.com',
 'chad.landry@enron.com',
 'veronica.espinoza@enron.com',
 'mike.grigsby@enron.com',
 'kathie.grabstald@enron.com',
 'karen.buckley@enron.com',
 'kathryn.sheppard@enron.com',
 'james.bruce@enron.com',
 'm..tholt@enron.com',
 'randy.bhati

In [14]:
# Appel de la fonction pour obtenir la liste des senders valides
valid_senders = keep_valid_senders(test_merged_from['From'].unique())

# Filtrer le DataFrame en ne gardant que les lignes avec un 'From' valide
filtered_df = test_merged_from[test_merged_from['From'].isin(valid_senders)]

# Afficher ou utiliser le DataFrame filtré
filtered_df


Unnamed: 0,file,message,parsed_email,Message-ID,X-FileName,Body,final_body,embedding,spam,From
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,{'Message-ID': '<18782981.1075855378110.JavaMa...,<18782981.1075855378110.JavaMail.evans@thyme>,pallen (Non-Privileged).pst,\nHere is our forecast\n\n,forecast,"[-0.2228, -0.1683, 0.414, -0.1859, 0.7666, -0....",0,phillip.allen@enron.com
1,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,{'Message-ID': '<13505866.1075863688222.JavaMa...,<13505866.1075863688222.JavaMail.evans@thyme>,pallen.nsf,"\nRandy,\n\n Can you send me a schedule of the...","Randy, send schedule salary level everyone sch...","[0.4868, 0.3755, -0.836, -0.723, 1.365, -0.114...",0,phillip.allen@enron.com
2,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,{'Message-ID': '<30922949.1075863688243.JavaMa...,<30922949.1075863688243.JavaMail.evans@thyme>,pallen.nsf,\nLet's shoot for Tuesday at 11:45.,Let's shoot Tuesday 11:45.,"[-0.03079, 0.02507, -0.4038, -0.3984, 1.139, 0...",0,phillip.allen@enron.com
3,allen-p/_sent_mail/1002.,Message-ID: <30965995.1075863688265.JavaMail.e...,{'Message-ID': '<30965995.1075863688265.JavaMa...,<30965995.1075863688265.JavaMail.evans@thyme>,pallen.nsf,"\nGreg,\n\n How about either next Tuesday or T...","Greg, either next Tuesday Thursday? Phillip","[-0.4834, 0.2832, -1.531, -0.6226, 0.0867, 0.4...",0,phillip.allen@enron.com
4,allen-p/_sent_mail/1003.,Message-ID: <16254169.1075863688286.JavaMail.e...,{'Message-ID': '<16254169.1075863688286.JavaMa...,<16254169.1075863688286.JavaMail.evans@thyme>,pallen.nsf,\nPlease cc the following distribution list wi...,Please cc following distribution list updates:...,"[0.2007, -0.7554, -0.322, -0.1583, 0.454, -0.4...",0,phillip.allen@enron.com
...,...,...,...,...,...,...,...,...,...,...
434014,zufferli-j/sent_items/89.,Message-ID: <24358278.1075842029773.JavaMail.e...,{'Message-ID': '<24358278.1075842029773.JavaMa...,<24358278.1075842029773.JavaMail.evans@thyme>,john zufferli 6-26-02.PST,\n\nEnron is willing to perform the operation ...,Enron willing perform operation question (1) (...,"[0.00992, -0.2651, -0.605, 0.1359, 1.509, 0.04...",0,john.zufferli@enron.com
434015,zufferli-j/sent_items/91.,Message-ID: <23829224.1075842029820.JavaMail.e...,{'Message-ID': '<23829224.1075842029820.JavaMa...,<23829224.1075842029820.JavaMail.evans@thyme>,john zufferli 6-26-02.PST,"\nNicole and everyone, I think the directory i...","Nicole everyone, think directory i:\canadian\e...","[1.218, -0.943, 0.1938, 1.032, 0.4238, -0.3433...",0,john.zufferli@enron.com
434016,zufferli-j/sent_items/95.,Message-ID: <26807948.1075842029936.JavaMail.e...,{'Message-ID': '<26807948.1075842029936.JavaMa...,<26807948.1075842029936.JavaMail.evans@thyme>,john zufferli 6-26-02.PST,\nThis is a trade with OIL-SPEC-HEDGE-NG (John...,trade OIL-SPEC-HEDGE-NG (John Lavorato's book)...,"[0.2764, -0.03079, -0.6875, 0.05423, -1.289, 0...",0,john.zufferli@enron.com
434017,zufferli-j/sent_items/96.,Message-ID: <25835861.1075842029959.JavaMail.e...,{'Message-ID': '<25835861.1075842029959.JavaMa...,<25835861.1075842029959.JavaMail.evans@thyme>,john zufferli 6-26-02.PST,\nSome of my position is with the Alberta Term...,"position Alberta Term book, send positions dir...","[0.7695, -0.5005, -1.423, -0.1415, -0.4844, 0....",0,john.zufferli@enron.com


In [15]:
filtered_df.to_csv("NoSpam_emails.csv", index=False)

In [19]:
filtered_df['embedding'].dtype

dtype('O')

In [20]:
# Vérifie le type du premier élément de la colonne
print(type(filtered_df['embedding'].iloc[0]))

# Vérifie que tous les éléments de la colonne sont des listes
all(isinstance(embedding, list) for embedding in filtered_df['embedding'])

<class 'str'>


False

In [22]:
import ast
from tqdm import tqdm

# Initialiser tqdm pour suivre la progression
tqdm.pandas(desc="Converting embeddings")

# Convertir les chaînes en listes de nombres avec la barre de progression
filtered_df['embedding'] = filtered_df['embedding'].progress_apply(ast.literal_eval)

# Vérifier à nouveau le type du premier élément
print(type(filtered_df['embedding'].iloc[0]))  # Cela devrait maintenant donner <class 'list'>


Converting embeddings: 100%|██████████| 363684/363684 [15:25<00:00, 393.00it/s] 


<class 'list'>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['embedding'] = filtered_df['embedding'].progress_apply(ast.literal_eval)


In [25]:
print(type(filtered_df['embedding'].iloc[100]))


<class 'list'>


In [26]:
filtered_df.to_parquet("NoSpam_emails.parquet")
