In [6]:
# 导入所需库
import os
import email
import sys
from email import policy
from email.parser import BytesParser
from tqdm.notebook import tqdm
import pandas as pd

In [7]:
# 定义解析邮件的函数
def parse_email(file_path):
    with open(file_path, 'rb') as f:
        msg = BytesParser(policy=policy.default).parse(f)
    
    email_data = {
        "From": msg["From"],
        "To": msg["To"],
        "Cc": msg["Cc"],
        "Bcc": msg["Bcc"],
        "Date": msg["Date"],
        "Subject": msg["Subject"],
        "Body": msg.get_body(preferencelist=('plain', 'html')).get_content(),
        "Attachments": [part.get_filename() for part in msg.iter_attachments()]
    }
    
    return email_data


In [8]:
def process_directory(directory_path):
    emails = []
    
    # 首先获取所有文件的总数用于进度条
    all_files = []
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('_'):  # 注意：这里您使用的是endswith('_')，可能需要检查是否正确
                all_files.append(os.path.join(root, file))
    
    # 使用tqdm创建进度条
    for file_path in tqdm(all_files, desc="处理邮件文件"):
        try:
            email_data = parse_email(file_path)
            emails.append(email_data)
        except Exception as e:
            print(f"\n处理文件 {file_path} 时出错: {e}")
            continue
    
    return pd.DataFrame(emails)

In [9]:
import os

def get_subdirectories(path):
    """获取指定路径下所有一级子目录（更高效）"""
    with os.scandir(path) as entries:
        return [entry.name for entry in entries if entry.is_dir()]

# 使用示例
all_pois_emails = get_subdirectories('EnronEmails/EnronEmails')
print(all_pois_emails)

['allen-p', 'arnold-j', 'arora-h', 'badeer-r', 'bailey-s', 'bass-e', 'baughman-d', 'beck-s', 'benson-r', 'blair-l', 'brawner-s', 'buy-r', 'campbell-l', 'carson-m', 'cash-m', 'causholli-m', 'corman-s', 'crandell-s', 'cuilla-m', 'dasovich-j', 'davis-d', 'dean-c', 'delainey-d', 'derrick-j', 'dickson-s', 'donoho-l', 'donohoe-t', 'dorland-c', 'ermis-f', 'farmer-d', 'fischer-m', 'forney-j', 'fossum-d', 'gang-l', 'gay-r', 'geaccone-t', 'germany-c', 'gilbertsmith-d', 'giron-d', 'griffith-j', 'grigsby-m', 'guzman-m', 'haedicke-m', 'hain-m', 'harris-s', 'hayslett-r', 'heard-m', 'hendrickson-s', 'hernandez-j', 'hodge-j', 'holst-k', 'horton-s', 'hyatt-k', 'hyvl-d', 'jones-t', 'kaminski-v', 'kean-s', 'keavey-p', 'keiser-k', 'king-j', 'kitchen-l', 'kuykendall-t', 'lavorato-j', 'lay-k', 'lenhart-m', 'lewis-a', 'linder-e', 'lokay-m', 'lokey-t', 'love-p', 'lucci-p', 'maggi-m', 'mann-k', 'martin-t', 'may-l', 'mccarty-d', 'mcconnell-m', 'mckay-b', 'mckay-j', 'mclaughlin-e', 'merriss-s', 'meyers-a', 'mims

In [10]:
# 指定Enron数据集的路径
directory_path = 'EnronEmails/EnronEmails/allen-p/_sent_mail'

# 处理目录并转换为DataFrame
email_df = process_directory(directory_path)

# 显示部分数据
print(email_df.head())

处理邮件文件:   0%|          | 0/602 [00:00<?, ?it/s]

                      From                                                To  \
0  phillip.allen@enron.com                             randall.gay@enron.com   
1  phillip.allen@enron.com                              greg.piper@enron.com   
2  phillip.allen@enron.com                              greg.piper@enron.com   
3  phillip.allen@enron.com  david.l.johnson@enron.com, john.shafer@enron.com   
4  phillip.allen@enron.com                          joyce.teixeira@enron.com   

     Cc   Bcc                             Date                       Subject  \
0  None  None  Mon, 23 Oct 2000 06:13:00 -0700                                 
1  None  None  Thu, 31 Aug 2000 05:07:00 -0700                     Re: Hello   
2  None  None  Thu, 31 Aug 2000 04:17:00 -0700                     Re: Hello   
3  None  None  Tue, 22 Aug 2000 07:44:00 -0700                                 
4  None  None  Fri, 14 Jul 2000 06:59:00 -0700  Re: PRC review - phone calls   

                                      

In [25]:
for poi_emails in tqdm(all_pois_emails):
    poi_emails_path = 'EnronEmails/EnronEmails' + '/' + poi_emails
    
    poi_emails_df = process_directory(poi_emails_path)
    try:
        os.remove('EnronEmails'+'/'+poi_emails+'.csv')
    except :
        pass
    finally:
        poi_emails_df.to_csv('EnronEmails'+'/'+poi_emails+'.csv', index=False)

  0%|          | 0/150 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3034 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/4898 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/654 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/877 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/478 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/7823 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2760 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/11830 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/767 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3415 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1026 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2429 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/6490 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1400 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2969 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/943 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2025 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/519 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1029 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/28234 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2249 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2429 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3566 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1766 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/395 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1045 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1015 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2127 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1230 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/13032 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1498 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/729 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/4796 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/590 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1415 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1592 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/12436 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/578 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/4220 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2973 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2237 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/6054 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/5246 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3820 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/548 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2554 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1623 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/719 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3265 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1661 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/463 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2470 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1794 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3210 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/19950 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/28465 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/25351 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2177 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1113 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/462 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/5546 [00:00<?, ?it/s]


处理文件 EnronEmails/EnronEmails/kitchen-l\sent_items\20_ 时出错: 'ValueTerminal' object does not support item assignment

处理文件 EnronEmails/EnronEmails/kitchen-l\sent_items\24_ 时出错: 'ValueTerminal' object does not support item assignment

处理文件 EnronEmails/EnronEmails/kitchen-l\sent_items\29_ 时出错: 'ValueTerminal' object does not support item assignment

处理文件 EnronEmails/EnronEmails/kitchen-l\_americas\esvl\87_ 时出错: 'ValueTerminal' object does not support item assignment

处理文件 EnronEmails/EnronEmails/kitchen-l\_americas\netco_eol\1_ 时出错: 'ValueTerminal' object does not support item assignment

处理文件 EnronEmails/EnronEmails/kitchen-l\_americas\netco_eol\82_ 时出错: 'ValueTerminal' object does not support item assignment

处理文件 EnronEmails/EnronEmails/kitchen-l\_americas\netco_eol\83_ 时出错: 'ValueTerminal' object does not support item assignment

处理文件 EnronEmails/EnronEmails/kitchen-l\_americas\netco_restart\2_ 时出错: 'ValueTerminal' object does not support item assignment

处理文件 EnronEmails/EnronEmails/

处理邮件文件:   0%|          | 0/1120 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/4685 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/5937 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/5920 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2191 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2805 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/5568 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1156 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/5002 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/997 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1991 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/23381 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1112 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1600 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/691 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/4542 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/681 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/998 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3353 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1627 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1099 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2038 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/378 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3268 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/10655 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/437 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2284 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/725 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/4778 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/35 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/642 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/574 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2204 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/395 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1568 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/563 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/498 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/582 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/706 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/994 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2766 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/8009 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1643 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/5200 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1116 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1632 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/256 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/7329 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/647 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1859 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/738 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/8022 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/721 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/18687 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3856 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/6071 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1991 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/4139 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/132 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1642 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1081 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/248 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/621 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3030 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3331 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1227 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1252 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1027 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1169 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/355 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/10827 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/13875 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1885 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1293 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/646 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1219 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2611 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/2950 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1566 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1878 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3335 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3272 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/807 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1213 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/3440 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1587 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1291 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/1563 [00:00<?, ?it/s]

处理邮件文件:   0%|          | 0/557 [00:00<?, ?it/s]