In [22]:
import os, sys, email,re
import numpy as np 
import pandas as pd
# Plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set_style('whitegrid')
import wordcloud

# Network analysis
import networkx as nx

# NLP
from nltk.tokenize.regexp import RegexpTokenizer

from subprocess import check_output

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import LatentDirichletAllocation

import gensim
from gensim import corpora
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
from nltk.stem.porter import PorterStemmer

In [23]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [24]:
# Read the data into a DataFrame
emails_df = pd.read_csv('emails.csv')
print(emails_df.shape)
emails_df.head(1)

(517401, 2)


Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,"Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>\nDate: Mon, 14 May 2001 16:39:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: tim.belden@enron.com\nSubject: \nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: Tim Belden <Tim Belden/Enron@EnronXGate>\nX-cc: \nX-bcc: \nX-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\nHere is our forecast\n\n"


In [25]:
## Helper functions
def get_text_from_email(msg):
    '''To get the content from email objects'''
    parts = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            parts.append( part.get_payload() )
    return ''.join(parts)

def split_email_addresses(line):
    '''To separate multiple email addresses'''
    if line:
        addrs = line.split(',')
        addrs = set(map(lambda x: x.strip(), addrs))
    else:
        addrs = set()
    return addrs

In [26]:
# Parse the emails into a list email objects
messages = list(map(email.message_from_string, emails_df['message']))
emails_df.drop('message', axis=1, inplace=True)
# Get fields from parsed email objects
keys = messages[0].keys()
for key in keys:
    emails_df[key] = [doc[key] for doc in messages]
# Parse content from emails
emails_df['content'] = list(map(get_text_from_email, messages))
# Split multiple email addresses
emails_df['From'] = emails_df['From'].map(split_email_addresses)
emails_df['To'] = emails_df['To'].map(split_email_addresses)

# Extract the root of 'file' as 'user'
emails_df['user'] = emails_df['file'].map(lambda x:x.split('/')[0])
del messages

emails_df.head(1)

Unnamed: 0,file,Message-ID,Date,From,To,Subject,Mime-Version,Content-Type,Content-Transfer-Encoding,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user
0,allen-p/_sent_mail/1.,<18782981.1075855378110.JavaMail.evans@thyme>,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",{phillip.allen@enron.com},{tim.belden@enron.com},,1.0,text/plain; charset=us-ascii,7bit,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n,allen-p


In [27]:
# Set index and drop columns with two few values
emails_df = emails_df.set_index('Message-ID')\
    .drop(['file', 'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding'], axis=1)
# Parse datetime
emails_df['Date'] = pd.to_datetime(emails_df['Date'], infer_datetime_format=True)
emails_df.dtypes

  emails_df['Date'] = pd.to_datetime(emails_df['Date'], infer_datetime_format=True)
  emails_df['Date'] = pd.to_datetime(emails_df['Date'], infer_datetime_format=True)
  emails_df['Date'] = pd.to_datetime(emails_df['Date'], infer_datetime_format=True)


Date          object
From          object
To            object
Subject       object
X-From        object
X-To          object
X-cc          object
X-bcc         object
X-Folder      object
X-Origin      object
X-FileName    object
content       object
user          object
dtype: object

In [28]:
emails_df=emails_df.sample(150)

In [29]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from concurrent.futures import ProcessPoolExecutor
import os
from sklearn.cluster import KMeans
from email_analysis import analyze_emails_with_openai

def sentence_tokenize(text):
    return re.split(r'(?<=[.!?])\s+', text)

def normalize_text(text):
    text = re.sub(r'[^a-zA-Z\s.,!?]', '', text)
    return text.lower()

def textrank_summarize(text, num_sentences=5):
    if not text or len(text.split()) < num_sentences:
        return text

    sentences = sentence_tokenize(text)
    
    if len(sentences) < 2:
        return text

    try:
        tfidf = TfidfVectorizer().fit_transform(sentences)
        similarity_matrix = cosine_similarity(tfidf)
        graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(graph)
        ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
        summary = ' '.join(sent for _, sent in ranked_sentences[:num_sentences])
        return summary
    except ValueError:
        return text

def create_person_indexed_email_network_df(emails_df, max_features=1000, num_clusters=5, num_processes=4):
    email_data = defaultdict(lambda: {'sent': [], 'received': []})
    
    print("Processing emails...")
    for _, row in tqdm(emails_df.iterrows(), total=len(emails_df), desc="Processing emails"):
        summarized_content = textrank_summarize(row['content'])
        
        email_info = {
            'Date': row['Date'],
            'From': list(row['From'])[0] if row['From'] else '',
            'To': list(row['To'])[0] if row['To'] else '',
            'Subject': row['Subject'],
            'X-FileName': row['X-FileName'],
            'content': summarized_content,
        }
        
        for sender in row['From']:
            email_data[sender]['sent'].append(email_info)
        
        for recipient in row['To']:
            email_data[recipient]['received'].append(email_info)
    
    print("Performing OpenAI analysis with multiprocessing...")
    all_emails = [(person, [e['content'] for e in emails['sent'] + emails['received']]) 
                  for person, emails in email_data.items()]
    
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        results = list(tqdm(executor.map(analyze_emails_with_openai, all_emails), 
                            total=len(all_emails), 
                            desc="Analyzing emails"))
    
    openai_results = dict(results)
    
    print("Creating DataFrame structure...")
    data = []
    all_summaries = []
    for person, emails in tqdm(email_data.items(), desc="Processing people"):
        sent_df = pd.DataFrame(emails['sent'])
        received_df = pd.DataFrame(emails['received'])
        
        openai_analysis = openai_results.get(person)
        if openai_analysis:
            topics, sentiment, emotion, people, organizations, locations, categories = openai_analysis
        else:
            topics, sentiment, emotion, people, organizations, locations, categories = [], "Unknown", "Unknown", [], [], [], {}
        
        if not sent_df.empty:
            all_summaries.extend(sent_df['content'])
        if not received_df.empty:
            all_summaries.extend(received_df['content'])
        
        data.append({
            'person': person,
            'sent': sent_df,
            'received': received_df,
            'topics': topics,
            'sentiment': sentiment,
            'emotion': emotion,
            'mentioned_people': people,
            'mentioned_organizations': organizations,
            'mentioned_locations': locations,
            'email_categories': categories
        })
    
    print("Performing TF-IDF transformation...")
    vectorizer = TfidfVectorizer(max_features=max_features)
    with tqdm(total=1, desc="TF-IDF fit_transform") as pbar:
        tfidf_matrix = vectorizer.fit_transform(all_summaries)
        pbar.update(1)
    
    print("Adding TF-IDF vectors to DataFrame...")
    summary_to_vector = dict(zip(all_summaries, tfidf_matrix.toarray()))
    for entry in tqdm(data, desc="Adding TF-IDF vectors"):
        if not entry['sent'].empty:
            entry['sent']['tfidf_vector'] = entry['sent']['content'].map(summary_to_vector)
        if not entry['received'].empty:
            entry['received']['tfidf_vector'] = entry['received']['content'].map(summary_to_vector)
    
    print("Clustering users...")
    user_vectors = []
    for entry in data:
        user_vector = np.zeros(max_features)
        if not entry['sent'].empty:
            user_vector += entry['sent']['tfidf_vector'].mean()
        if not entry['received'].empty:
            user_vector += entry['received']['tfidf_vector'].mean()
        user_vectors.append(user_vector)
    
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    clusters = kmeans.fit_predict(user_vectors)
    
    for i, entry in enumerate(data):
        entry['cluster'] = clusters[i]
    
    print("Creating multi-index DataFrame...")
    email_network_df = pd.DataFrame(data).set_index('person')
    
    return email_network_df, vectorizer



# Create the new DataFrame
print("Starting to create the email network DataFrame...")
email_network_df, tfidf_vectorizer = create_person_indexed_email_network_df(emails_df, num_processes=os.cpu_count())

# Display basic info about the DataFrame
print("\nDataFrame Info:")
print(email_network_df.info())

# Display a sample of the DataFrame structure
print("\nSample of the DataFrame structure:")
sample_person = email_network_df.index[0]  # Get the first person in the DataFrame
print(f"\nData for {sample_person}:")
print("\nSent emails:")
print(email_network_df.loc[sample_person, 'sent'].head())
print("\nReceived emails:")
print(email_network_df.loc[sample_person, 'received'].head())
print("\nTopics:", email_network_df.loc[sample_person, 'topics'])
print("Sentiment:", email_network_df.loc[sample_person, 'sentiment'])
print("Emotion:", email_network_df.loc[sample_person, 'emotion'])
print("Mentioned People:", email_network_df.loc[sample_person, 'mentioned_people'])
print("Mentioned Organizations:", email_network_df.loc[sample_person, 'mentioned_organizations'])
print("Mentioned Locations:", email_network_df.loc[sample_person, 'mentioned_locations'])
print("Email Categories:", email_network_df.loc[sample_person, 'email_categories'])
print("Cluster:", email_network_df.loc[sample_person, 'cluster'])

Starting to create the email network DataFrame...
Processing emails...


Processing emails: 100%|██████████| 150/150 [00:00<00:00, 563.07it/s]

Performing OpenAI analysis with multiprocessing...



Analyzing emails:  14%|█▍        | 107/754 [00:43<03:21,  3.21it/s]

In [None]:
# For mentioned_organizations
org_set = set()
for orgs in email_network_df['mentioned_organizations']:
    org_set.update(orgs)

print("Unique organizations mentioned:")
for org in sorted(org_set):
    print(f"- {org}")

# For mentioned_locations
loc_set = set()
for locs in email_network_df['mentioned_locations']:
    loc_set.update(locs)

print("\nUnique locations mentioned:")
for loc in sorted(loc_set):
    print(f"- {loc}")

Unique organizations mentioned:
- (No additional organizations mentioned)
- (No organizations mentioned)
- (No other organizations mentioned)
- (No other organizations were specified in the content)
- (None other mentioned)
- (no other organizations mentioned)
- AEP
- AEP (American Electric Power)
- Anadarko
- Andrews & Kurth L.L.P.
- Assembly
- Calpine Canada Natural Gas Partnership
- Conoco
- Corp
- DandyDon
- Direct Access Coalition
- Dynegy
- EES
- EES (Energy Services)
- EIA
- EP Energy
- EPEnergy
- Eaton and Cottrell
- El Paso Merchant Energy
- Energy Commission
- Enron
- Enron (implied)
- Enron Europe Limited
- Enron Metals
- Enron North America
- EnronOnline
- Entergy
- Exxon
- FERC
- Fox News
- Fulbright
- Global Government Affairs
- Global Products
- HPL
- IC
- ISO
- Independent Energy Producers
- Independent Energy Producers (as an organization)
- Jones Day
- Kaye Scholer LLP
- LGEN (LAGN)
- LGEN (LGEN unit presumably related to AEP)
- LPSC
- LS
- LSU (Louisiana State Univer

In [None]:
import re
from collections import defaultdict

def clean_subject(subject):
    return re.sub(r'^(Re:|Fwd:)\s*', '', subject, flags=re.IGNORECASE).strip()

def summarize_threads(emails_df):
    threads = defaultdict(list)
    
    for _, email in emails_df.iterrows():
        clean_subj = clean_subject(email['Subject'])
        threads[clean_subj].append(email['content'])
    
    summaries = {}
    for subject, contents in threads.items():
        full_thread = " ".join(contents)
        summaries[subject] = textrank_summarize(full_thread)
    
    return summaries

def add_thread_summaries(email_network_df):
    email_network_df['thread_summaries'] = email_network_df.apply(
        lambda row: summarize_threads(pd.concat([row['sent'], row['received']])),
        axis=1
    )
    return email_network_df

# Apply the function to your DataFrame
email_network_df = add_thread_summaries(email_network_df)

In [None]:
email_network_df.thread_summaries.iloc[0]

{'Credit Watch List--Week of 11/5/01': "If there are any personnel in your group that were not included in this distribution, please insure that they receive a copy of this report. To add additional people to this distribution, or if this report has been sent to you in error, please contact Veronica Espinoza at x6-6002. Attached is a revised Credit Watch listing for the week of 11/05/01. For other questions, please contact Jason R. There are no updates from last week's list."}

In [None]:
from IPython.display import display

display(email_network_df.iloc[[100]])

Unnamed: 0_level_0,sent,received,topics,sentiment,emotion,mentioned_people,mentioned_organizations,mentioned_locations,email_categories,cluster,thread_summaries
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
michael.etringer@enron.com,Empty DataFrame Columns: [] Index: [],"Date From \ 0 2001-11-05 08:48:47-08:00 veronica.espinoza@enron.com 1 2001-02-08 03:32:00-08:00 susan.mara@enron.com 2 2001-06-14 04:54:00-07:00 susan.mara@enron.com To \ 0 k..ratnala@enron.com 1 brenda.barreda@enron.com 2 brenda.barreda@enron.com Subject \ 0 Credit Watch List--Week of 11/5/01 1 AReM: Hertzberg Update 2 FYI: Sacramento Bee--Dan Walters: Repaying huge power debts still\n\t looms as a high political hurdle X-FileName \ 0 GNEMEC (Non-Privileged).pst 1 skean.nsf 2 jdasovic.nsf content \ 0 To add additional people to this distribution, or if this report has been sent to you in error, please contact Veronica Espinoza at x6-6002. If there are any personnel in your group that were not included in this distribution, please insure that they receive a copy of this report. Attached is a revised Credit Watch listing for the week of 11/05/01. For other questions, please contact Jason R. There are no updates from last week's list. 1 Provide $150 million to develop clean and efficient renewable energy supply \n\nAllow plants up to 100 megawatts to be cityby local government andnot\nthrough Energy Commission review \n\nEnsure power by requiring new power plants to enter into purchasing\nagreements with the state. They also introduced five bills to provide funding to"" \n\nReplace energy inefficient appliances like refrigerators and air\nconditioners; \nGive schools grants and loans to decrease consumption \nProvide 50 percent grants and 50 percent loans to local governments to\nretrofit buildings; \nEstablish a Mobile Efficiency Brigade with $100 million in grants to\nnonprofits to hire a team of people to distribute low-energy lightbulbs to\nresidences, saving 125 megawatts, but help other conservation programs get\nstarted twell before the summer by providig people to do the work while\nlearning new skills. 9:30 a.m., SACRAMENTO _ Assembly Speaker Robert Hertzberg and others\ndiscuss legislation to promote energy conservation and increase power\nsupplies, Capitol, room 317. Said the energy efficiency package may save as many as 500 megawatts,\nequivalent to a new power plant. Contact: 916-445-4571. 2 A dizzying array of MOU alternatives\nis being floated, including an effort by Burton and Assembly Speaker Bob\nHertzberg to persuade Edison creditors to write off part of the debt, and\nfor big industrial and power consumers to shoulder the rest in return for\nrecapturing the authority to make power supply deals outside the utility\ngrid. PG&E already\nhas declared bankruptcy, and Edison was on the verge when Davis hurriedly\nsigned a ""memorandum of understanding"" (MOU) on a rescue scheme, the\ncenterpieces being state purchase of Edison's share of the intercity power\ngrid, plus a plan for ratepayers to pay off the utility's debts. And then there are the $14 billion or so in debts that the state's two big\nutilities, Pacific Gas and Electric and Southern California Edison, incurred\nfor power purchases before their credit was cut off in January. ""On an issue like this, they (legislators) ought to be able to vote their\nconsciences,"" Burton told reporters, denouncing the Edison deal as a\n""flat-ass bailout."" \nDavis spokesman Steve Maviglio rejected Burton's account: ""The governor's\ntoo smart to do any of that."" \nAs the public squabbling heats up, so is the private search for a compromise\nthat Edison, consumerists and other principal players can accept -- without\nmuch confidence that it can be found. Despite the\nrecent drop in spot power prices, however, many aspects of the energy crisis\nremain unresolved, and chief among them is liquidating the $20 billion-plus\nin debts that utilities and the state have accumulated for power purchases. tfidf_vector 0 [0.0, 0.0, 0.11076183030737578, 0.0, 0.0, 0.0, 0.11582547486682462, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.10546876001425726, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...] 1 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.13110841261755446, 0.0, 0.0, 0.0, 0.0, 0.06638950994636103, 0.0, 0.0, 0.066050667339742, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.048147376595979265, 0.0, 0.06638950994636103, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06011327671628271, 0.0, 0.06638950994636103, 0.0, 0.0, 0.0, 0.0, 0.11635810909787088, ...] 2 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.051623377365471214, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04131318835149119, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]","[- Distribution of reports, - Renewable energy funding, - Power plant regulations, - Energy conservation initiatives, - Legislation discussions, - Grant and loan programs for energy efficiency, - Role of local governments in energy projects, - Updates on energy efficiency bills, - Collaboration with nonprofits for energy conservation, - Bankruptcy of PG&E and Edison]",Neutral,Informative,"[Veronica Espinoza, Jason R., Robert Hertzberg, Burton, Bob Hertzberg]",[],[],"{'Work-related': '80%', 'Personal': '0%', 'Urgent': '0%', 'Informational': '20%', 'Action Required': '0%', 'Follow-up': '0%', 'Other': '0%'}",3,"{'Credit Watch List--Week of 11/5/01': 'If there are any personnel in your group that were not included in this distribution, please insure that they receive a copy of this report. To add additional people to this distribution, or if this report has been sent to you in error, please contact Veronica Espinoza at x6-6002. Attached is a revised Credit Watch listing for the week of 11/05/01. For other questions, please contact Jason R. There are no updates from last week's list.', 'AReM: Hertzberg Update': 'Provide $150 million to develop clean and efficient renewable energy supply Allow plants up to 100 megawatts to be cityby local government andnot through Energy Commission review Ensure power by requiring new power plants to enter into purchasing agreements with the state. They also introduced five bills to provide funding to"" Replace energy inefficient appliances like refrigerators and air conditioners; Give schools grants and loans to decrease consumption Provide 50 percent grants and 50 percent loans to local governments to retrofit buildings; Establish a Mobile Efficiency Brigade with $100 million in grants to nonprofits to hire a team of people to distribute low-energy lightbulbs to residences, saving 125 megawatts, but help other conservation programs get started twell before the summer by providig people to do the work while learning new skills. Contact: 916-445-4571. Said the energy efficiency package may save as many as 500 megawatts, equivalent to a new power plant. 9:30 a.m., SACRAMENTO _ Assembly Speaker Robert Hertzberg and others discuss legislation to promote energy conservation and increase power supplies, Capitol, room 317.', 'FYI: Sacramento Bee--Dan Walters: Repaying huge power debts still  looms as a high political hurdle': 'A dizzying array of MOU alternatives is being floated, including an effort by Burton and Assembly Speaker Bob Hertzberg to persuade Edison creditors to write off part of the debt, and for big industrial and power consumers to shoulder the rest in return for recapturing the authority to make power supply deals outside the utility grid. Despite the recent drop in spot power prices, however, many aspects of the energy crisis remain unresolved, and chief among them is liquidating the $20 billion-plus in debts that utilities and the state have accumulated for power purchases. PG&E already has declared bankruptcy, and Edison was on the verge when Davis hurriedly signed a ""memorandum of understanding"" (MOU) on a rescue scheme, the centerpieces being state purchase of Edison's share of the intercity power grid, plus a plan for ratepayers to pay off the utility's debts. And then there are the $14 billion or so in debts that the state's two big utilities, Pacific Gas and Electric and Southern California Edison, incurred for power purchases before their credit was cut off in January. ""On an issue like this, they (legislators) ought to be able to vote their consciences,"" Burton told reporters, denouncing the Edison deal as a ""flat-ass bailout."" Davis spokesman Steve Maviglio rejected Burton's account: ""The governor's too smart to do any of that."" As the public squabbling heats up, so is the private search for a compromise that Edison, consumerists and other principal players can accept -- without much confidence that it can be found.'}"


In [None]:
# Email volume
email_network_df['sent_count'] = email_network_df['sent'].apply(lambda x: len(x) if isinstance(x, pd.DataFrame) else 0)
email_network_df['received_count'] = email_network_df['received'].apply(lambda x: len(x) if isinstance(x, pd.DataFrame) else 0)
email_network_df['total_volume'] = email_network_df['sent_count'] + email_network_df['received_count']

# Response rate
#email_network_df['response_rate'] = email_network_df.apply(lambda row: calculate_response_rate(row['sent'], row['received']), axis=1)

# Thread depth (approximation based on 'Re:' count in subject)
def calculate_thread_depth(sent, received):
    all_emails = pd.concat([sent, received]) if isinstance(sent, pd.DataFrame) and isinstance(received, pd.DataFrame) else pd.DataFrame()
    if len(all_emails) > 0:
        return all_emails['Subject'].str.count('Re:').mean()
    return 0

email_network_df['avg_thread_depth'] = email_network_df.apply(lambda row: calculate_thread_depth(row['sent'], row['received']), axis=1)

def count_unique_contacts(sent, received):
    contacts = set()
    if isinstance(sent, pd.DataFrame):
        # Check for 'To' or alternative columns that might contain recipient information
        recipient_columns = ['To', 'X-To', 'Recipients', 'Recipient']
        for col in recipient_columns:
            if col in sent.columns:
                contacts.update(sent[col].dropna())
                break
    if isinstance(received, pd.DataFrame):
        # Check for 'From' or alternative columns that might contain sender information
        sender_columns = ['From', 'X-From', 'Sender']
        for col in sender_columns:
            if col in received.columns:
                contacts.update(received[col].dropna())
                break
    return len(contacts)

email_network_df['unique_contacts'] = email_network_df.apply(lambda row: count_unique_contacts(row['sent'], row['received']), axis=1)

def calculate_response_rate(sent, received):
    if isinstance(received, pd.DataFrame) and len(received) > 0:
        if isinstance(sent, pd.DataFrame) and len(sent) > 0:
            replies = sent[sent['Subject'].str.startswith('Re:', na=False)]
            return len(replies) / len(received)
    return 0

email_network_df['response_rate'] = email_network_df.apply(lambda row: calculate_response_rate(row['sent'], row['received']), axis=1)

In [None]:
import networkx as nx

def create_email_network(email_network_df):
    G = nx.DiGraph()
    for person, row in email_network_df.iterrows():
        if isinstance(row['sent'], pd.DataFrame):
            for _, email in row['sent'].iterrows():
                recipients = email['To'].split(';') if isinstance(email['To'], str) else [email['To']]
                for recipient in recipients:
                    G.add_edge(person, recipient.strip())
    return G

def calculate_network_features(email_network_df):
    G = create_email_network(email_network_df)
    
    # 1. Centrality measures
    degree_centrality = nx.degree_centrality(G)
    betweenness_centrality = nx.betweenness_centrality(G)
    eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)
    
    # 2. Community detection using Girvan-Newman algorithm
    communities_generator = nx.community.girvan_newman(G.to_undirected())
    top_level_communities = next(communities_generator)
    community_dict = {node: i for i, community in enumerate(top_level_communities) for node in community}
    
    # 3. Influence score
    pagerank = nx.pagerank(G)
    
    # Add these features to the DataFrame
    email_network_df['degree_centrality'] = email_network_df.index.map(degree_centrality)
    email_network_df['betweenness_centrality'] = email_network_df.index.map(betweenness_centrality)
    email_network_df['eigenvector_centrality'] = email_network_df.index.map(eigenvector_centrality)
    email_network_df['community'] = email_network_df.index.map(community_dict)
    email_network_df['influence_score'] = email_network_df.index.map(pagerank)
    
    return email_network_df

# Calculate forwarded and replied email counts
def calculate_email_interaction_counts(email_network_df):
    def count_interactions(sent, received):
        forwarded_count = 0
        replied_count = 0
        if isinstance(sent, pd.DataFrame) and 'Subject' in sent.columns:
            forwarded_count = sent['Subject'].str.contains('Fwd:', case=False, na=False).sum()
        if isinstance(received, pd.DataFrame) and 'Subject' in received.columns:
            replied_count = received['Subject'].str.contains('Re:', case=False, na=False).sum()
        return pd.Series({'forwarded_count': forwarded_count, 'replied_count': replied_count})

    interaction_counts = email_network_df.apply(lambda row: count_interactions(row['sent'], row['received']), axis=1)
    return pd.concat([email_network_df, interaction_counts], axis=1)


# Apply the functions
email_network_df = calculate_email_interaction_counts(email_network_df)
email_network_df = calculate_network_features(email_network_df)

# Adjust influence score based on forwarded and replied counts
email_network_df['adjusted_influence_score'] = email_network_df['influence_score'] * (1 + email_network_df['forwarded_count'] + email_network_df['replied_count'])

In [None]:
# Fill missing values with appropriate defaults
email_network_df = email_network_df.fillna({
    'degree_centrality': 0,
    'betweenness_centrality': 0,
    'eigenvector_centrality': 0,
    'community': -1,
    'influence_score': 0,
    'adjusted_influence_score': 0
})

##TODO: FIX THIS IT DOES NOT WORK
import pandas as pd
import networkx as nx
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def create_email_network(email_network_df):
    G = nx.DiGraph()
    for person, row in email_network_df.iterrows():
        if isinstance(row['sent'], pd.DataFrame):
            for _, email in row['sent'].iterrows():
                if 'To' in email:
                    recipients = email['To'].split(';') if isinstance(email['To'], str) else [email['To']]
                    for recipient in recipients:
                        if isinstance(recipient, str):
                            G.add_edge(person, recipient.strip())
    return G

def calculate_network_features(email_network_df):
    G = create_email_network(email_network_df)
    
    # 1. Centrality measures
    degree_centrality = nx.degree_centrality(G)
    betweenness_centrality = nx.betweenness_centrality(G)
    eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)
    
    # 2. Community detection using Girvan-Newman algorithm
    communities_generator = nx.community.girvan_newman(G.to_undirected())
    top_level_communities = next(communities_generator)
    community_dict = {node: i for i, community in enumerate(top_level_communities) for node in community}
    
    # 3. Influence score
    pagerank = nx.pagerank(G)
    
    # Add these features to the DataFrame
    email_network_df['degree_centrality'] = email_network_df.index.map(lambda x: degree_centrality.get(x, 0))
    email_network_df['betweenness_centrality'] = email_network_df.index.map(lambda x: betweenness_centrality.get(x, 0))
    email_network_df['eigenvector_centrality'] = email_network_df.index.map(lambda x: eigenvector_centrality.get(x, 0))
    email_network_df['community'] = email_network_df.index.map(lambda x: community_dict.get(x, -1))
    email_network_df['influence_score'] = email_network_df.index.map(lambda x: pagerank.get(x, 0))
    
    return email_network_df

def calculate_email_interaction_counts(email_network_df):
    def count_interactions(sent, received):
        forwarded_count = 0
        replied_count = 0
        if isinstance(sent, pd.DataFrame) and 'Subject' in sent.columns:
            forwarded_count = sent['Subject'].str.contains('Fwd:', case=False, na=False).sum()
        if isinstance(received, pd.DataFrame) and 'Subject' in received.columns:
            replied_count = received['Subject'].str.contains('Re:', case=False, na=False).sum()
        return pd.Series({'forwarded_count': forwarded_count, 'replied_count': replied_count})

    interaction_counts = email_network_df.apply(lambda row: count_interactions(row['sent'], row['received']), axis=1)
    return pd.concat([email_network_df, interaction_counts], axis=1)

import pandas as pd
import numpy as np
import re

def calculate_priority_urgency_features(email_network_df):
    urgency_keywords = ['urgent', 'asap', 'immediately', 'critical', 'important']
    importance_keywords = ['priority', 'crucial', 'vital', 'essential', 'key']
    high_position_keywords = ['ceo', 'cfo', 'cto', 'president', 'director', 'manager', 'head']

    def calculate_scores(sent, received):
        if not isinstance(sent, pd.DataFrame) or not isinstance(received, pd.DataFrame):
            return pd.Series({'urgency_score': 0, 'importance_score': 0})

        all_emails = pd.concat([sent, received])
        
        # Urgency score
        urgency_pattern = re.compile('|'.join(urgency_keywords), re.IGNORECASE)
        urgency_count = all_emails['content'].apply(lambda x: len(urgency_pattern.findall(str(x)))).sum()
        
        avg_response_time = 24  # default to 24 hours if Date columns are not available
        if 'Date' in sent.columns and 'Date' in received.columns:
            # Convert to datetime and ensure UTC
            sent_dates = pd.to_datetime(sent['Date'], utc=True)
            received_dates = pd.to_datetime(received['Date'], utc=True)
            
            # Calculate time differences
            time_diffs = []
            for rec_date in received_dates:
                responses = sent_dates[sent_dates > rec_date]
                if not responses.empty:
                    time_diffs.append((responses.iloc[0] - rec_date).total_seconds() / 3600)
            
            avg_response_time = np.mean(time_diffs) if time_diffs else 24
        
        urgency_score = urgency_count / (1 + avg_response_time)  # Normalize by response time

        # Importance score
        importance_pattern = re.compile('|'.join(importance_keywords), re.IGNORECASE)
        importance_count = all_emails['content'].apply(lambda x: len(importance_pattern.findall(str(x)))).sum()
        
        position_pattern = re.compile('|'.join(high_position_keywords), re.IGNORECASE)
        sender_position_score = all_emails['From'].apply(lambda x: len(position_pattern.findall(str(x)))).sum()
        
        importance_score = importance_count + sender_position_score

        return pd.Series({'urgency_score': urgency_score, 'importance_score': importance_score})

    scores = email_network_df.apply(lambda row: calculate_scores(row['sent'], row['received']), axis=1)
    email_network_df = pd.concat([email_network_df, scores], axis=1)
    return email_network_df

import pandas as pd
import numpy as np
import re
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_collaboration_metrics(email_network_df):
    def extract_department(email):
        return email.split('@')[0].split('.')[-1] if isinstance(email, str) else ''

    def calculate_metrics(sent, received):
        if not isinstance(sent, pd.DataFrame) or not isinstance(received, pd.DataFrame):
            return pd.Series({'cross_dept_comm_freq': 0, 'project_cluster': -1})

        all_emails = pd.concat([sent, received])
        
        # Cross-department communication frequency
        all_emails['sender_dept'] = all_emails['From'].apply(extract_department)
        all_emails['receiver_dept'] = all_emails['To'].apply(extract_department)
        cross_dept_comm = (all_emails['sender_dept'] != all_emails['receiver_dept']).sum()
        cross_dept_comm_freq = cross_dept_comm / len(all_emails) if len(all_emails) > 0 else 0

        # Project-based communication clusters
        vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(all_emails['content'].astype(str))
        
        # Adjust number of clusters based on data size
        n_clusters = min(max(1, tfidf_matrix.shape[0] // 10), 5)  # At least 1, at most 5 clusters
        
        if n_clusters > 1:
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            cluster = kmeans.fit_predict(tfidf_matrix)[0]
        else:
            cluster = 0

        return pd.Series({'cross_dept_comm_freq': cross_dept_comm_freq, 'project_cluster': cluster})

    metrics = email_network_df.apply(lambda row: calculate_metrics(row['sent'], row['received']), axis=1)
    return pd.concat([email_network_df, metrics], axis=1)

def process_email_network(email_network_df):
    # Ensure index is unique
    email_network_df = email_network_df.reset_index(drop=True)
    
    # Calculate email interaction counts
    email_network_df = calculate_email_interaction_counts(email_network_df)

    # Calculate network features
    email_network_df = calculate_network_features(email_network_df)

    # Calculate priority and urgency features
    email_network_df = calculate_priority_urgency_features(email_network_df)

    # Calculate collaboration metrics
    email_network_df = calculate_collaboration_metrics(email_network_df)

    # Ensure all relevant columns are numeric
    numeric_columns = ['influence_score', 'forwarded_count', 'replied_count']
    for col in numeric_columns:
        if col in email_network_df.columns:
            email_network_df[col] = pd.to_numeric(email_network_df[col], errors='coerce').fillna(0)

    # Adjust influence score based on forwarded and replied counts
    if all(col in email_network_df.columns for col in numeric_columns):
        email_network_df['adjusted_influence_score'] = email_network_df['influence_score'] * (1 + email_network_df['forwarded_count'] + email_network_df['replied_count'])
    else:
        print("Warning: Not all required columns present for adjusted_influence_score calculation")
        email_network_df['adjusted_influence_score'] = email_network_df['influence_score']

    return email_network_df

# Assuming email_network_df is your input DataFrame
try:
    email_network_df = process_email_network(email_network_df)
    print("Processing completed successfully.")
except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("DataFrame info:")
    print(email_network_df.info())
    print("\nDataFrame head:")
    print(email_network_df.head())

In [None]:
email_network_df.to_csv('email_network_df_original.csv')

In [20]:
from IPython.display import display

display(email_network_df.received.iloc[[49]])

person
dutch.quigley@enron.com                           Date                         From  \
0 2001-11-05 08:48:47-08:00  veronica.espinoza@enron.com   

                     To                             Subject  \
0  k..ratnala@enron.com  Credit Watch List--Week of 11/5/01   

                    X-FileName  \
0  GNEMEC (Non-Privileged).pst   

                                                                                                                                                                                                                                                                                                                                                                                                                                                   content  \
0  To add additional people to this distribution, or if this report has been sent to you in error, please contact Veronica Espinoza at x6-6002. If there are any personnel in your group that were not incl

In [21]:
import os
from email_analysis_utils import parallel_generate_llm_query_prompts


# Generate prompt recommendations for each user
email_network_df_prompts = parallel_generate_llm_query_prompts(email_network_df, num_processes=os.cpu_count())

# Display a sample of the recommendations
print("\nSample of personalized LLM query prompts:")
sample_users = email_network_df.sample(5)
for _, user in sample_users.iterrows():
    print(f"\nUser: {user.name}")
    print("Recommended LLM Query Prompts:")
    for i, prompt in enumerate(user['recommended_llm_queries'], 1):
        print(f"{i}. {prompt}")


Generating LLM query prompts using 10 processes...


Generating prompts: 100%|██████████| 1285/1285 [09:22<00:00,  2.29it/s]


Sample of personalized LLM query prompts:

User: harry's.group@enron.com
Recommended LLM Query Prompts:
1. 1. "What are the critical details regarding Duke Energy's $7 million contribution to marine life mitigation that I should include in my upcoming report on the Moss Landing project?"
2. 2. "Extract and summarize the recommendations made by the Moss Landing Siting Committee regarding the proposed construction of the Moss Landing plant."
3. 3. "Identify any follow-up actions or decisions that need to be made concerning the increase in generating capacity for the Moss Landing plant as discussed in recent emails."
4. 4. "Can you help me track the timeline for the approval process of the Moss Landing Power Project based on the latest communications about it?"
5. 5. "What were the key points discussed in my emails concerning the environmental impacts of the Moss Landing plant operations on marine biology?"
6. 6. "Summarize my interactions related to the California Energy Commission's li


