# This notebook stores each step of refactoring the graph data into PSL data

In [1]:
import pandas as pd
import re

## These functions help parse the .tab files.

In [2]:
# used for extracting feature name
# returns: tuple (feature_name, feature_value, optional_value)
def get_feature_tuple(feature):
    feature_data = re.split(r"[:=]", feature)
    return feature_data
    

# loads the *.tab files into a Pandas Dataframe.
# returns: pd.DataFrame(columns=features)
def load_table(filename):

    # initialize the pandas dataframe
    node_data = pd.DataFrame()


    with open(filename) as infile:
        i = 0
        for row in infile:
    
            # print('i is: ', i)

            if i == 0:
                # Skip non-useful first line
                print("Header: ", row)
            elif i == 1:
                # Prepare dataframe column labels
                tokens = row.split()
                if len(tokens) == 1:
                    print("This is not a NODE file, so don't load this row")
                else:  
                    features = ["id"] + [get_feature_tuple(feature)[1] for feature in tokens]
                    node_data = pd.DataFrame(columns=features)
            else:
          
                # this is to help the function generalize among the NODE and EDGE files.
                # EDGE files have a "|" character, which needs to be removed for proper feature decoupling
                row = re.sub(r'\|','', row)
            
                tokens = row.split()

                # the first token doesn't need splitting
                row_dict = {'id':tokens[0]}
                row_dict.update({get_feature_tuple(token)[0]:get_feature_tuple(token)[1] for token in tokens[1:]})
        
                # Fill in rows
                node_data = pd.concat([node_data, pd.DataFrame([row_dict])], ignore_index=True)
        
            i += 1
    
    return node_data

# Process the email nodes

In [3]:
email_nodes = load_table('../c3/namata-kdd11-data/enron/enron-samples-lowunk/enron-sample-lowunk-1of6/sample-enron.NODE.email.tab')
email_nodes

Header:  NODE	email



Unnamed: 0,id,emailaddress,numsent,numreceived,numexchanged,w-gerald,w-know,w-busi,w-mexicana,w-transact,...,w-columbiagassubject,w-perron,w-coh,w-agl,w-kinney,w-veselack,w-mwhitt,w-jarnold,"other,manager,specialist,director,executive",title
0,98,scott.goodell@enron.com,98.0,607.0,705.0,1.0,1.0,1.0,,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,,,,specialist
1,283,c..koehler@enron.com,128.0,606.0,734.0,1.0,1.0,1.0,,1.0,...,,,,,,,,,,director
2,183,p..south@enron.com,8.0,351.0,359.0,1.0,1.0,1.0,,,...,,,,,,,,,,director
3,204,lavorato@enron.com,388.0,3.0,391.0,,1.0,1.0,,1.0,...,,,,,,,,,,executive
4,303,t..hodge@enron.com,95.0,570.0,665.0,1.0,1.0,1.0,,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,114,vkamins@enron.com,0.0,12.0,12.0,,1.0,1.0,,1.0,...,,,,,,,,,,director
207,282,sean.crandall@enron.com,94.0,138.0,232.0,,1.0,1.0,,1.0,...,,,,,,,,,,director
208,270,david.duran@enron.com,7.0,145.0,152.0,,1.0,1.0,,1.0,...,,,,,,,,,,director
209,243,kevin.presto@enron.com,248.0,198.0,446.0,1.0,1.0,1.0,,1.0,...,,,,,,,,,,executive


In [4]:
# Sanity check: how many unknowns are there
len(email_nodes[email_nodes['title'].isna()])

40

In [5]:
# convert email ids to ints
email_nodes['id'] = email_nodes['id'].astype(str).astype(int)

# convert bag-of-words columns to floats (since ints won't take NaNs)
for column in email_nodes.columns[2:-2]:
    email_nodes[column] = email_nodes[column].astype(str).astype(float)

In [6]:
email_nodes.dtypes

id                                               int64
emailaddress                                    object
numsent                                        float64
numreceived                                    float64
numexchanged                                   float64
                                                ...   
w-veselack                                     float64
w-mwhitt                                       float64
w-jarnold                                      float64
other,manager,specialist,director,executive     object
title                                           object
Length: 5120, dtype: object

In [7]:
# remove the (unnecessary) second to last column (it came from an ambiguous parse splits)
email_nodes.drop('other,manager,specialist,director,executive', axis=1, inplace=True)

# use node ids as index
email_nodes = email_nodes.set_index('id')
email_nodes

Unnamed: 0_level_0,emailaddress,numsent,numreceived,numexchanged,w-gerald,w-know,w-busi,w-mexicana,w-transact,w-want,...,w-bartlo,w-columbiagassubject,w-perron,w-coh,w-agl,w-kinney,w-veselack,w-mwhitt,w-jarnold,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
98,scott.goodell@enron.com,98.0,607.0,705.0,1.0,1.0,1.0,,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,specialist
283,c..koehler@enron.com,128.0,606.0,734.0,1.0,1.0,1.0,,1.0,1.0,...,,,,,,,,,,director
183,p..south@enron.com,8.0,351.0,359.0,1.0,1.0,1.0,,,1.0,...,,,,,,,,,,director
204,lavorato@enron.com,388.0,3.0,391.0,,1.0,1.0,,1.0,1.0,...,,,,,,,,,,executive
303,t..hodge@enron.com,95.0,570.0,665.0,1.0,1.0,1.0,,1.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,vkamins@enron.com,0.0,12.0,12.0,,1.0,1.0,,1.0,1.0,...,,,,,,,,,,director
282,sean.crandall@enron.com,94.0,138.0,232.0,,1.0,1.0,,1.0,1.0,...,,,,,,,,,,director
270,david.duran@enron.com,7.0,145.0,152.0,,1.0,1.0,,1.0,1.0,...,,,,,,,,,,director
243,kevin.presto@enron.com,248.0,198.0,446.0,1.0,1.0,1.0,,1.0,1.0,...,,,,,,,,,,executive


# Process the CoRef edges

In [8]:
# need to rename one of the columns due to key collision
# use copy for safety
!cp ../c3/namata-kdd11-data/enron/enron-samples-lowunk/enron-sample-lowunk-1of6/sample-enron.UNDIRECTED.coref.tab .
!sed -i 's/email/alt_email/2g' sample-enron.UNDIRECTED.coref.tab

coref_edges = load_table('sample-enron.UNDIRECTED.coref.tab')

# Convert all columns except last one to ints 
# FIXME: need to be consistent with the other datasets.
coref_edges['id'] = coref_edges['id'].astype(str).astype(int)
coref_edges['email'] = coref_edges['email'].astype(str).astype(int)
coref_edges['alt_email'] = coref_edges['alt_email'].astype(str).astype(int)

coref_edges = coref_edges.set_index('id')

coref_edges

Header:  UNDIRECTED	coref

This is not a NODE file, so don't load this row


Unnamed: 0_level_0,email,alt_email,exists
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2856,265,141,NOTEXIST
18491,310,295,NOTEXIST
516,272,183,NOTEXIST
5131,201,19,NOTEXIST
12417,138,78,NOTEXIST
...,...,...,...
15003,208,135,NOTEXIST
4450,197,47,NOTEXIST
20302,25,248,NOTEXIST
12985,222,118,


In [9]:
# Sanity Check: These should print pairs of the same people
for index in coref_edges[coref_edges['exists'] == 'EXIST'][['email', 'alt_email']].index:
    pair = (coref_edges.loc[index]['email'], coref_edges.loc[index]['email'])
    print(email_nodes.loc[pair[0]])
    print(email_nodes.loc[pair[1]])
    print("------------------------------------------------")

emailaddress    vkamins@enron.com
numsent                       0.0
numreceived                  12.0
numexchanged                 12.0
w-gerald                      NaN
                      ...        
w-kinney                      NaN
w-veselack                    NaN
w-mwhitt                      NaN
w-jarnold                     NaN
title                    director
Name: 114, Length: 5118, dtype: object
emailaddress    vkamins@enron.com
numsent                       0.0
numreceived                  12.0
numexchanged                 12.0
w-gerald                      NaN
                      ...        
w-kinney                      NaN
w-veselack                    NaN
w-mwhitt                      NaN
w-jarnold                     NaN
title                    director
Name: 114, Length: 5118, dtype: object
------------------------------------------------
emailaddress    d..baughman@enron.com
numsent                           6.0
numreceived                      96.0
numexchange

emailaddress    larry.campbell@enron.com
numsent                             35.0
numreceived                         28.0
numexchanged                        63.0
w-gerald                             NaN
                          ...           
w-kinney                             NaN
w-veselack                           NaN
w-mwhitt                             NaN
w-jarnold                            NaN
title                         specialist
Name: 83, Length: 5118, dtype: object
emailaddress    larry.campbell@enron.com
numsent                             35.0
numreceived                         28.0
numexchanged                        63.0
w-gerald                             NaN
                          ...           
w-kinney                             NaN
w-veselack                           NaN
w-mwhitt                             NaN
w-jarnold                            NaN
title                         specialist
Name: 83, Length: 5118, dtype: object
----------------------

# Process the Manager edges

In [10]:
# Todo:
# Load in the email-submgr and sanity check the edges to see who is the manager of whom.

# Split data into observed and targets (AKA train and test)

In [11]:
email_node_observed = email_nodes[email_nodes['title'].notna()]
email_node_targets = email_nodes[email_nodes['title'].isna()]

coref_edges_observed = coref_edges[coref_edges['exists'].notna()]
coref_edges_targets = coref_edges[coref_edges['exists'].isna()]

In [12]:
print("email_node_observed: ", len(email_node_observed))
print("email_node_target: ", len(email_node_targets))

print("coref_edges_observed: ", len(coref_edges_observed))
print("coref_edges_target: ", len(coref_edges_targets))

email_node_observed:  171
email_node_target:  40
coref_edges_observed:  16625
coref_edges_target:  4156


# Prepare data for PSL predicates

## Predicate: EmailHasLabel(E, L)

In [13]:
# convert titles to integers, so PSL can ground faster
# title_map = {"other": 0, "manager": 1, "specialist": 2, "director": 3, "executive": 4}

# The copy is to suppress an in-place warning
# email_has_label = email_nodes[['title']].copy()
# email_has_label['title'] = email_has_label['title'].map(title_map)
# email_has_label = email_nodes['title'].map(title_map)

In [14]:
# email_has_label.hist()

In [15]:
# Outputs to file
# email_has_label.to_csv('EmailHasLabel.csv', sep ='\t')

In [16]:
# Outputs to file
# coref_edges.to_csv('CoRef_obs.csv', sep ='\t')