# This notebook stores each step of refactoring the graph data into PSL data

In [1]:
# Input file locations. The assignments are written without spaces around '='
# so the same lines are also valid Bash variable assignments.

# Full ground-truth graph.
FILE_GROUND_TRUTH_EMAIL_NODES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.NODE.email.tab'
FILE_GROUND_TRUTH_COREF_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.UNDIRECTED.coref.tab'
FILE_GROUND_TRUTH_MANAGES_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.UNDIRECTED.email-submgr.tab'

# Sample split (1 of 6) of the same graph.
FILE_SAMPLE_EMAIL_NODES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/enron-sample-lowunk-1of6/sample-enron.NODE.email.tab'
FILE_SAMPLE_COREF_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/enron-sample-lowunk-1of6/sample-enron.UNDIRECTED.coref.tab'
# This path used to be assigned to FILE_GROUND_TRUTH_MANAGES_EDGES, which
# silently overwrote the ground-truth path defined above.
FILE_SAMPLE_MANAGES_EDGES='../c3/namata-kdd11-data/enron/enron-samples-lowunk/enron-sample-lowunk-1of6/sample-enron.UNDIRECTED.email-submgr.tab'



## These functions help parse the .tab files.

In [2]:
import pandas as pd
import re
import itertools # for cross products when filling in a full PSL dataset

In [3]:
# casts the known numeric columns of a table to numeric dtypes (in place)
def resolve_column_type(table):
    """Convert recognised columns of ``table`` to numeric dtypes, in place.

    Identifier and count columns are cast str -> float -> int. Columns whose
    name starts with ``w-`` (bag-of-words features) are cast to float, because
    an int column cannot hold NaN. Every other column is left untouched.

    Returns None; the frame passed in is modified.
    """
    integer_columns = {'id', 'email', 'alt_email', 'subordinate_email' , 'numsent', 'numreceived', 'numexchanged'}

    for name in table.columns:
        if name in integer_columns:
            as_float = table[name].astype(str).astype(float)
            table[name] = as_float.astype(int)
            continue

        if re.match("w-", name) is not None:
            table[name] = table[name].astype(str).astype(float)

# breaks one raw tab-file token apart at every ':' and '='
# returns: list of the pieces, in order (e.g. [feature_name, feature_value, optional_value])
def get_feature_tuple(feature):
    return re.split(r"[:=]", feature)
    

# loads the *.tab files into a Pandas Dataframe.
# returns: pd.DataFrame(columns=features)
def load_table(filename):
    """Parse a NODE/EDGE ``.tab`` file into a DataFrame of raw string values.

    File layout as consumed here:
      * line 0 is echoed with a "Header: " prefix and otherwise ignored;
      * line 1 declares the features. If it has exactly one token no column
        labels are prepared; otherwise the columns are ``"id"`` followed by the
        second ':'/'='-separated piece of every token;
      * every later line is one record: the first token is the id and each
        remaining token contributes ``piece[0] -> piece[1]``.

    '|' characters (present in EDGE files) are stripped from record lines so
    NODE and EDGE files share one code path. Blank or whitespace-only record
    lines are skipped instead of raising IndexError.

    Args:
        filename: path of the ``.tab`` file to read.

    Returns:
        A DataFrame with a fresh 0..n-1 index. Columns declared on line 1 come
        first, followed by any keys that only appear in the records.
    """
    # Column skeleton; stays empty when line 1 declares no features.
    node_data = pd.DataFrame()
    row_list = []

    with open(filename) as infile:
        for line_number, row in enumerate(infile):
            if line_number == 0:
                # Skip non-useful first line
                print("Header: ", row)
                continue

            if line_number == 1:
                # Prepare dataframe column labels
                tokens = row.split()
                if len(tokens) == 1:
                    print("This is not a NODE file, so don't load this row")
                else:
                    features = ["id"] + [get_feature_tuple(feature)[1] for feature in tokens]
                    node_data = pd.DataFrame(columns=features)
                continue

            # EDGE files have a "|" character, which needs to be removed for
            # proper feature decoupling.
            tokens = row.replace('|', '').split()
            if not tokens:
                # e.g. a trailing empty line at the end of the file
                continue

            # the first token doesn't need splitting
            row_dict = {'id': tokens[0]}
            for token in tokens[1:]:
                # parse each token once and reuse both pieces
                parts = get_feature_tuple(token)
                row_dict[parts[0]] = parts[1]
            row_list.append(row_dict)

    # Fill in rows (built in one go rather than appended one by one)
    return pd.concat([node_data, pd.DataFrame(row_list)], ignore_index=True)

# Process the email nodes

In [4]:
# Load the ground-truth email nodes and discard the unneeded second-to-last
# column (an artifact of an ambiguous split while parsing).
email_nodes = load_table(FILE_GROUND_TRUTH_EMAIL_NODES).drop(
    columns=['other,manager,specialist,director,executive']
)
resolve_column_type(email_nodes)

email_nodes.dtypes

Header:  NODE	email



id                int64
emailaddress     object
numsent           int64
numreceived       int64
numexchanged      int64
                 ...   
w-kinney        float64
w-veselack      float64
w-mwhitt        float64
w-jarnold       float64
title            object
Length: 5119, dtype: object

In [5]:
email_nodes

Unnamed: 0,id,emailaddress,numsent,numreceived,numexchanged,w-gerald,w-know,w-busi,w-mexicana,w-transact,...,w-bartlo,w-columbiagassubject,w-perron,w-coh,w-agl,w-kinney,w-veselack,w-mwhitt,w-jarnold,title
0,283,c..koehler@enron.com,128,606,734,1.0,1.0,1.0,,1.0,...,,,,,,,,,,director
1,98,scott.goodell@enron.com,98,607,705,1.0,1.0,1.0,,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,specialist
2,183,p..south@enron.com,8,351,359,1.0,1.0,1.0,,,...,,,,,,,,,,director
3,204,lavorato@enron.com,388,3,391,,1.0,1.0,,1.0,...,,,,,,,,,,executive
4,318,mike.grigsby@enron.com,3702,490,4192,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,executive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,114,vkamins@enron.com,0,12,12,,1.0,1.0,,1.0,...,,,,,,,,,,director
207,270,david.duran@enron.com,7,145,152,,1.0,1.0,,1.0,...,,,,,,,,,,director
208,282,sean.crandall@enron.com,94,138,232,,1.0,1.0,,1.0,...,,,,,,,,,,director
209,243,kevin.presto@enron.com,248,198,446,1.0,1.0,1.0,,1.0,...,,,,,,,,,,executive


# Process the CoRef edges

In [6]:
# need to rename one of the columns due to key collision
# (presumably each edge row carries two 'email' tokens, and load_table builds
# one dict per row, so the later one would overwrite the earlier — TODO confirm)
# use copy for safety: the edit below is applied to a copy in the working
# directory, not to the original data file

# Copy the ground-truth coref edge file into the current directory.
!cp $FILE_GROUND_TRUTH_COREF_EDGES .
# GNU sed: with the combined '2g' flag, the first 'email' on each line is kept
# and the 2nd and every later occurrence is replaced by 'alt_email'.
!sed -i 's/email/alt_email/2g' enron.UNDIRECTED.coref.tab

coref_edges = load_table('enron.UNDIRECTED.coref.tab')
resolve_column_type(coref_edges)

coref_edges.dtypes

Header:  UNDIRECTED	coref

This is not a NODE file, so don't load this row


id            int64
email         int64
alt_email     int64
exists       object
dtype: object

In [7]:
coref_edges

Unnamed: 0,id,email,alt_email,exists
0,2856,265,141,NOTEXIST
1,18491,310,295,NOTEXIST
2,516,272,183,NOTEXIST
3,5131,201,19,NOTEXIST
4,12417,138,78,NOTEXIST
...,...,...,...,...
20776,15003,135,208,NOTEXIST
20777,4450,197,47,NOTEXIST
20778,20302,248,25,NOTEXIST
20779,12985,222,118,NOTEXIST


In [8]:
# Sanity Check: These should print pairs of the same people
#for index in coref_edges[coref_edges['exists'] == 'EXIST'][['email', 'alt_email']].index:
    # email_id  = coref_edges.loc[index]['email']
    # alt_email_id = coref_edges.loc[index]['alt_email']

    # print(email_nodes[email_nodes['id'] == email_id]['emailaddress'])
    # print(email_nodes[email_nodes['id'] == alt_email_id]['emailaddress'])
    # print("------------------------------------------------")
    

# Process the Manager edges

In [9]:
# Load in the observed email-submgr.
# need to rename one of the columns due to key collision
# use copy for safety
# NOTE(review): the sed/load_table calls below use the hard-coded name
# 'enron.UNDIRECTED.email-submgr.tab'; that only matches the file copied here
# if FILE_GROUND_TRUTH_MANAGES_EDGES points at a file with that basename —
# confirm the variable's value before relying on this cell.
!cp $FILE_GROUND_TRUTH_MANAGES_EDGES .
# GNU sed: '2g' keeps the first tab-prefixed 'email' on each line and rewrites
# the 2nd and every later one to 'subordinate_email'.
!sed -i 's/\temail/\tsubordinate_email/2g' enron.UNDIRECTED.email-submgr.tab

manager_edges = load_table('enron.UNDIRECTED.email-submgr.tab')

# FIXME: can probably omit this line
# Drops the column literally named 'NOTEXIST,EXIST' (raises KeyError if absent).
manager_edges.drop('NOTEXIST,EXIST', axis=1, inplace=True)

# Cast id/count columns to int and 'w-' columns to float, in place.
resolve_column_type(manager_edges)

manager_edges.dtypes

Header:  UNDIRECTED	email-submgr



id                     int64
w-gerald             float64
w-know               float64
w-busi               float64
w-mexicana           float64
                      ...   
w-jarnold            float64
numexchanged           int64
email                  int64
subordinate_email      int64
exists                object
Length: 5118, dtype: object

In [10]:
manager_edges[manager_edges['exists'] == 'EXIST']

Unnamed: 0,id,w-gerald,w-know,w-busi,w-mexicana,w-transact,w-want,w-thing,w-review,w-questar,...,w-coh,w-agl,w-kinney,w-veselack,w-mwhitt,w-jarnold,numexchanged,email,subordinate_email,exists
0,2693,,1.0,,,1.0,1.0,,1.0,,...,,,,,,,6,286,324,EXIST
3,1406,,,,,1.0,1.0,,,,...,,,,,,,3,57,313,EXIST
22,3184,,,,,,,,1.0,,...,,,,,,,1,30,283,EXIST
34,3204,,1.0,,,1.0,1.0,,1.0,,...,,,,,,,120,30,143,EXIST
39,2717,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,...,,,,,1.0,,155,199,46,EXIST
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999,2950,,,,,,,,,,...,,,,,,,1,164,55,EXIST
2006,3823,,1.0,,,1.0,,,1.0,,...,,,,,,,36,149,64,EXIST
2027,3379,,,1.0,,,1.0,,,,...,,,,,,,13,26,157,EXIST
2031,3163,,1.0,,,,1.0,1.0,,,...,,,,,,,12,34,269,EXIST


# Split data into observed and targets (AKA train and test)

## Email Nodes

In [11]:
# Grab the sample split from the original experiment and discard the unneeded
# second-to-last column (an artifact of an ambiguous split while parsing).
sample_email_nodes = load_table(FILE_SAMPLE_EMAIL_NODES).drop(
    columns=['other,manager,specialist,director,executive']
)
resolve_column_type(sample_email_nodes)

Header:  NODE	email



In [12]:
# calculate the split:
#   observed = ground-truth rows whose id has a title in the sample
#   truth    = ground-truth rows whose id has no title in the sample
email_nodes_obs = email_nodes.loc[
    email_nodes['id'].isin(
        sample_email_nodes.loc[sample_email_nodes['title'].notna(), 'id']
    )
]
email_nodes_truth = email_nodes.loc[
    email_nodes['id'].isin(
        sample_email_nodes.loc[sample_email_nodes['title'].isna(), 'id']
    )
]

In [None]:
# Add in the non-existent observations


# (DEPRECATED) Split data into observed and targets (AKA train and test)

In [64]:
# This is a sanity check to see the UNDIRECTED managerial edges
# for index in manager_edges[manager_edges['exists'] == 'EXIST'].index:
#    print("Email is: ", manager_edges.loc[index]['email'], "--- Sub Email is: ", manager_edges.loc[index]['subordinate_email'])
#    print("Title is: ", email_nodes[email_nodes['id'] == manager_edges.loc[index]['email']]['title'].iloc[0], " --- Title is: ", email_nodes[email_nodes['id'] == manager_edges.loc[index]['subordinate_email']]['title'].iloc[0])

In [65]:

#email_nodes_observed_indices =
#email_nodes_targets = email_nodes[email_nodes['title'].isna()]

#coref_edges_observed = coref_edges[coref_edges['exists'].notna()]
#coref_edges_targets = coref_edges[coref_edges['exists'].isna()]

#manager_edges_observed = manager_edges[manager_edges['exists'].notna()]
#manager_edges_targets = manager_edges[manager_edges['exists'].isna()]

# Sanity check to see if the splits match up with the paper.

# print("email_node_observed: ", len(email_nodes_observed))
# print("email_node_target: ", len(email_nodes_targets))

# print("coref_edges_observed: ", len(coref_edges_observed))
# print("coref_edges_target: ", len(coref_edges_targets))

# print("manager_edges_observed: ", len(manager_edges_observed))
# print("manager_edges_target: ", len(manager_edges_targets))

# Prepare data for PSL predicates

In [None]:
# Takes a table and fills the missing pairs and values to specify a full, sufficient set
# So far it only works with binary predicates
def fill_observed_missing_possibilities(table, arguments, values):
    """Append a zero-valued row for every (first argument, value) pair not in ``table``.

    Args:
        table: DataFrame holding the observed rows.
        arguments: ``[first_column, second_column, value_column]``. The first
            two name the columns forming the observed pairs; the third names
            the column that is set to 0 on every added row.
        values: every value the second column may take.

    Returns:
        A new DataFrame: the rows of ``table`` followed by one row per missing
        pair. The index is not reset, so added rows are labelled from 0 again.
        Added rows follow the order of first appearance in
        ``itertools.product(table[first_column], values)``.

    Pairs are read column-wise rather than through per-row ``table.loc[index]``
    lookups, so the function also works when ``table`` has repeated index
    labels (as its own return value does).
    """
    first_column, second_column = arguments[0], arguments[1]

    observed_pairs = set(zip(table[first_column], table[second_column]))
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    candidate_pairs = dict.fromkeys(itertools.product(table[first_column], values))

    missing_rows = [
        {first_column: arg_0, second_column: arg_1, arguments[2]: 0}
        for arg_0, arg_1 in candidate_pairs
        if (arg_0, arg_1) not in observed_pairs
    ]

    if not missing_rows:
        # Nothing to add; still hand back a fresh frame like the concat path does.
        return table.copy()

    return pd.concat([table, pd.DataFrame(missing_rows)])

## Predicate: EmailHasLabel(E, L)

### Observed

In [None]:
# Integer code for each job title.
title_map = {"other": 0, "manager": 1, "specialist": 2, "director": 3, "executive": 4}

# Start from the observed split computed earlier in the notebook
# (email_nodes_obs). This cell previously read `email_nodes_observed`, a name
# that no live cell assigns, so it failed on a fresh kernel.
# The copy is to suppress an in-place warning
email_has_label_obs = email_nodes_obs[['id', 'title']].copy()
# convert titles to integers, so PSL can ground faster
email_has_label_obs = email_has_label_obs.replace({'title': title_map})
# add in an existence column
email_has_label_obs['exists'] = 1.0

In [None]:
# Build the full observed set: keep the observed rows (exists = 1) and add a
# 0-valued row for every (id, title) combination that was not observed.
full_set_email_has_label_obs = fill_observed_missing_possibilities(
    table=email_has_label_obs,
    arguments=['id', 'title', 'exists'],
    values=[*title_map.values()],
)
full_set_email_has_label_obs

In [None]:
# Outputs to file
# full_set_email_has_label_obs.to_csv('EmailHasLabel_obs.csv', sep ='\t', index=False, header=False)

### Truth/Targets

In [None]:
# Reload the ground-truth email nodes through the shared path constant rather
# than a second hard-coded copy of the same path.
ground_truth_email_nodes = load_table(FILE_GROUND_TRUTH_EMAIL_NODES)
resolve_column_type(ground_truth_email_nodes)
ground_truth_email_nodes

In [None]:
# Grab the ground truths for the missing annotations.
# The target ids come from email_nodes_truth, the split computed earlier in the
# notebook. This cell previously read `email_nodes_targets`, a name that no
# live cell assigns.
email_has_label_truth = ground_truth_email_nodes[ground_truth_email_nodes['id'].isin(list(email_nodes_truth['id']))][['id', 'title']].copy()

# Convert titles to integers so PSL can ground faster
email_has_label_truth = email_has_label_truth.replace({"title": title_map})
# Add in an existence column
email_has_label_truth['exists'] = 1.0

# Add a 0-valued row for every (id, title) combination that is not the truth.
full_set_email_has_label_truth = fill_observed_missing_possibilities(email_has_label_truth, ['id', 'title', 'exists'], list(title_map.values()) )
full_set_email_has_label_truth

In [None]:
# Outputs to file
# full_set_email_has_label_truth.to_csv('EmailHasLabel_truth.csv', sep ='\t', index=False, header=False)

## Predicate: CoRef(E1, E2)

### Observed

In [None]:
# Maps the raw 'exists' labels to numeric truth values; the numeric keys make
# the mapping a no-op on values that were already converted.
exists_map = {"NOTEXIST": 0.0, "EXIST": 1.0, 0.0: 0.0, 1.0: 1.0}
# NOTE(review): `coref_edges_observed` is read here before any live cell in
# this notebook assigns it (its only assignment is in a commented-out line of
# the deprecated split section) — this raises NameError on a fresh kernel
# unless it is defined elsewhere; confirm.
coref_edges_observed = coref_edges_observed.replace({"exists": exists_map})

coref_edges_observed

In [None]:
# Add in a symmetrical table: (email, alt_email) + (alt_email, email)

# Swap the two endpoint columns. rename() is used in its returning form so no
# in-place modification is attempted on a column-selection slice (which
# triggers pandas' SettingWithCopyWarning).
coref_edges_observed_symm = coref_edges_observed[['id', 'alt_email', 'email', 'exists']].rename(
    columns={'alt_email': 'email', 'email': 'alt_email'}
)
# NOTE(review): the combined table is only displayed here, not stored.
pd.concat([coref_edges_observed, coref_edges_observed_symm])

In [None]:
# Specify the full observed set, add in 1s for the observed, and 0s for the missing possibilities
# full_set_coref_edges_obs = fill_observed_missing_possibilities(coref_edges_observed, ['email', 'alt_email', 'exists'], list(exists_map.values()))

# Collect every observed edge as an (email, alt_email) pair of ints. The two
# columns are walked together once, instead of performing a .loc row lookup
# per endpoint per row (which is slow and fails on repeated index labels).
pairs = {
    (int(email_id), int(alt_email_id))
    for email_id, alt_email_id in zip(coref_edges_observed['email'], coref_edges_observed['alt_email'])
}
# Same edges with the endpoints swapped.
reverse_pairs = {(alt_email_id, email_id) for email_id, alt_email_id in pairs}
# Both directions, so membership tests do not depend on endpoint order.
total_observed_undirected_edges = pairs | reverse_pairs

# FIXME: make sure to not taint the new set with edges from the target.  Need to make sets of pairs from the target
# target_edges

In [None]:
# Setting up the total universe of possible edges: every ordered pair of
# positions in the id column, so each edge is present in both directions.
total_possible_undirected_edges = set(itertools.permutations(email_nodes['id'], 2))

In [None]:
len(total_possible_undirected_edges)

In [None]:
# FIXME: needs to print the missing edges, and check for "target taint"

#len(total_possible_undirected_edges - total_observed_undirected_edges)
# Every target edge in both directions: (email, alt_email) | (alt_email, email).
# NOTE(review): `coref_edges_targets` has no live assignment visible in this
# notebook (only a commented-out one in the deprecated split section) — confirm
# where it is meant to come from.
target_set = set((coref_edges_targets.loc[index]['email'], coref_edges_targets.loc[index]['alt_email']) for index in coref_edges_targets.index) | set((coref_edges_targets.loc[index]['alt_email'], coref_edges_targets.loc[index]['email']) for index in coref_edges_targets.index)
len(target_set)

In [None]:
len((total_possible_undirected_edges - total_observed_undirected_edges - target_set))

In [None]:
# Outputs to file
# coref_edges_observed.to_csv('CoRef_obs.csv', sep ='\t', index=False, header=False, columns=['email', 'alt_email', 'exists'])

### Truth/Targets

In [None]:
# need to rename one of the columns due to key collision
# use copy for safety
# NOTE(review): this repeats the copy/sed/load steps of the earlier
# "Process the CoRef edges" cell with the path written out literally; it
# re-copies and overwrites ./enron.UNDIRECTED.coref.tab.
!cp ../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.UNDIRECTED.coref.tab .
# GNU sed: '2g' keeps the first 'email' on each line and rewrites the 2nd and
# every later occurrence to 'alt_email'.
!sed -i 's/email/alt_email/2g' enron.UNDIRECTED.coref.tab

ground_truth_coref_edges = load_table('enron.UNDIRECTED.coref.tab')
resolve_column_type(ground_truth_coref_edges)
ground_truth_coref_edges

In [None]:
# Need to fill in the missing 'exists' annotations
# NOTE(review): `coref_edges_targets` has no live assignment visible in this
# notebook — confirm where it is defined.
coref_edges_targets

In [None]:
# Grab the ground truths for the missing annotations: keep the ground-truth
# edges whose id also appears among the targets.
# NOTE(review): relies on `coref_edges_targets` and `exists_map` from other
# cells; `coref_edges_targets` has no live assignment visible here — confirm.
coref_edges_truth = ground_truth_coref_edges[ground_truth_coref_edges['id'].isin(list(coref_edges_targets['id']))].copy()
# Convert the 'exists' labels to their numeric values.
coref_edges_truth = coref_edges_truth.replace({"exists": exists_map})
coref_edges_truth

In [None]:
# Output to file
# coref_edges_truth.to_csv('CoRef_truth.csv', sep ='\t', index=False, header=False, columns=['email', 'alt_email', 'exists'])

## Predicate: EmailManages(E1, E2)

### Observed

In [None]:
manager_edges_observed

In [None]:

# Convert the 'exists' labels to their numeric values via exists_map.
# NOTE(review): `manager_edges_observed` is read before any live cell in this
# notebook assigns it (only a commented-out assignment exists in the deprecated
# split section) — confirm where it is meant to come from.
manager_edges_observed = manager_edges_observed.replace({'exists': exists_map})
# Show only the columns that are written out for the predicate.
manager_edges_observed[['email', 'subordinate_email', 'exists']]

In [None]:
# Output to csv
# manager_edges_observed.to_csv('Manages_obs.csv', sep ='\t', index=False, header=False, columns=['email', 'subordinate_email', 'exists'])

In [None]:
# Sanity check their blocking method.  
# Check if the candidate managerial relationships are really limited to pairs that are connected in the communication network
# TODO: Some set operations between manager_edges and the communication_edges


### Truth/Targets

In [None]:
# Need to fill in the annotations for these targets
# NOTE(review): `manager_edges_targets` has no live assignment visible in this
# notebook — confirm where it is defined.
manager_edges_targets

In [None]:
# Load in the ground truth for email-submgr.
# need to rename one of the columns due to key collision
# use copy for safety
# NOTE(review): this repeats the copy/sed/load steps of the earlier
# "Process the Manager edges" cell with the path written out literally; it
# re-copies and overwrites ./enron.UNDIRECTED.email-submgr.tab.
!cp ../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.UNDIRECTED.email-submgr.tab .
# FIXME: this is tainting the column names
# GNU sed: '2g' keeps the first tab-prefixed 'email' on each line and rewrites
# the 2nd and every later one to 'subordinate_email'.
!sed -i 's/\temail/\tsubordinate_email/2g' enron.UNDIRECTED.email-submgr.tab

ground_truth_manager_edges = load_table('enron.UNDIRECTED.email-submgr.tab')

# FIXME: can probably omit this line
# Drops the column literally named 'NOTEXIST,EXIST' (raises KeyError if absent).
ground_truth_manager_edges.drop('NOTEXIST,EXIST', axis=1, inplace=True)

# Cast id/count columns to int and 'w-' columns to float, in place.
resolve_column_type(ground_truth_manager_edges)

ground_truth_manager_edges.dtypes

In [None]:
ground_truth_manager_edges

In [None]:
# Fill in the missing annotations: keep the ground-truth manager edges whose id
# also appears among the targets.
# The .copy() now applies to the filtered frame (as in the CoRef truth cell);
# it used to sit inside the brackets, where it only copied the boolean mask.
# NOTE(review): `manager_edges_targets` has no live assignment visible in this
# notebook — confirm where it is defined.
manager_edges_truth = ground_truth_manager_edges[ground_truth_manager_edges['id'].isin(manager_edges_targets['id'])].copy()
# Convert the 'exists' labels to their numeric values.
manager_edges_truth = manager_edges_truth.replace({'exists': exists_map})
manager_edges_truth

In [None]:
# Outputs to file
# manager_edges_truth.to_csv('Manages_truth.csv', sep ='\t', index=False, header=False, columns=['email', 'subordinate_email', 'exists'])