# This notebook records each step of refactoring the graph data into PSL data

In [1]:
import pandas as pd
import re

# Loading in the sparse NODE file

In [2]:
# Breaks a raw feature token apart at every ':' and '=' character.
# returns: list of the pieces in their original order, e.g.
#          [feature_name, feature_value] or, when a third piece is present,
#          [feature_name, feature_value, optional_value]
def get_feature_tuple(feature):
    delimiters = re.compile(r"[:=]")
    return delimiters.split(feature)
    

# loads the NODE*.tab files into a Pandas Dataframe.
#
# Layout the parser relies on:
#   line 0  : a header line; it is printed and otherwise ignored
#   line 1  : whitespace-separated feature declarations; the second piece of
#             each declaration (see get_feature_tuple) becomes a column label
#   line 2+ : one node per line: the node id followed by whitespace-separated
#             feature tokens whose first piece is the column and second piece
#             is the value
#
# Features a row does not mention are left as NaN. Feature names that appear in
# the rows but not in the declaration line are appended as extra columns, in
# order of first appearance. Every value is kept as the string read from the
# file; no numeric conversion happens here.
#
# returns: pd.DataFrame(columns=features)
def load_node_table(filename):
    features = None   # column labels from the declaration line (None until seen)
    records = []      # one dict per node row; turned into a frame once at the end

    with open(filename) as infile:
        for line_number, row in enumerate(infile):
            if line_number == 0:
                # Skip non-useful first line
                print("Header: ", row)
                continue

            tokens = row.split()

            if line_number == 1:
                # Prepare dataframe column labels
                features = ["node"] + [get_feature_tuple(feature)[1] for feature in tokens]
                continue

            if not tokens:
                # A blank line carries no node id; ignore it rather than crash.
                continue

            # the first token doesn't need splitting
            record = {'node': tokens[0]}
            for token in tokens[1:]:
                # Split each token once and reuse both pieces.
                feature_data = get_feature_tuple(token)
                record[feature_data[0]] = feature_data[1]
            records.append(record)

    if features is None:
        # The file had no declaration line, so there is nothing to label.
        return pd.DataFrame()

    # Declared columns first, then any undeclared feature names in the order
    # the rows introduce them.
    columns = list(features)
    known = set(columns)
    for record in records:
        for key in record:
            if key not in known:
                known.add(key)
                columns.append(key)

    # Build the frame in one step instead of concatenating per row (which
    # copies the whole frame every time). dtype=object keeps the raw strings
    # and gives all-missing columns the same dtype as the populated ones.
    return pd.DataFrame(records, columns=columns, dtype=object)

In [3]:
# Parse the Enron email NODE table into a DataFrame, then display it.
email_nodes = load_node_table('../c3/namata-kdd11-data/enron/enron-samples-lowunk/outputgraph/enron.NODE.email.tab')
email_nodes

Header:  NODE	email



Unnamed: 0,node,emailaddress,numsent,numreceived,numexchanged,w-gerald,w-know,w-busi,w-mexicana,w-transact,...,w-columbiagassubject,w-perron,w-coh,w-agl,w-kinney,w-veselack,w-mwhitt,w-jarnold,"other,manager,specialist,director,executive",title
0,283,c..koehler@enron.com,128.0,606.0,734.0,1.0,1.0,1.0,,1.0,...,,,,,,,,,,director
1,98,scott.goodell@enron.com,98.0,607.0,705.0,1.0,1.0,1.0,,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,,,,specialist
2,183,p..south@enron.com,8.0,351.0,359.0,1.0,1.0,1.0,,,...,,,,,,,,,,director
3,204,lavorato@enron.com,388.0,3.0,391.0,,1.0,1.0,,1.0,...,,,,,,,,,,executive
4,318,mike.grigsby@enron.com,3702.0,490.0,4192.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,executive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,114,vkamins@enron.com,0.0,12.0,12.0,,1.0,1.0,,1.0,...,,,,,,,,,,director
207,270,david.duran@enron.com,7.0,145.0,152.0,,1.0,1.0,,1.0,...,,,,,,,,,,director
208,282,sean.crandall@enron.com,94.0,138.0,232.0,,1.0,1.0,,1.0,...,,,,,,,,,,director
209,243,kevin.presto@enron.com,248.0,198.0,446.0,1.0,1.0,1.0,,1.0,...,,,,,,,,,,executive


In [4]:
# Let pandas show up to 50 columns before it elides the middle of a wide frame.
pd.set_option('display.max_columns', 50)

email_nodes

Unnamed: 0,node,emailaddress,numsent,numreceived,numexchanged,w-gerald,w-know,w-busi,w-mexicana,w-transact,w-want,w-thing,w-review,w-questar,w-open,w-season,w-doc,w-end,w-tomorrow,w-close,w-follow,w-day,w-let,w-need,w-anyth,...,w-oren,w-parkhil,w-requirementscopi,w-pintoleiteenron,w-rh,w-trco,w-dkinney,w-dalphon,w-cdalpho,w-kinneyph,w-pdrexel,w-drexelius,w-columbiaga,w-columbiagascc,w-bartlo,w-columbiagassubject,w-perron,w-coh,w-agl,w-kinney,w-veselack,w-mwhitt,w-jarnold,"other,manager,specialist,director,executive",title
0,283,c..koehler@enron.com,128.0,606.0,734.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,director
1,98,scott.goodell@enron.com,98.0,607.0,705.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,specialist
2,183,p..south@enron.com,8.0,351.0,359.0,1.0,1.0,1.0,,,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,director
3,204,lavorato@enron.com,388.0,3.0,391.0,,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,executive
4,318,mike.grigsby@enron.com,3702.0,490.0,4192.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,executive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206,114,vkamins@enron.com,0.0,12.0,12.0,,1.0,1.0,,1.0,1.0,1.0,,,,,1.0,,1.0,,1.0,,1.0,,,...,,,,,,,,,,,,,,,,,,,,,,,,,director
207,270,david.duran@enron.com,7.0,145.0,152.0,,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,director
208,282,sean.crandall@enron.com,94.0,138.0,232.0,,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,director
209,243,kevin.presto@enron.com,248.0,198.0,446.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,executive


# Clean up columns

In [6]:
# Cast the feature columns to float (an int dtype cannot hold the NaNs that
# mark absent features). The slice leaves out the first two and the last two
# columns, which stay as they are.
numeric_columns = email_nodes.columns[2:-2]

for column in numeric_columns:
    as_text = email_nodes[column].astype(str)
    email_nodes[column] = as_text.astype(float)

In [7]:
# Inspect the per-column dtypes of the node table.
email_nodes.dtypes

node                                            object
emailaddress                                    object
numsent                                        float64
numreceived                                    float64
numexchanged                                   float64
                                                ...   
w-veselack                                     float64
w-mwhitt                                       float64
w-jarnold                                      float64
other,manager,specialist,director,executive     object
title                                           object
Length: 5120, dtype: object

In [8]:
# remove the second to last column (it came from an ambiguous parsing)
# Both steps are guarded and return new frames instead of mutating in place,
# so re-running this cell is a no-op rather than a KeyError.
ambiguous_column = 'other,manager,specialist,director,executive'
if ambiguous_column in email_nodes.columns:
    email_nodes = email_nodes.drop(columns=ambiguous_column)

# remove the first index column, use the node ids instead
if 'node' in email_nodes.columns:
    email_nodes = email_nodes.set_index('node')
email_nodes

Unnamed: 0_level_0,emailaddress,numsent,numreceived,numexchanged,w-gerald,w-know,w-busi,w-mexicana,w-transact,w-want,w-thing,w-review,w-questar,w-open,w-season,w-doc,w-end,w-tomorrow,w-close,w-follow,w-day,w-let,w-need,w-anyth,w-els,...,w-kindal,w-oren,w-parkhil,w-requirementscopi,w-pintoleiteenron,w-rh,w-trco,w-dkinney,w-dalphon,w-cdalpho,w-kinneyph,w-pdrexel,w-drexelius,w-columbiaga,w-columbiagascc,w-bartlo,w-columbiagassubject,w-perron,w-coh,w-agl,w-kinney,w-veselack,w-mwhitt,w-jarnold,title
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
283,c..koehler@enron.com,128.0,606.0,734.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,director
98,scott.goodell@enron.com,98.0,607.0,705.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,specialist
183,p..south@enron.com,8.0,351.0,359.0,1.0,1.0,1.0,,,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,director
204,lavorato@enron.com,388.0,3.0,391.0,,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,executive
318,mike.grigsby@enron.com,3702.0,490.0,4192.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,executive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,vkamins@enron.com,0.0,12.0,12.0,,1.0,1.0,,1.0,1.0,1.0,,,,,1.0,,1.0,,1.0,,1.0,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,director
270,david.duran@enron.com,7.0,145.0,152.0,,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,director
282,sean.crandall@enron.com,94.0,138.0,232.0,,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,director
243,kevin.presto@enron.com,248.0,198.0,446.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,executive


# Prepare data for PSL predicates

## Predicate: HasLabel

In [9]:
# Keep only the title column as a one-column DataFrame; the frame's index is carried along.
email_has_label = email_nodes[["title"]]
email_has_label 

Unnamed: 0_level_0,title
node,Unnamed: 1_level_1
283,director
98,specialist
183,director
204,executive
318,executive
...,...
114,director
270,director
282,director
243,executive


In [None]:
# Write the HasLabel data to a tab-separated file (currently disabled; uncomment to export)

# email_has_label.to_csv('EmailHasLabel.csv', sep ='\t')

## Predicate: CoRef

## Load in the UNDIRECTED edges file