In [1]:
DATA_DIR =  "../c3/namata-kdd11-data"
!ls $DATA_DIR

citeseer  cora	enron  FileFormat.txt  README


In [2]:
!ls $DATA_DIR/enron/enron-samples-lowunk/enron-sample-lowunk-1of6

sample-enron.DIRECTED.emailof.tab     sample-enron.NODE.person.tab
sample-enron.DIRECTED.sentto.tab      sample-enron.UNDIRECTED.coref.tab
sample-enron.GRAPH.communication.tab  sample-enron.UNDIRECTED.email-submgr.tab
sample-enron.NODE.email.tab	      sample-enron.UNDIRECTED.submgr.tab


In [3]:
import pandas as pd
import re

In [4]:
# used for extracting feature name
# returns: tuple (feature_name, feature_value, optional_value)

def get_feature_tuple(feature):
    feature_data = re.split(r"[:=]", feature)
    return feature_data
    

# Loading in the sparse, tab delimited node file

In [5]:
# initialize the pandas dataframe
node_data = pd.DataFrame()


with open('../c3/namata-kdd11-data/enron/enron-samples-lowunk/enron-sample-lowunk-1of6/sample-enron.NODE.email.tab') as infile:
  i = 0
  for row in infile:
    
    # print('i is: ', i)

    if i == 0:
        # Skip non-useful first line
        print("Header: ", row)
    elif i == 1:
        # Prepare dataframe column labels
        tokens = row.split()
        features = ["node"] + [get_feature_tuple(feature)[1] for feature in tokens]
        node_data = pd.DataFrame(columns=features)
    else:
        # Fill in rows
        tokens = row.split()
        
        # the first token doesn't need splitting
        row_dict = {'node':tokens[0]}
        row_dict.update({get_feature_tuple(token)[0]:get_feature_tuple(token)[1] for token in tokens[1:]})
        
        node_data = pd.concat([node_data, pd.DataFrame([row_dict])])
        
    i += 1

Header:  NODE	email



In [6]:
pd.set_option('display.max_columns', 50)

node_data

Unnamed: 0,node,emailaddress,numsent,numreceived,numexchanged,w-gerald,w-know,w-busi,w-mexicana,w-transact,w-want,w-thing,w-review,w-questar,w-open,w-season,w-doc,w-end,w-tomorrow,w-close,w-follow,w-day,w-let,w-need,w-anyth,...,w-oren,w-parkhil,w-requirementscopi,w-pintoleiteenron,w-rh,w-trco,w-dkinney,w-dalphon,w-cdalpho,w-kinneyph,w-pdrexel,w-drexelius,w-columbiaga,w-columbiagascc,w-bartlo,w-columbiagassubject,w-perron,w-coh,w-agl,w-kinney,w-veselack,w-mwhitt,w-jarnold,"other,manager,specialist,director,executive",title
0,98,scott.goodell@enron.com,98.0,607.0,705.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,specialist
0,283,c..koehler@enron.com,128.0,606.0,734.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,,director
0,183,p..south@enron.com,8.0,351.0,359.0,1.0,1.0,1.0,,,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,director
0,204,lavorato@enron.com,388.0,3.0,391.0,,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,executive
0,303,t..hodge@enron.com,95.0,570.0,665.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,1.0,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,114,vkamins@enron.com,0.0,12.0,12.0,,1.0,1.0,,1.0,1.0,1.0,,,,,1.0,,1.0,,1.0,,1.0,,,...,,,,,,,,,,,,,,,,,,,,,,,,,director
0,282,sean.crandall@enron.com,94.0,138.0,232.0,,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,director
0,270,david.duran@enron.com,7.0,145.0,152.0,,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,director
0,243,kevin.presto@enron.com,248.0,198.0,446.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,executive


# Clean up columns

In [7]:
# convert relevant columns to floats (since ints won't take NaNs)

for column in node_data.columns[2:-2]:
    node_data[column] = node_data[column].astype(str).astype(float)

In [8]:
node_data.dtypes

node                                            object
emailaddress                                    object
numsent                                        float64
numreceived                                    float64
numexchanged                                   float64
                                                ...   
w-veselack                                     float64
w-mwhitt                                       float64
w-jarnold                                      float64
other,manager,specialist,director,executive     object
title                                           object
Length: 5120, dtype: object

In [9]:
# remove the second to last column (it came from an ambiguous parsing)

node_data.drop('other,manager,specialist,director,executive', axis=1, inplace=True)
node_data

Unnamed: 0,node,emailaddress,numsent,numreceived,numexchanged,w-gerald,w-know,w-busi,w-mexicana,w-transact,w-want,w-thing,w-review,w-questar,w-open,w-season,w-doc,w-end,w-tomorrow,w-close,w-follow,w-day,w-let,w-need,w-anyth,...,w-kindal,w-oren,w-parkhil,w-requirementscopi,w-pintoleiteenron,w-rh,w-trco,w-dkinney,w-dalphon,w-cdalpho,w-kinneyph,w-pdrexel,w-drexelius,w-columbiaga,w-columbiagascc,w-bartlo,w-columbiagassubject,w-perron,w-coh,w-agl,w-kinney,w-veselack,w-mwhitt,w-jarnold,title
0,98,scott.goodell@enron.com,98.0,607.0,705.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,specialist
0,283,c..koehler@enron.com,128.0,606.0,734.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,1.0,1.0,,,,,,,,,,,,,,,,,,,,director
0,183,p..south@enron.com,8.0,351.0,359.0,1.0,1.0,1.0,,,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,director
0,204,lavorato@enron.com,388.0,3.0,391.0,,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,executive
0,303,t..hodge@enron.com,95.0,570.0,665.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,1.0,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,114,vkamins@enron.com,0.0,12.0,12.0,,1.0,1.0,,1.0,1.0,1.0,,,,,1.0,,1.0,,1.0,,1.0,,,...,,,,,,,,,,,,,,,,,,,,,,,,,director
0,282,sean.crandall@enron.com,94.0,138.0,232.0,,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,director
0,270,david.duran@enron.com,7.0,145.0,152.0,,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,director
0,243,kevin.presto@enron.com,248.0,198.0,446.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,executive
