## I/O and Preprocessing 

In [1]:
import pandas as pd

In [300]:
# Both sheets live in the same Google spreadsheet; only the tab (gid) differs.
# Keeping the spreadsheet id in one place avoids the four hand-copied
# occurrences drifting apart when the document is replaced.
SPREADSHEET_ID = '1Sjp9PG75Ap-5YBvOWZ-cCUGkNhN41LZlz3OL-gJ-tKU'
CSV_EXPORT_URL = (
    'https://docs.google.com/spreadsheets/d/{sheet_id}'
    '/export?format=csv&id={sheet_id}&gid={gid}'
)

software_url = CSV_EXPORT_URL.format(sheet_id=SPREADSHEET_ID, gid='1702131855')
usage_url = CSV_EXPORT_URL.format(sheet_id=SPREADSHEET_ID, gid='1374288343')

In [301]:
# Download each sheet as CSV, then discard rows in which every cell is empty.
software_data = pd.read_csv(software_url)
software_data = software_data.dropna(how='all')

usage_data = pd.read_csv(usage_url)
usage_data = usage_data.dropna(how='all')

In [265]:
def whitestrip_elements(x):
    '''
    Cleans a single cell value.

    - Strings whose first character is '-' are replaced by None.
    - Any other string is returned with leading/trailing whitespace removed
      (an empty string is returned unchanged).
    - Non-string values are returned untouched.
    '''
    if not isinstance(x, str):
        return x
    # startswith() is safe on an empty string, unlike indexing x[0].
    if x.startswith('-'):
        return None
    return x.strip()

In [266]:
# Run whitestrip_elements over every cell of the raw software sheet.
software = software_data.applymap(whitestrip_elements)

## Nodes 

In [267]:
# Distinct values of the 'Name' column, in order of first appearance.
unique_names = software['Name'].unique()
all_names = list(unique_names)

In [268]:
# Collect every non-missing value found in the tag columns (column positions
# 4 through 22) and keep one sorted copy of each.
# .iloc replaces the removed .ix indexer, and the set comprehension replaces
# the removed Series.append accumulation loop.
tag_columns = software.iloc[:, 4:23]
tag_list = sorted({
    tag
    for col in tag_columns.columns
    for tag in software[col].dropna()
})

In [269]:
# One row per distinct tag, marked with type 'tag'.
unique_tags = pd.Series(tag_list).dropna().drop_duplicates()
tag_df = pd.DataFrame({'label': unique_tags})
tag_df['type'] = 'tag'

In [270]:
# One row per distinct software name, marked with type 'name'.
unique_software_names = pd.Series(all_names).dropna().drop_duplicates()
name_df = pd.DataFrame({'label': unique_software_names})
name_df['type'] = 'name'

In [293]:
# Stack tag nodes and name nodes into a single node table with a fresh
# 0..n-1 index. pd.concat is used because DataFrame.append was removed in
# pandas 2.0.
node_df = pd.concat([tag_df, name_df], ignore_index=True)

In [294]:
# Order the nodes by label, renumber the rows, and use the new row number as
# the node id. reset_index() keeps the previous index as an 'index' column.
label_sorted = node_df.sort_values(by='label')
node_df = label_sorted.reset_index()
node_df['id'] = node_df.index

In [295]:
# Write only the id/label/type columns to nodes.csv, without the row index.
node_columns = ['id', 'label', 'type']
node_df[node_columns].to_csv('nodes.csv', index=False)

In [297]:
# Index the node table by label (keeping 'label' as a column too) so that
# rows can be looked up by label.
node_df = node_df.set_index('label', drop=False)

## Edges

In [299]:
def build_name_tag_dict(row):
    '''
    Builds a dictionary as follows:
    {'Name': 'Software_Name', 'Tags': ['tag1', 'tag2']}

    row: a pandas Series with a 'Name' entry; the values at positions
         4 through 22 are treated as tags, and missing values are skipped.
    '''
    # Positional slice via .iloc (the .ix indexer was removed in pandas 1.0).
    tags = list(row.iloc[4:23].dropna())
    return {'Name': row['Name'], 'Tags': tags}

# One {'Name': ..., 'Tags': [...]} record per row of the cleaned sheet.
software_dict = [
    build_name_tag_dict(software_row)
    for _, software_row in software.iterrows()
]

In [363]:
def build_edges(d, label_type='id'):
    '''
    Builds list of tuples to represent edges between the source and target
    Returns [(source1, target1), (source2, target2)]

    d: dictionary of the form {'Name': 'Software_Name', 'Tags': ['tag1', ...]}
    label_type: {'id', 'label'}
        NOTE: accepted for compatibility but not used; edges are always
        expressed with the values of the 'id' column.

    Reads the module-level node_df, which must be indexed by label and have
    'id' and 'type' columns. A name or tag missing from node_df raises
    KeyError.
    '''
    # Names and tags are looked up separately so that a label is only matched
    # against nodes of the right type.
    by_type = node_df.groupby('type')
    name_ids = by_type.get_group('name')['id']
    tag_ids = by_type.get_group('tag')['id']

    # Label-based lookups via .loc (the .ix indexer was removed in pandas 1.0).
    source_id = name_ids.loc[d['Name']]
    return [(source_id, tag_ids.loc[target]) for target in d['Tags']]

In [370]:
# Flatten the per-software edge lists into one list of (source, target) pairs.
edge_list = []
for name_tags in software_dict:
    edge_list.extend(build_edges(name_tags, label_type='label'))

In [371]:
# Write the edges to edges.csv with a Source/Target header and no row index.
# Passing the column names to the constructor (instead of assigning them
# afterwards) also works when edge_list is empty, in which case only the
# header row is written.
edge_df = pd.DataFrame(edge_list, columns=['Source', 'Target'])
edge_df.to_csv('edges.csv', index=False)