In [11]:
#import us_news
#dfs = []
#for schooltype in ['colleges', 'community', 'gradschools', 'online', 'highschools', 'private']:
    #dfs.append(us_news.get_schools(schooltype))
#df = pd.concat(dfs)
import pandas as pd
import plotly.express as px
import numpy as np

In [12]:
def strip_formatting(id):
    """Normalise an identifier string for matching.

    The value is lower-cased and then has leading/trailing whitespace
    removed, so that e.g. ``' U1 '`` and ``'u1'`` compare equal.
    """
    lowered = id.lower()
    return lowered.strip()

# Load the three node tables. Each one is renamed to share the `id` / `label`
# columns, has its ids normalised with strip_formatting, and is tagged with a
# `type` so the tables can later be concatenated into one node list.
companies = pd.read_csv('data/companies.csv').rename(
    columns={
        'company': 'label',
        'company_id': 'id',
        'comp_latitude': 'latitude',
        'comp_longitude': 'longitude',
    }
)
companies['id'] = companies['id'].map(strip_formatting)
companies['type'] = 'company'

universities = pd.read_csv('data/universities.csv').rename(
    columns={'Colleges': 'label', 'university_id': 'id'}
)
# The remaining university column headers are normalised as well.
universities.columns = [strip_formatting(col) for col in universities.columns]
universities['id'] = universities['id'].map(strip_formatting)
universities['type'] = 'university'

streams = pd.read_csv('data/stream.csv').rename(
    columns={'stream': 'label', 'stream_id': 'id'}
)
# Normalise the `id` column itself: `stream_id` was renamed to `id` above, and
# `id` is what gets matched against the (normalised) link tables below.
streams['id'] = streams['id'].map(strip_formatting)
streams['type'] = 'stream'

# Load the two link tables, dropping columns that are entirely empty, and
# normalise the id columns on both sides of each link.
companystream = pd.read_csv('data/company-stream.csv').dropna(axis=1, how='all')
companystream['company_id'] = companystream['company_id'].map(strip_formatting)
companystream['stream_id'] = companystream['stream_id'].map(strip_formatting)

streamuniversity = pd.read_csv('data/stream-university.csv').dropna(axis=1, how='all')
streamuniversity['stream_id'] = streamuniversity['stream_id'].map(strip_formatting)
streamuniversity['university_id'] = streamuniversity['university_id'].map(strip_formatting)

In [13]:
def _as_list(value):
    """Return a copy of `value` if it is a list, otherwise an empty list.

    Ids with no row in a link table come back from `Series.map` as NaN; this
    turns those into `[]` so every node ends up with a real list.
    """
    return list(value) if isinstance(value, list) else []


# For each node, collect the ids it links to according to the link tables.
companies['targets'] = companies['id'].map(companystream.groupby('company_id')['stream_id'].apply(list))
universities['targets'] = universities['id'].map(streamuniversity.groupby('university_id')['stream_id'].apply(list))
streams['target_companies'] = streams['id'].map(companystream.groupby('stream_id')['company_id'].apply(list))
streams['target_universities'] = streams['id'].map(streamuniversity.groupby('stream_id')['university_id'].apply(list))

# Concatenate the two lists row by row. Adding the two Series directly yields
# NaN whenever either side is missing, which would drop the targets of a
# stream that is linked only to companies or only to universities.
streams['targets'] = [
    _as_list(company_ids) + _as_list(university_ids)
    for company_ids, university_ids in zip(
        streams['target_companies'], streams['target_universities']
    )
]

for df in [companies, universities, streams]:
    df['targets'] = df['targets'].apply(_as_list)
nodelist = pd.concat([universities, streams, companies])

In [14]:
# Inspect the per-node target lists of the combined node table
# (rows are universities, then streams, then companies).
nodelist['targets']

0     [s3, s4, s6, s7, s25, s76, s11, s12, s13, s24,...
1                                  [s39, s41, s42, s43]
2     [s3, s25, s4, s7, s6, s51, s76, s11, s12, s53,...
3     [s3, s4, s6, s7, s51, s11, s12, s13, s25, s67,...
4     [s6, s82, s83, s4, s12, s25, s87, s337, s92, s...
                            ...                        
80                                                   []
81                                                   []
82                                                   []
83                                                   []
84                                                   []
Name: targets, Length: 208, dtype: object

In [15]:
def extend_targets(targets, nodes=None):
    """Return `targets` followed by the targets of each listed node (one extra hop).

    Parameters
    ----------
    targets : list or None
        Node ids to extend. The list is not modified.
    nodes : DataFrame, optional
        Table with an `id` column and a list-valued `targets` column.
        Defaults to the notebook-level `nodelist`.

    Returns
    -------
    list
        A new list: the original ids, then the second-hop ids in the order
        they were found. `None` input gives `[]`.

    Notes
    -----
    Only the ids originally present in `targets` are expanded. Appending to
    the list while iterating over it would keep following every further hop
    and never finish on a graph whose nodes point back at each other.
    """
    if targets is None:
        return []
    if nodes is None:
        nodes = nodelist

    # Build the id -> targets lookup once instead of scanning the frame for
    # every target. The first row wins for a repeated id.
    targets_by_id = {}
    for node_id, node_targets in zip(nodes['id'], nodes['targets']):
        targets_by_id.setdefault(node_id, node_targets)

    print(f'Extending targets: {targets}')
    extended = list(targets)
    for t in targets: # e.g. s1, s2
        if t in targets_by_id:
            second_hop = targets_by_id[t]
            if len(second_hop) > 0:
                print(f'Found extension for {t}: {second_hop}')
                extended.extend(second_hop)
            else:
                print(f'No extension for {t}')
    print(f'Extended targets: {extended}')
    print()
    return extended

In [16]:
# Don't extend targets, this takes forever to iterate over all hops
#nodelist['targets'] = [extend_targets(targets) for targets in nodelist['targets']]

# id -> row dict, used to validate edge targets and to look up their labels.
lookup = nodelist.set_index('id').to_dict('index')

# Create a new row for each item in the targets list:
edgelist = (
    nodelist.explode('targets')
    .filter(['id', 'label', 'targets'])
    .rename(columns={'targets': 'targetId', 'id': 'sourceId', 'label': 'source'})
)

# Keep only the node columns that are written out and used for plotting.
nodelist = nodelist.filter(['id', 'label', 'type','latitude','longitude', 'targets'])
nodelist.to_csv('data/nodelist.csv', index=False)

# Make sure the targetId is in nodelist['id']. Take an explicit copy so that
# adding the `target` column below modifies this frame and not a view of the
# unfiltered one.
edgelist = edgelist[edgelist['targetId'].isin(list(lookup))].copy()
edgelist['target'] = edgelist['targetId'].map(lambda node_id: lookup[node_id]['label'])
edgelist.to_csv('data/edgelist.csv', index=False)

In [17]:
# Display the trimmed node table (id, label, type, latitude, longitude, targets).
nodelist

Unnamed: 0,id,label,type,latitude,longitude,targets
0,u1,Bates College,university,44.105720,-70.202200,"[s3, s4, s6, s7, s25, s76, s11, s12, s13, s24,..."
1,u2,Beal College,university,44.786260,-68.785500,"[s39, s41, s42, s43]"
2,u3,Bowdoin College,university,43.907690,-69.964000,"[s3, s25, s4, s7, s6, s51, s76, s11, s12, s53,..."
3,u4,Colby College,university,44.563870,-69.662600,"[s3, s4, s6, s7, s51, s11, s12, s13, s25, s67,..."
4,u5,College of the Atlantic,university,44.395030,-68.221500,"[s6, s82, s83, s4, s12, s25, s87, s337, s92, s..."
...,...,...,...,...,...,...
80,c81,L A W Calibration,company,43.535788,-70.431473,[]
81,c82,ME Molecular Quality Controls,company,43.529808,-70.416901,[]
82,c83,Purist,company,43.437855,-70.455839,[]
83,c84,Spin Analytical,company,43.236886,-70.805468,[]


In [18]:
# Lay the nodes out in three vertical columns: universities at x = -1,
# companies at x = 1 and every other node type at x = 0. Within a column the
# nodes are stacked in table order, evenly spaced over a 0-100 range.
column_x = {'company': 1, 'university': -1, 'stream': 0}
spacing = {
    'company': 100/len(companies),
    'university': 100/len(universities),
    'stream': 100/len(streams),
}
placed = {'company': 0, 'university': 0, 'stream': 0}

x_locs = []
y_locs = []
for kind in nodelist['type']:
    # Anything that is neither a company nor a university goes in the middle column.
    column = kind if kind in ('company', 'university') else 'stream'
    x_locs.append(column_x[column])
    y_locs.append(placed[column]*spacing[column])
    placed[column] += 1

nodelist['x'] = x_locs
nodelist['y'] = y_locs
nodelist.to_csv('data/nodelist.csv', index=False)
# id -> row dict (now including x/y) for looking up edge endpoints.
nodelookup = nodelist.set_index('id').to_dict('index')

In [19]:
import plotly.graph_objects as go

In [20]:
# Walk the edge list once, collecting the line-segment coordinates for the
# edge trace and the set of distinct neighbours of every node.
edge_x = []
edge_y = []
neighbours = {}

for i, edge in edgelist.iterrows():
    source_id, target_id = edge['sourceId'], edge['targetId']
    x0, y0 = nodelookup[source_id]['x'], nodelookup[source_id]['y']
    x1, y1 = nodelookup[target_id]['x'], nodelookup[target_id]['y']
    # A None entry breaks the line so each edge is drawn as its own segment.
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])
    # Sets are used so that a link listed in both directions is counted once.
    neighbours.setdefault(source_id, set()).add(target_id)
    neighbours.setdefault(target_id, set()).add(source_id)

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5,color='#888'),
    hoverinfo='none',
    mode='lines')

# Number of distinct neighbours per node, in nodelist row order. This drives
# the marker colour (and therefore the colour bar) and the hover text.
node_degree = [len(neighbours.get(node_id, ())) for node_id in nodelist['id']]
node_text = [
    f'{label}: {degree} connections'
    for label, degree in zip(nodelist['label'], node_degree)
]

node_trace = go.Scatter(
    x=nodelist['x'],
    y=nodelist['y'],
    mode='markers',
    hoverinfo='text',
    text=node_text,
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        reversescale=True,
        color=node_degree,
        size=10,
        colorbar=dict(
            thickness=15,
            # `title.side` replaces the deprecated `titleside` attribute.
            title=dict(text='Node Connections', side='right'),
            xanchor='left'
        ),
        line_width=2))

go.Figure(
    data=[edge_trace, node_trace],
    layout=dict(showlegend=False, hovermode='closest'),
).show()