<h3> Reading provided dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('./Employees.csv', names=['id', 'name', 'email', 'manager_id', 'project', 'role', 'interests_list'])
df.set_index('id', inplace=True)

In [3]:
df.head()

Unnamed: 0_level_0,name,email,manager_id,project,role,interests_list
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Piper Dingledine,Piper.Dingledine@fakegame.com,7,Fake mobile game,Sales,"['Sales tactic', 'Storytelling', 'Relation bui..."
2,Dennis Cardenas,Dennis.Cardenas@fakegame.com,7,Fake mobile game,Director,"['Strategic Thinking', 'Communication', 'Decis..."
3,William Mooney,William.Mooney@fakegame.com,7,Fake mobile game,Programmer,"['Perseverance', 'Self-Motivation', 'Independe..."
4,Ashley Rifkin,Ashley.Rifkin@fakegame.com,7,Fake mobile game,Sales,"['Sales tactic', 'Storytelling', 'Relation bui..."
5,Lisa Yruegas,Lisa.Yruegas@fakegame.com,7,Fake mobile game,Director,"['Strategic Thinking', 'Communication', 'Decis..."


<h3> Generate size of a node </h3>
    
Considering number of people managed by this person, directly and indirectly

In [4]:
min_dot_radius, max_dot_radius = 5, 15

# Recursively find how many people are directly managed by a person (i.e. how many have this person as a manager)
df['directly_managed_people'] = pd.Series([0] * df.shape[0], dtype='int32', index=df.index)
def count_directly_managed_recurs(manager_id):
    managed_people = df[df.manager_id == manager_id].index.tolist()
    df.loc[manager_id, 'directly_managed_people'] += len(managed_people)
    for managed_id in managed_people:
        count_directly_managed_recurs(managed_id)
        
for top_manager in df[df.manager_id == 0].index:
    count_directly_managed_recurs(top_manager)

# Recursively find how many people are managed by a person in total 
# (i.e. how many are managed directly + how many are managed by each managed employee and so on)
df['total_managed_people'] = pd.Series([0] * df.shape[0], dtype='int32', index=df.index)
def count_total_managed_recurs(person_id):
    manager_id = df.loc[person_id].manager_id
    if manager_id == 0: return
    df.loc[manager_id, 'total_managed_people'] += 1
    count_total_managed_recurs(manager_id)  

for person_id in df.index:
    count_total_managed_recurs(person_id)
    
# Assign radius to each node, based on level of management (CEO is biggest, common employees are smallest)
df['dot_radius'] = min_dot_radius + (max_dot_radius - min_dot_radius) * df.total_managed_people.rank(method='dense', pct=True)

<h3>Artificially connect people (simulating their social interactions)</h3>
<ol> 
    <li>For each person randomly select how many connections they'll have
    <li>For each person randomly select how many connactions are inside their project, and how many are with people from other projects
    <li>For each person generate lists of people to whom they'll connect
    <li>Conver lists into tabular data, i.e. set of pairs: id_1, id_2 of connections
    <li>Artificially generate social interaction statistics (frequency and size of interactions)
</ol>

In [5]:
connections_distributions = np.clip(np.random.normal(3.0, 1.0, df.shape[0]), a_min=1.0, a_max = 100.0)
connections_counts = pd.Series(connections_distributions, index=df.index).astype('int32')
connections_counts.value_counts()

2    35
3    33
1    18
4     9
5     5
dtype: int64

In [6]:
# connections within project colleagues
internal_connections_count = np.ceil(np.clip(np.random.normal(0.85, 0.15, df.shape[0]), a_min=0.0, a_max=1.0) * connections_counts).astype('int32')
# connections with people from other projects / offices
external_connections_count = connections_counts - internal_connections_count

# Generating tabular representation of connections, i.e. set of pairs like: id_1, id_2
# using internal and external number of connections

chats_dict = {
    'id_1':[], 'id_2':[]
}

for person_id, count in internal_connections_count.iteritems():
    manager_id = df.loc[person_id, 'manager_id']
    project = df.loc[person_id, 'project']
    same_man_ids = df.query(f"manager_id == {manager_id} or project == '{project}'").index.tolist()
    if manager_id > 0:
        same_man_ids.append(manager_id)
    others_ids = np.random.choice(same_man_ids, count)
    others_ids = np.unique(others_ids)
    for other_id in others_ids:
        if other_id == person_id: continue
        if (other_id, person_id) in zip(chats_dict['id_1'], chats_dict['id_2']): continue
        chats_dict['id_1'].append(person_id)
        chats_dict['id_2'].append(other_id)
        
for person_id, count in external_connections_count.iteritems():
    manager_id = df.loc[person_id, 'manager_id']
    project = df.loc[person_id, 'project']
    external_people = df.query(f"manager_id != {manager_id} or project != '{project}'").index.tolist()
    others_ids = np.random.choice(external_people, count)
    others_ids = np.unique(others_ids)
    for other_id in others_ids:
        if other_id == person_id: continue
        if (other_id, person_id) in zip(chats_dict['id_1'], chats_dict['id_2']): continue
        chats_dict['id_1'].append(person_id)
        chats_dict['id_2'].append(other_id)

chats_df = pd.DataFrame(chats_dict)
chats_df.drop_duplicates(inplace=True)
chats_df.reset_index(drop=True, inplace=True)   

In [7]:
# Generate aftificial social activities in each generated pair

# Number of days in last 2 months when there were messages echanged
chats_df['days_interacted_last_2_months'] = np.ceil(np.clip(np.random.normal(10, 5, chats_df.shape[0]), a_min=0, a_max=60)).astype('int32')

# Number of messages total between employees
chats_df['total_messages'] = np.ceil(np.clip(np.random.normal(50, 60, chats_df.shape[0]), a_min=0.0, a_max=10000.0)).astype('int32')
chats_df.loc[chats_df['total_messages'] < chats_df['days_interacted_last_2_months'], 'total_messages'] = chats_df['days_interacted_last_2_months']

chats_df.head()

Unnamed: 0,id_1,id_2,days_interacted_last_2_months,total_messages
0,1,4,6,73
1,1,6,5,56
2,2,6,9,41
3,3,1,9,9
4,3,9,20,20


<h3>Calculate metric of employees mutual interaction</h3>
\begin{align}
Metric(A, B) & = clamp_{[0;1]}(\frac{days\_of\_interaction}{60} + \frac{messages\_total}{300})\\
Metric(A, B) & \in [0;1] \\
\end{align}
From it generate width of edges in the graph

In [8]:
min_line_width, max_line_width = 1, 25

chats_df['connection_metric'] = np.minimum(chats_df['days_interacted_last_2_months'] / 60 + chats_df['total_messages'] / 300, 1.0)

chats_df['line_width'] = np.ceil(min_line_width + chats_df['connection_metric'] * (max_line_width - min_line_width)).astype('int32')

# every manager should have special lines with managed employees
chats_df['is_manager_line'] = (chats_df['id_1'] == df.loc[chats_df['id_2'], 'manager_id'].reset_index().manager_id) | (chats_df['id_2'] == df.loc[chats_df['id_1'], 'manager_id'].reset_index().manager_id)

chats_df.head()

Unnamed: 0,id_1,id_2,days_interacted_last_2_months,total_messages,connection_metric,line_width,is_manager_line
0,1,4,6,73,0.343333,10,False
1,1,6,5,56,0.27,8,False
2,2,6,9,41,0.286667,8,False
3,3,1,9,9,0.18,6,False
4,3,9,20,20,0.4,11,False


<h3>Generating lists of employees skills</h3>
For futher filtering on the graph

In [9]:
all_skills = ['C++', 'C#', 'JS', 'React', 'Angular', 'Android', 'IOS', 'PHP', 
              'MySQL', 'PostgreSQL', 'Oracle', 'Java', 'Docker', 'Kubernetes', 
              'CUDA', 'Python', 'NodeJS', 'TypeScript', 'AWS', 'Cisco']
skills_counts = np.ceil(np.clip(np.random.normal(2, 1, df.shape[0]), a_min=1, a_max=5)).astype('int32')
people_skills = pd.Series([np.random.choice(all_skills, cnt, replace=False) for cnt in skills_counts], index=df.index)
df['skills_list'] = people_skills

<h3>Preview generated data, saving in CSV</h3>

In [10]:
df.head()

Unnamed: 0_level_0,name,email,manager_id,project,role,interests_list,directly_managed_people,total_managed_people,dot_radius,skills_list
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Piper Dingledine,Piper.Dingledine@fakegame.com,7,Fake mobile game,Sales,"['Sales tactic', 'Storytelling', 'Relation bui...",0,0,5.909091,"[PHP, NodeJS, TypeScript]"
2,Dennis Cardenas,Dennis.Cardenas@fakegame.com,7,Fake mobile game,Director,"['Strategic Thinking', 'Communication', 'Decis...",0,0,5.909091,"[MySQL, Cisco]"
3,William Mooney,William.Mooney@fakegame.com,7,Fake mobile game,Programmer,"['Perseverance', 'Self-Motivation', 'Independe...",0,0,5.909091,"[Cisco, C++]"
4,Ashley Rifkin,Ashley.Rifkin@fakegame.com,7,Fake mobile game,Sales,"['Sales tactic', 'Storytelling', 'Relation bui...",0,0,5.909091,"[TypeScript, MySQL, Java]"
5,Lisa Yruegas,Lisa.Yruegas@fakegame.com,7,Fake mobile game,Director,"['Strategic Thinking', 'Communication', 'Decis...",0,0,5.909091,"[Kubernetes, React, Android]"


In [11]:
chats_df.head()

Unnamed: 0,id_1,id_2,days_interacted_last_2_months,total_messages,connection_metric,line_width,is_manager_line
0,1,4,6,73,0.343333,10,False
1,1,6,5,56,0.27,8,False
2,2,6,9,41,0.286667,8,False
3,3,1,9,9,0.18,6,False
4,3,9,20,20,0.4,11,False


In [12]:
df.to_csv('./updated_employees.csv')
chats_df.to_csv('./employees_connections.csv')