In [4]:
import numpy as np
import pandas as pd
import scipy
import math
import heapq

from datetime import datetime

# Plotting tools
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
class Profile(object):
    def __init__(self):
        self.attributes = {'id': None, 'gender': None, 'skillset1': None, 'skillset2': None, 'job_title': [], 
                           'job_start_end': [], 'job_length': [], 'company': [], 'education': [], 'education_level': [], 'elite': []}

    def check_valid_profile(self):
        for a in self.attributes:
            if not self.attributes[a]:
                return False
        return True

    def update(self, entry):
        # education record
        if entry[26]:
            if entry[19] and entry[27] and entry[28]:
                self.attributes['education'].append(entry[19])
                self.attributes['education_level'].append(entry[27])
                self.attributes['elite'].append(entry[28])
        # employment record
        else:
            if entry[2]:
                self.attributes['gender'] = entry[2]
            if entry[3] != -1:
                self.attributes['skillset1'] = entry[3]
            if entry[5] != -1:
                self.attributes['skillset2'] = entry[5]
            if entry[17] and entry[19]:
                if entry[12] and entry[15]:
                    self.attributes['job_start_end'].append((entry[11], '2019-03-01'))
                    self.attributes['job_title'].append(entry[17])
                    self.attributes['company'].append(entry[19])     
                    self.attributes['job_length'].append('ongoing')
                if entry[12] and entry[14]:
                    self.attributes['job_start_end'].append((entry[11], entry[13]))
                    self.attributes['job_title'].append(entry[17])
                    self.attributes['company'].append(entry[19])
                    self.attributes['job_length'].append(entry[16])
        return

In [3]:
chunksize = 1 #approximately 23644157 entries; wc -l bay_area.csv
profiles = {}

curr_df = pd.DataFrame()
curr_entry = 0
curr_profile_id = None
curr_profile = Profile()
data=pd.read_csv('bay_area.csv', sep='\t', chunksize=chunksize)

# get first entry
for chunk in data:
    curr_person = chunk
    curr_profile_id = curr_person.iloc[0][0]
    curr_profile.attributes['id'] = curr_profile_id
    break

for chunk in data:
    next_entry = chunk.iloc[0]
    # if ids don't match create a new profile
    if next_entry[0] != curr_profile_id:
        # check valid profile to add
        if curr_profile.check_valid_profile():
            profiles[curr_profile_id] = curr_profile

        # create new profile
        curr_profile_id = next_entry[0]
        curr_profile = Profile()
        curr_profile.attributes['id'] = curr_profile_id

    # update current entry to existing profile
    curr_profile.update(next_entry)

    # log progress
    curr_entry += 1
    if curr_entry % 10000 == 0:
        print(curr_entry)

KeyboardInterrupt: 

In [None]:
# serialize processed entries
import pickle
with open('complete_entries.pickle', 'wb') as handle:
    pickle.dump(profiles, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
#open processed entries
import pickle
with open('complete_entries.pickle', 'rb') as handle:
    profiles = pickle.load(handle)

In [7]:
print(str(len(profiles)) + ' completed entries')

159450 completed entries


In [8]:
# see initial distribution skills
skillset_counts = {}
for idx, person in profiles.items():
    skillset = person.attributes['skillset1']
    if skillset in skillset_counts:
        skillset_counts[skillset] += 1
    else:
        skillset_counts[skillset] = 1
d = {'Skillset': list(skillset_counts.keys()), 'Counts': list(skillset_counts.values())}
skill_counts = pd.DataFrame(d)
skill_counts.sort_values(by=['Counts'], ascending=False)

Unnamed: 0,Skillset,Counts
1,Business Development,11037
4,Administration,9671
0,Non-Profit and Community,8849
18,Electrical Engineering,7734
16,CRM and Sales Management,7631
6,Data Analysis,7467
11,Banking and Finance,7400
7,Pharmaceutical,6734
8,Legal,6481
17,Web Development,6217


In [9]:
education_counts = {}
for idx, person in profiles.items():
    education = person.attributes['education']
    for e in education:
        if e in education_counts:
            education_counts[e] += 1
        else:
            education_counts[e] = 1
d = {'Education': list(education_counts.keys()), 'Counts': list(education_counts.values())}
education_counts = pd.DataFrame(d)
education_counts.sort_values(by=['Counts'], ascending=False)

Unnamed: 0,Education,Counts
0,"University of California, Berkeley",50337
10,Stanford University,32678
40,UC Berkeley,6763
54,University of Southern California,5969
8,Stanford University Graduate School of Business,4985
41,Berkeley College,4614
25,Massachusetts Institute of Technology,3942
6,"University of California, Berkeley, Haas Schoo...",3457
1,Harvard University,3449
26,New York University,3097


In [10]:
company_counts = {}
for idx, person in profiles.items():
    company = person.attributes['company']
    for c in company:
        if c in company_counts:
            company_counts[c] += 1
        else:
            company_counts[c] = 1
d = {'Company': list(company_counts.keys()), 'Counts': list(company_counts.values())}
company_counts = pd.DataFrame(d)
company_counts.sort_values(by=['Counts'], ascending=False)

Unnamed: 0,Company,Counts
5,TIME_OFF,17338
355,Stanford University,6755
60,Google,6746
346,UC Berkeley,6610
2129,Microsoft,2856
1079,Apple,2582
84,"University of California, Berkeley",2349
620,IBM,2186
4032,Facebook,1823
108,Kaiser Permanente,1799


In [11]:
job_title_counts = {}
for idx, person in profiles.items():
    job = person.attributes['job_title']
    for j in job:
        if j in job_title_counts:
            job_title_counts[j] += 1
        else:
            job_title_counts[j] = 1
d = {'Job Title': list(job_title_counts.keys()), 'Counts': list(job_title_counts.values())}
job_title_counts = pd.DataFrame(d)
job_title_counts.sort_values(by=['Counts'], ascending=False)

Unnamed: 0,Job Title,Counts
116,"Intern,intern",10200
34,"Research Assistant,research assistant",9089
417,"Software Engineer,software engineer",7170
490,"Consultant,consultant",5613
51,"Associate,associate",5255
1030,"Project Manager,project manager",3887
482,"President,president",3471
312,"Founder,founder",3039
494,"CEO,chief executive officer",2995
244,"Partner,partner",2991


In [12]:
test_profiles = []
for idx, p in profiles.items():
    person = p.attributes
    if ('University of California, Berkeley' in person['education'] or 'UC Berkeley' in person['education']) and ('Software Engineer,software engineer' in person['job_title'] or 'Intern,intern' in person['job_title'] or 'Senior Software Engineer,senior software engineer' in person['job_title'] or 'Software Engineering Intern,software,engineering,intern' in person['job_title'] or 'Project Manager,project manager' in person['job_title']):
        test_profiles.append(idx)
print(len(test_profiles))

5500


In [13]:
def heuristic(prof_1, prof_2, t):
    # longest common substring of jobs between the two profiles
    res = 0
    p1_jobs = prof_1.attributes['company']
    p2_jobs = prof_2.attributes['company']
    if p1_jobs[0] != p2_jobs[0]:
        return 0
    lcs = [[0 for j in range(len(p2_jobs) + 1)] for i in range(len(p1_jobs) + 1)]
    for i in range(len(p1_jobs) + 1):
        for j in range(len(p2_jobs) + 1):
            if i == 0 or j == 0:
                lcs[i][j] = 0
            else:
                p1_start = datetime.strptime(prof_1.attributes['job_start_end'][i-1][0], "%Y-%m-%d")
                p2_start = datetime.strptime(prof_2.attributes['job_start_end'][j-1][0], "%Y-%m-%d")
                p1_title = prof_1.attributes['job_title'][i-1]
                p2_title = prof_2.attributes['job_title'][j-1]
                diff = abs((p1_start - p2_start).days) / 365
                if p1_jobs[i-1] == p2_jobs[j-1] and p1_title == p2_title and diff >= t - 1 and diff < t + 1:
                    lcs[i][j] = lcs[i-1][j-1] + 1
                    res = max(res, lcs[i][j])
                else:
                    lcs[i][j] = 0
    return res
    
    
    

In [14]:
best_profiles = {}
counter = 1
#for t in range(2, 11, 2):
curr_best_profiles = {}
for i in range(len(test_profiles)):
    curr = []
    if counter % 50 == 0:
        print(counter)
    for j in range(len(test_profiles)):
        if i == j:
            continue
        prof_1 = profiles[test_profiles[i]]
        prof_2 = profiles[test_profiles[j]]
        h = heuristic(prof_1, prof_2, 2)
        if len(curr) <= 3:
            heapq.heappush(curr, (h, test_profiles[j], prof_2))
        else:
            heapq.heappushpop(curr, (h, test_profiles[j], prof_2))
    curr_best_profiles[test_profiles[i]] = curr
    counter += 1
#best_profiles[t] = curr_best_profiles

50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550
4600
4650
4700
4750
4800
4850
4900
4950
5000
5050
5100
5150
5200
5250
5300
5350
5400
5450
5500


In [27]:
curr_best_profiles

{'f0a33c77-0f2b-3674-8405-8f8d58841010': [(0,
   'ffd9d4dd-efad-3108-9665-5510f011d26e',
   <__main__.Profile at 0x13a8258d0>),
  (0,
   'ffeae9b0-7b5d-3402-8eaa-a73ea7e41c5b',
   <__main__.Profile at 0x13592a390>),
  (0,
   'ffec9c10-afec-3c88-9f3f-e93756b4742f',
   <__main__.Profile at 0x11b2d6ac8>),
  (0,
   'fff52775-ca69-376e-a5ed-28b465d96c99',
   <__main__.Profile at 0x13d387400>)],
 '14f4b656-e9c8-36f8-9abf-fa79eb4c106b': [(0,
   'ffd9d4dd-efad-3108-9665-5510f011d26e',
   <__main__.Profile at 0x13a8258d0>),
  (0,
   'ffeae9b0-7b5d-3402-8eaa-a73ea7e41c5b',
   <__main__.Profile at 0x13592a390>),
  (0,
   'fff52775-ca69-376e-a5ed-28b465d96c99',
   <__main__.Profile at 0x13d387400>),
  (0,
   'ffec9c10-afec-3c88-9f3f-e93756b4742f',
   <__main__.Profile at 0x11b2d6ac8>)],
 '2849ead1-7a99-38a4-8fbb-6a96747c8e91': [(0,
   'ffd9d4dd-efad-3108-9665-5510f011d26e',
   <__main__.Profile at 0x13a8258d0>),
  (0,
   'ffeae9b0-7b5d-3402-8eaa-a73ea7e41c5b',
   <__main__.Profile at 0x13592a390>)

In [15]:
entries = []
count = 0
for i, j in curr_best_profiles.items():
    for curr in j:
        if curr[0] > 1:
            print(curr[0])
            entries.append((str(i), curr[1]))
            break
for e in entries:
    print(e)

3
2
3
2
('45ad96f6-255d-36db-bfdd-700378a9f2ab', '8036791a-75d6-356f-b3a6-7e79441e3096')
('7bbe3efb-bb7e-352e-98e2-b0aa3325f1b6', '807165a1-25c8-3fa8-b9d4-55cc21977f78')
('8036791a-75d6-356f-b3a6-7e79441e3096', '45ad96f6-255d-36db-bfdd-700378a9f2ab')
('807165a1-25c8-3fa8-b9d4-55cc21977f78', '7bbe3efb-bb7e-352e-98e2-b0aa3325f1b6')


In [16]:
def compare_profiles(profile_1, profile_2):
    d = {profile_1: profiles[profile_1].attributes, profile_2: profiles[profile_2].attributes}
    df = pd.DataFrame(d)
    return df

In [31]:
profiles['807165a1-25c8-3fa8-b9d4-55cc21977f78'].attributes

{'company': ['Geoworks',
  'Global PC',
  'Ars Digita',
  'Musicmatch/Yahoo',
  'Google',
  'Google',
  'Google'],
 'education': ['University of California, Berkeley'],
 'education_level': [4],
 'elite': [True],
 'gender': 2,
 'id': '807165a1-25c8-3fa8-b9d4-55cc21977f78',
 'job_length': [1277, 151, 427, 1553, 638, 823, 1642],
 'job_start_end': [('1995-01-01', '1998-07-01'),
  ('1998-11-01', '1999-04-01'),
  ('1999-04-01', '2000-06-01'),
  ('2000-10-01', '2005-01-01'),
  ('2006-01-01', '2007-10-01'),
  ('2007-10-01', '2010-01-01'),
  ('2010-01-01', '2014-07-01')],
 'job_title': ['Software Engineer,software engineer',
  'Software Engineer,software engineer',
  'Software Engineer,software engineer',
  'Senior Software Engineer,senior software engineer',
  'Senior Software Engineer,senior software engineer',
  'Senior Software Engineer,senior software engineer',
  'Senior Software Engineer,senior software engineer'],
 'skillset1': 'Software Engineering',
 'skillset2': 'Digital Marketing'}

In [19]:
pd.set_option('display.max_colwidth', -1)
compare_profiles('7bbe3efb-bb7e-352e-98e2-b0aa3325f1b6', '807165a1-25c8-3fa8-b9d4-55cc21977f78')

Unnamed: 0,7bbe3efb-bb7e-352e-98e2-b0aa3325f1b6,807165a1-25c8-3fa8-b9d4-55cc21977f78
company,"[Geoworks, Global PC, Ars Digita, Musicmatch/Yahoo, Google, Google, Google]","[Geoworks, Global PC, Ars Digita, Musicmatch/Yahoo, Google, Google, Google]"
education,"[University of California, Berkeley]","[University of California, Berkeley]"
education_level,[4],[4]
elite,[True],[True]
gender,2,2
id,7bbe3efb-bb7e-352e-98e2-b0aa3325f1b6,807165a1-25c8-3fa8-b9d4-55cc21977f78
job_length,"[1277, 151, 427, 1553, 638, 823, 1642]","[1277, 151, 427, 1553, 638, 823, 1642]"
job_start_end,"[(1995-01-01, 1998-07-01), (1998-11-01, 1999-04-01), (1999-04-01, 2000-06-01), (2000-10-01, 2005-01-01), (2006-01-01, 2007-10-01), (2007-10-01, 2010-01-01), (2010-01-01, 2014-07-01)]","[(1995-01-01, 1998-07-01), (1998-11-01, 1999-04-01), (1999-04-01, 2000-06-01), (2000-10-01, 2005-01-01), (2006-01-01, 2007-10-01), (2007-10-01, 2010-01-01), (2010-01-01, 2014-07-01)]"
job_title,"[Software Engineer,software engineer, Software Engineer,software engineer, Software Engineer,software engineer, Senior Software Engineer,senior software engineer, Senior Software Engineer,senior software engineer, Senior Software Engineer,senior software engineer, Senior Software Engineer,senior software engineer]","[Software Engineer,software engineer, Software Engineer,software engineer, Software Engineer,software engineer, Senior Software Engineer,senior software engineer, Senior Software Engineer,senior software engineer, Senior Software Engineer,senior software engineer, Senior Software Engineer,senior software engineer]"
skillset1,Software Engineering,Software Engineering


In [46]:
profiles['ab1599bd-1245-31b7-b9b1-2da5fb58ed96'].attributes

{'company': ['Lawrence Berkeley National Laboratory',
  'UC Berkeley',
  'UC Berkeley',
  'LinkedIn',
  'C3 IoT'],
 'education': ['UC Berkeley'],
 'education_level': [4],
 'elite': [True],
 'gender': 2,
 'id': 'ab1599bd-1245-31b7-b9b1-2da5fb58ed96',
 'job_length': [120, 365, 120, 61, 'ongoing'],
 'job_start_end': [('2013-01-01', '2013-05-01'),
  ('2013-05-01', '2014-05-01'),
  ('2014-01-01', '2014-05-01'),
  ('2014-06-01', '2014-08-01'),
  ('2015-02-01', '2019-03-01')],
 'job_title': ['Research Assistant,research assistant',
  'Research Assistant,research assistant',
  'Course Reader',
  'Software Engineer Intern,software engineer,intern',
  'Software Engineer,software engineer'],
 'skillset1': 'Software Engineering',
 'skillset2': 'Data Analysis'}

In [252]:
profiles['5d9fa74f-214c-3e58-b76f-438378f96168'].attributes

{'company': ['UC Berkeley', 'Microsoft', 'Sift Science', 'Google'],
 'education': ['UC Berkeley', 'UC Berkeley'],
 'education_level': [4, 4],
 'elite': [True, True],
 'gender': 2,
 'id': '5d9fa74f-214c-3e58-b76f-438378f96168',
 'job_length': [2465, 92, 92, 'ongoing'],
 'job_start_end': [('2011-08-01', '2018-05-01'),
  ('2013-05-01', '2013-08-01'),
  ('2016-06-01', '2016-09-01'),
  ('2018-07-01', '2019-03-01')],
 'job_title': ['Graduate Student Researcher,graduate student,researcher',
  'Research Intern,research,intern',
  'Technical Intern,technical,intern',
  'Software Engineer,software engineer'],
 'skillset1': 'Software Engineering',
 'skillset2': 'Electrical Engineering'}

In [190]:
profiles['29951db6-74b5-3bdd-8c8e-4c501782ac4e'].attributes

{'company': ['UC Berkeley School of Law',
  'UC Berkeley',
  'Hewlett-Packard',
  'Apple Inc.',
  'Amazon',
  'Google',
  'Smartcar, Inc.',
  'Driver',
  'TIME_OFF'],
 'education': ['University of California, Berkeley'],
 'education_level': [4],
 'elite': [True],
 'gender': 2,
 'id': '29951db6-74b5-3bdd-8c8e-4c501782ac4e',
 'job_length': [122, 122, 214, 92, 274, 883, 183, 214, 'ongoing'],
 'job_start_end': [('2009-08-01', '2009-12-01'),
  ('2009-08-01', '2009-12-01'),
  ('2010-05-01', '2010-12-01'),
  ('2011-05-01', '2011-08-01'),
  ('2012-07-01', '2013-04-01'),
  ('2013-05-01', '2015-10-01'),
  ('2015-10-01', '2016-04-01'),
  ('2016-05-01', '2016-12-01'),
  ('2016-12-01', '2019-03-01')],
 'job_title': ['Information Technologies',
  'Lab Assistant,laboratory assistant',
  'Firmware Intern,intern',
  'Test Engineering and Triage Intern,test,engineering,intern',
  'Software Developer Engineer,software developer,engineer',
  'Software Engineer,software engineer',
  'Software Engineer,soft

In [216]:
heuristic(profiles['245d8f01-3817-38b5-9513-cc0ab59c5790'], profiles['245d8f01-3817-38b5-9513-cc0ab59c5790'], 0)

7

In [158]:
best_profiles = {}
for t in range(2, 11, 2):
    matches = []
    for i in range(len(test_profiles)):
        idx = test_profiles[i]
        ref_idx = profiles[idx].attributes['company'].index('Google')
        ref_start = datetime.strptime(profiles[idx].attributes['job_start_end'][ref_idx][0], "%Y-%m-%d")
        ref_len = profiles[idx].attributes['job_length'][ref_idx]
        for j in range(i+1, len(test_profiles)):
            curr = test_profiles[j]
            curr_idx = profiles[curr].attributes['company'].index('Google')
            curr_start = datetime.strptime(profiles[curr].attributes['job_start_end'][curr_idx][0], "%Y-%m-%d")
            curr_len = profiles[curr].attributes['job_length'][curr_idx]
            diff = (ref_start - curr_start).days / 365
            if diff < 0:
                first = idx
                second = curr
                diff = abs(diff)
            else:
                first = curr
                second = idx
            if diff >= t and diff < t+0.5:
                matches.append((first, second))
    best_profiles[t] = matches
for i in best_profiles:
    print(len(best_profiles[i]))
print(best_profiles[2][5])
profiles['245d8f01-3817-38b5-9513-cc0ab59c5790'].attributes
        

1861
1178
692
528
342
('245d8f01-3817-38b5-9513-cc0ab59c5790', '7843e331-c96d-38cd-a538-e9799f2cea17')


{'company': ['Lockheed Martin',
  'Lockheed Martin',
  'Lockheed Martin',
  'Bignoggins Productions',
  'Lockheed Martin',
  'Quest Visual, Inc.',
  'Google'],
 'education': ['University of California, Berkeley', 'Stanford University'],
 'education_level': [4, 4],
 'elite': [True, True],
 'gender': 2,
 'id': '245d8f01-3817-38b5-9513-cc0ab59c5790',
 'job_length': [304, 457, 1157, 1096, 517, 791, 'ongoing'],
 'job_start_end': [('2005-07-01', '2006-05-01'),
  ('2006-05-01', '2007-08-01'),
  ('2007-08-01', '2010-10-01'),
  ('2010-06-01', '2013-06-01'),
  ('2010-10-01', '2012-03-01'),
  ('2012-03-01', '2014-05-01'),
  ('2014-05-01', '2019-03-01')],
 'job_title': ['Systems Engineer Associate,systems engineer,",",associate',
  'Research Engineer Associate,research engineer,",",associate',
  'Software Engineer,software engineer',
  'Lead Android Developer,android developer,lead',
  'Software Engineer Senior,software engineer,senior',
  'Programmer,programmer',
  'Software Engineer,software eng

In [96]:
((datetime.strptime('2019-03-01', "%Y-%m-%d")- datetime.strptime('2018-03-01', "%Y-%m-%d")).days) / 365

1.0

In [53]:
profiles[test_profiles[12]].attributes

{'company': ['California State Summer School for Mathematics and Science',
  'Self Employed',
  'WD, a Western Digital company',
  'Dell',
  'Dell',
  'Google'],
 'education': ['UC Berkeley'],
 'education_level': [4],
 'elite': [True],
 'gender': 2,
 'id': '5bf8844f-aa65-34b8-9c73-3793b2011b57',
 'job_length': [31, 700, 61, 92, 397, 'ongoing'],
 'job_start_end': [('2010-07-01', '2010-08-01'),
  ('2010-09-01', '2012-08-01'),
  ('2013-06-01', '2013-08-01'),
  ('2014-05-01', '2014-08-01'),
  ('2015-08-01', '2016-09-01'),
  ('2016-10-01', '2019-03-01')],
 'job_title': ['Student/Researcher',
  'Private Tutor,private,tutor',
  'Software Engineer Intern,software engineer,intern',
  'Software Developer and Test Engineer Intern,software developer,test engineer,intern',
  'Software Development Engineer,development engineer,software',
  'Software Engineer,software engineer'],
 'skillset1': 'Web Development',
 'skillset2': 'Software Engineering'}

In [20]:
import itertools
pairs = itertools.combinations(list(profiles.keys()), 2)

In [5]:
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [31]:
import json
learner = ActiveLearner(
    estimator=RandomForestClassifier(),
    query_strategy=uncertainty_sampling
)
for i in pairs:
    break
print(str(profiles[i[0]].attributes))
learner.teach([str((profiles[i[0]].attributes, profiles[i[1]].attributes))], [0])

{'id': '3243dc62-9392-3e2f-b80a-d645202a4095', 'gender': 1, 'skillset1': 'Non-Profit and Community', 'skillset2': 'Social Media and Communications', 'job_title': ['Social Media and Public Relations Administrator,public relations,media,administrator', 'Graduate Student Assistant,graduate student,student assistant', 'Graduate Teaching Assistant,teaching assistant,graduate'], 'job_start_end': [('2011-07-01', '2012-04-01'), ('2013-04-01', '2014-05-01'), ('2015-10-01', '2019-03-01')], 'job_length': [275, 395, 'ongoing'], 'company': ['MAIYA Gallery', 'Reischauer Institute of Japanese Studies', 'UCLA'], 'education': ['University of California, Berkeley', 'Harvard University'], 'education_level': [4, 5], 'elite': [True, True]}


ValueError: could not convert string to float: '({\'id\': \'3243dc62-9392-3e2f-b80a-d645202a4095\', \'gender\': 1, \'skillset1\': \'Non-Profit and Community\', \'skillset2\': \'Social Media and Communications\', \'job_title\': [\'Social Media and Public Relations Administrator,public relations,media,administrator\', \'Graduate Student Assistant,graduate student,student assistant\', \'Graduate Teaching Assistant,teaching assistant,graduate\'], \'job_start_end\': [(\'2011-07-01\', \'2012-04-01\'), (\'2013-04-01\', \'2014-05-01\'), (\'2015-10-01\', \'2019-03-01\')], \'job_length\': [275, 395, \'ongoing\'], \'company\': [\'MAIYA Gallery\', \'Reischauer Institute of Japanese Studies\', \'UCLA\'], \'education\': [\'University of California, Berkeley\', \'Harvard University\'], \'education_level\': [4, 5], \'elite\': [True, True]}, {\'id\': \'8509d39a-2942-3882-ac59-d292e661aeb4\', \'gender\': 1, \'skillset1\': \'Human Resources (Junior)\', \'skillset2\': \'Graphic Design\', \'job_title\': [\'Undergraduate Research Assistant,research assistant\', \'Production Assistant,production assistant\', \'Omni Merchant Intern Digital Team,merchant,intern,","\', \'Peer Advisor: Brown Office of International Programs,advisor,office\', \'Research Analyst,research analyst\'], \'job_start_end\': [(\'2014-03-01\', \'2014-06-01\'), (\'2014-06-01\', \'2015-08-01\'), (\'2016-06-01\', \'2016-07-01\'), (\'2016-09-01\', \'2017-06-01\'), (\'2017-08-01\', \'2019-03-01\')], \'job_length\': [92, 426, 30, 273, \'ongoing\'], \'company\': [\'Northwestern University Department of History\', \'San Francisco Opera\', "Bloomingdale\'s", \'Brown University\', \'The Research Board\'], \'education\': [\'Northwestern University\', \'Brown University\'], \'education_level\': [4, 4], \'elite\': [True, True]})'

In [203]:
# see one entry of data
df.iloc[7]

0        49fe9f08-244a-3e3d-9d8b-2fe2f59c5870
1                                        1987
2                                           0
3                                          -1
4                                           0
5                                          -1
6                                           0
7                                   Hyderabad
8                                       India
9                                           4
10                                      False
11                                 2012-06-01
12                                       True
13                                 2014-02-01
14                                       True
15                                      False
16                                        610
17    Software Consultant,software,consultant
18                                 TECHNOLOGY
19                                   Qualcomm
20                                   Qualcomm
21                                

In [None]:
import itertools
pairs = list(itertools.combinations(list(profiles.keys()), 2))
print(len(pairs))

In [204]:
def heuristic(prof_x, prof_y):
    score = 0
    # check if job titles (regular and normalized) and departement are the same
    if prof_x[17].split(',')[0] == prof_y[17].split(',')[0]:
        score += 2
    if prof_x[17].split(',')[-1] == prof_y[17].split(',')[-1]:
        score += 2
    if prof_x[18].split(',')[0] == prof_y[18].split(',')[0]:
        score += 1
    # check if company's are the same
    if prof_x[19] == prof_y[19]:
        score += 3
    # check if primary and secondary industry's are the same
    if prof_x[3] != '-1' and prof_x[3] == prof_y[3]:
        score += 1
    if prof_x[5] != '-1' and prof_x[5] == prof_y[5]:
        score += 0.5
    # check education
    if prof_x[9] > 0 and prof_x[9] == prof_y[9]:
        score += 1
        if prof_x[10] == prof_y[10]:
            score += 1
    return score
    

In [205]:
best_profiles = {}
profiles = {}
top_k_profiles = 2
for row in df.iterrows():
    curr_matches = []
    for curr_row in df.iterrows():
        if curr_row[0] != row[0]:
            curr_score = heuristic(row[1], curr_row[1])
            if len(curr_matches) < top_k_profiles:
                curr_id = curr_row[1][0]
                heapq.heappush(curr_matches, (curr_score, curr_id))
            else:
                curr_id = curr_row[1][0]
                heapq.heappushpop(curr_matches, (curr_score, curr_id))
    profile_id = row[1][0]
    profiles[profile_id] = row
    best_profiles[profile_id] = curr_matches

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/ericgan/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-205-547e8828b3d6>", line 8, in <module>
    curr_score = heuristic(row[1], curr_row[1])
  File "<ipython-input-204-807184f18f68>", line 14, in heuristic
    if prof_x[3] != '-1' and prof_x[3] == prof_y[3]:
  File "/Users/ericgan/anaconda/lib/python3.6/site-packages/pandas/core/series.py", line 868, in __getitem__
    result = self.index.get_value(self, key)
  File "/Users/ericgan/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 4371, in get_value
    k = self._convert_scalar_indexer(k, kind='getitem')
  File "/Users/ericgan/anaconda/lib/python3.6/site-packages/pandas/core/indexes/numeric.py", line 211, in _convert_scalar_indexer
    ._convert_scalar_indexer(key, kind=kind))
  File "/Users/ericgan/anaconda/lib/python3.6/site-packages/pand

KeyboardInterrupt: 

In [208]:
best_profiles

{'9a198910-11dd-3641-8ff6-1830a9a03ac0': [(5,
   '56ad2c71-6739-3115-99fc-764f5403ff5a'),
  (5, '746ff61a-3d2c-303f-b761-fed153eda5e4')],
 'e56c378b-2158-3292-ae36-e7836c748fe9': [(5,
   '7c58b12b-b6c0-3cda-9d0f-cb56e12006f7'),
  (5, 'e02ae028-1a4d-3dd8-a70d-e70bf8c3e6f0')],
 '7f2f6b28-8ee9-3ed1-a8a2-cf0ac6e92d6b': [(5,
   'e335e017-d9db-3760-bc33-e8237ca4806f'),
  (5, 'f9df40ad-4477-3b68-b238-4dc4bd55e049')],
 '783c7c66-b3c5-3411-8799-03d67552e797': [(5,
   'f45c447f-c22c-3795-aaac-47b9a1b69feb'),
  (5, 'f6d0f5d7-2281-3594-bdd5-04ce9554fd04')],
 '38d07dd6-f611-3145-b363-0a4ef84a48ba': [(4,
   'f703fdc4-e303-33d3-b207-0db51eeea4a1'),
  (5, '61895f88-f1d3-3f04-9f1b-1064479b94e5')],
 'c1eeb54e-d256-351f-becc-d5901a58c862': [(3.5,
   'f91ea749-17f9-3502-9334-2bc35fe6c2dd'),
  (4, '0a08cf84-690f-398d-bc2f-50437e6f9fe9')],
 'd2d9b5ad-965c-39b7-af38-2a05f5a501b1': [(7,
   'f5f6a6fa-5961-3f86-9490-54cd24d9f906'),
  (7, 'f84e01c5-9ee1-3e6c-9a85-032e93f0caf6')],
 '49fe9f08-244a-3e3d-9d8b-2fe2f5

In [209]:
print(profiles['a28f29d4-23a2-3f38-a841-a0fe27206858'][1])
print(profiles['6980e52e-c7aa-3773-9cea-e66d2f90ff09'][1])

0      a28f29d4-23a2-3f38-a841-a0fe27206858
1                                      1982
2                                         2
3                           Web Development
4                                  0.435665
5                      Business Development
6                                  0.384697
7                                Menlo Park
8                             United States
9                                         7
10                                     True
11                               2016-09-01
12                                     True
13                                     None
14                                    False
15                                     True
16                                      860
17    Research Assistant,research assistant
18                               TECHNOLOGY
19                      Stanford University
20                      stanford university
21                               UNIVERSITY
22                            UN

In [149]:
for i in best_profiles:
    if len(best_profiles[i]) > 0:
        print(profiles[i])
        for j in best_profiles[i]:
            print(profiles[j])

(20081, 0     f0c03235-e0e4-3b93-8d10-fad8de38f2f4
1                                     2001
2                                        2
3                          Web Development
4                                 0.977094
5                     Software Engineering
6                                0.0171799
7                                 San Jose
8                            United States
9                                        0
10                                   False
11                              2014-02-01
12                                    True
13                                    None
14                                   False
15                                    True
16                                    1803
17     Software Engineer,software engineer
18                              TECHNOLOGY
19                                   Apple
20                                   Apple
21                                    AAPL
22                                    XNAS
23 