In [201]:
import numpy as np
import pandas as pd
import scipy
import math
import heapq

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [202]:
# Stream the tab-separated dump in fixed-size chunks and keep one randomly
# chosen row per chunk, so the full file never has to sit in memory.
chunksize = 320 #approximately 23644157 entries; wc -l bay_area.csv

# One seeded generator shared by every chunk: the sample is reproducible on
# re-run, yet the position picked inside each chunk still varies (a fixed
# integer seed per call would select the same offset in every chunk).
sample_seed = 42
sample_rng = np.random.RandomState(sample_seed)

sampled_rows = []
curr_chunk = 0
data = pd.read_csv('bay_area.csv', sep='\t', chunksize=chunksize)

for curr_chunk, chunk in enumerate(data, start=1):
    sampled_rows.append(chunk.sample(n=1, random_state=sample_rng))

    # log progress
    if curr_chunk % 10000 == 0:
        print(curr_chunk)

# Concatenate once at the end; growing a DataFrame with concat inside the
# loop copies all previously collected rows on every iteration.
df = pd.concat(sampled_rows)

# Replace the header names with positional integers so later cells can
# address fields by number.
df.columns = list(range(len(df.columns)))

# drop entries with poor data (columns 17 and 18 must both be present)
df = df.dropna(subset=[17, 18])
df

10000
20000
30000
40000
50000
60000
70000


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
319,9a198910-11dd-3641-8ff6-1830a9a03ac0,1981,2,-1,0.000000,-1,0.000000,San Mateo,United States,7,...,False,"livermore, ca",92,False,,False,,,,1540512888000
2161,e56c378b-2158-3292-ae36-e7836c748fe9,1969,0,-1,0.000000,-1,0.000000,San Francisco,United States,4,...,False,"beverly hills, costa mesa, boston",31,False,,False,,,,1540111954000
3794,7f2f6b28-8ee9-3ed1-a8a2-cf0ac6e92d6b,2001,2,-1,0.000000,-1,0.000000,San Francisco,United States,4,...,False,,611310,False,,False,,,,1534734981000
4646,783c7c66-b3c5-3411-8799-03d67552e797,1979,2,-1,0.000000,-1,0.000000,San Francisco,United States,5,...,False,"tampa/st. petersburg, florida area",,False,,False,,,,1532134613000
6047,38d07dd6-f611-3145-b363-0a4ef84a48ba,1981,2,-1,0.000000,-1,0.000000,,Indonesia,4,...,False,"greater jakarta area, indonesia",,False,,False,,,,1533121658000
6924,c1eeb54e-d256-351f-becc-d5901a58c862,1970,1,Administration,0.487604,Pharmaceutical,0.137303,,United Kingdom,5,...,True,"manchester, united kingdom",,False,,False,,,BBG000P5DRT5,1496781882000
8114,d2d9b5ad-965c-39b7-af38-2a05f5a501b1,1965,2,-1,0.000000,-1,0.000000,San Francisco,United States,4,...,False,,,False,,False,,,,1538850995000
8758,49fe9f08-244a-3e3d-9d8b-2fe2f59c5870,1987,0,-1,0.000000,-1,0.000000,Hyderabad,India,4,...,True,greater san diego area,334413,False,,False,,,BBG000CGC1X8,1538842674000
8963,e2c8c3fa-15c2-3adc-ab95-a1503741a8ea,1983,1,-1,0.000000,-1,0.000000,San Francisco,United States,0,...,False,bay area,56,False,,False,,,,1538952095000
9935,f9bc8941-bf95-354b-ba6f-88f426d3fda6,2001,1,-1,0.000000,-1,0.000000,San Francisco,United States,0,...,True,,561320,False,,False,,,BBG000BS5DR2,1538818315000


In [203]:
# Show a single sampled profile (the row at position 7) to see which value
# sits under each integer column label.
df.iloc[7]

0        49fe9f08-244a-3e3d-9d8b-2fe2f59c5870
1                                        1987
2                                           0
3                                          -1
4                                           0
5                                          -1
6                                           0
7                                   Hyderabad
8                                       India
9                                           4
10                                      False
11                                 2012-06-01
12                                       True
13                                 2014-02-01
14                                       True
15                                      False
16                                        610
17    Software Consultant,software,consultant
18                                 TECHNOLOGY
19                                   Qualcomm
20                                   Qualcomm
21                                

In [None]:
def heuristic(prof_x, prof_y):
    """Return a similarity score for two profiles; higher means more alike.

    Both profiles are indexed by integer field number (for example a row of
    the sampled frame). Fields read and the points awarded on a match:

    * 17 -- comma-separated job titles: first entry (+2) and last entry (+2)
    * 18 -- department, first comma-separated entry (+1)
    * 19 -- company (+3)
    * 3  -- primary industry (+1), ignored when unknown
    * 5  -- secondary industry (+0.5), ignored when unknown
    * 9  -- education level (+1), only when it is greater than 0
    * 10 -- compared only when the education levels matched (+1)

    An industry is "unknown" when it holds the sentinel -1. The sentinel is
    accepted both as the string '-1' and as the number -1, because a column
    read chunk by chunk can end up holding either form.

    Returns:
        The accumulated score (an int, or a float once the 0.5 secondary
        industry bonus has been added).
    """
    unknown_industry = (-1, '-1')
    score = 0

    # check if job titles (regular and normalized) and department are the same
    titles_x = prof_x[17].split(',')
    titles_y = prof_y[17].split(',')
    if titles_x[0] == titles_y[0]:
        score += 2
    if titles_x[-1] == titles_y[-1]:
        score += 2
    if prof_x[18].split(',')[0] == prof_y[18].split(',')[0]:
        score += 1

    # check if the companies are the same
    if prof_x[19] == prof_y[19]:
        score += 3

    # check if primary and secondary industries are the same; two profiles
    # that both carry the "unknown" sentinel must not count as a match
    if prof_x[3] not in unknown_industry and prof_x[3] == prof_y[3]:
        score += 1
    if prof_x[5] not in unknown_industry and prof_x[5] == prof_y[5]:
        score += 0.5

    # check education
    if prof_x[9] > 0 and prof_x[9] == prof_y[9]:
        score += 1
        if prof_x[10] == prof_y[10]:
            score += 1
    return score
    

In [None]:
# For every profile, keep the top_k_profiles highest-scoring other profiles.
#   profiles[id]      -> the (index label, row Series) pair for that profile
#   best_profiles[id] -> heap-ordered list of (score, other profile id)
best_profiles = {}
profiles = {}
top_k_profiles = 2

# Materialise the rows once: calling df.iterrows() again inside the inner
# loop would rebuild every row Series for each outer row.
all_rows = list(df.iterrows())

for row in all_rows:
    row_label, row_profile = row
    curr_matches = []
    for other_label, other_profile in all_rows:
        # never match a profile against itself
        if other_label == row_label:
            continue
        entry = (heuristic(row_profile, other_profile), other_profile[0])
        if len(curr_matches) < top_k_profiles:
            heapq.heappush(curr_matches, entry)
        else:
            # min-heap: push the new entry, then discard the current lowest
            heapq.heappushpop(curr_matches, entry)
    profile_id = row_profile[0]
    profiles[profile_id] = row
    best_profiles[profile_id] = curr_matches

In [None]:
# Display the mapping of profile id -> list of (score, matched profile id)
# tuples; each list is kept in heap order, not sorted by score.
best_profiles

In [199]:
# Print the row Series (second element of the stored pair) for two profiles.
for profile_id in ('3b0a8a0c-b49a-3ee8-b071-422554a3b9b9',
                   '846d43b8-3ea6-3c36-8fca-84539eab93d2'):
    print(profiles[profile_id][1])

0     3b0a8a0c-b49a-3ee8-b071-422554a3b9b9
1                                     1981
2                                        1
3                     Software Engineering
4                                 0.794306
5                   Electrical Engineering
6                                 0.132796
7                                 San Jose
8                            United States
9                                        5
10                                   False
11                              2014-07-01
12                                    True
13                              2017-06-01
14                                    True
15                                   False
16                                    1066
17     Software Engineer,software engineer
18                              TECHNOLOGY
19                       Intel Corporation
20                       Intel Corporation
21                                    INTC
22                                    XNAS
23         

In [149]:
# Print each profile that has at least one match, followed by its matches.
for profile_id, matches in best_profiles.items():
    if matches:
        print(profiles[profile_id])
        # Each match is a (score, matched profile id) tuple, while profiles is
        # keyed by the id alone, so unpack before looking it up.
        for _score, match_id in matches:
            print(profiles[match_id])

(20081, 0     f0c03235-e0e4-3b93-8d10-fad8de38f2f4
1                                     2001
2                                        2
3                          Web Development
4                                 0.977094
5                     Software Engineering
6                                0.0171799
7                                 San Jose
8                            United States
9                                        0
10                                   False
11                              2014-02-01
12                                    True
13                                    None
14                                   False
15                                    True
16                                    1803
17     Software Engineer,software engineer
18                              TECHNOLOGY
19                                   Apple
20                                   Apple
21                                    AAPL
22                                    XNAS
23 