In [6]:
import pandas as pd
import tiktoken
import re

# from utils.embeddings_utils import get_embedding

embedding_model = "text-embedding-3-small"
embedding_encoding = "cl100k_base"
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

In [2]:
individuals = [
    {
        "nome": "John Doe",
        "area": "Software Engineering",
        "minibio": "Passionate software engineer with 5+ years of experience in developing scalable web applications. Proficient in Python, JavaScript, and React.",
        "linkedin": "https://www.linkedin.com/in/johndoe"
    },
    {
        "nome": "Jane Smith",
        "area": "Data Science",
        "minibio": "Data scientist with expertise in machine learning algorithms and predictive analytics. Experienced in Python, R, and TensorFlow.",
        "linkedin": "https://www.linkedin.com/in/janesmith"
    },
    {
        "nome": "Michael Johnson",
        "area": "Finance",
        "minibio": "Finance professional with a background in investment banking. Skilled in financial modeling, risk management, and corporate finance.",
        "linkedin": "https://www.linkedin.com/in/michaeljohnson"
    },
    {
        "nome": "Emily Wang",
        "area": "Marketing",
        "minibio": "Creative marketing specialist with a focus on digital marketing strategies and social media management. Experienced in SEO, SEM, and content creation.",
        "linkedin": "https://www.linkedin.com/in/emilywang"
    }
]


## Preprocessing data

In [10]:
# Drop unwanted columns
dataset = pd.read_excel("data/hack_people.xlsx")
# dataset = pd.DataFrame(individuals)

cols = ["nome", "area", "minibio", "linkedin"]
# Ensure that all  columsn are strings
for column in dataset.columns:
    dataset[column] = dataset[column].astype(str)

dataset["area"] = dataset["area"].str.lower()
dataset["nome"] = dataset["nome"].str.lower()

for c in dataset.columns:
    dataset[c] = dataset[c].str.strip()
# For now drop it
dataset.drop(columns=["linkedin"], inplace=True)
dataset.head()

Unnamed: 0,nome,area,minibio
0,antonio victor de oliveira matos paiva,engenharia mecatrônica/aviação/educação,sou apaixonado por revoluções tecnológicas e j...
1,barbara rezende neri,"tecnologia,",Ciência da computação na Federal de São João d...
2,breno keller lie,estudante de adm,Colégio suíço brasileiro e PUCPR
3,bruno mateus tizer das chagas,tecnologia,
4,daniel marcelo gonzaga de lorena braga,"desenvolvedor de software,","Estudo na Descomplica ADS, trabalhei na Cerc u..."


#### Normalize text

In [11]:
pd.options.mode.chained_assignment = None #https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#evaluation-order-matters
# s is input text
def normalize_text(s, sep_token = " \n "):
    s = re.sub(r'\s+',  ' ', s).strip()
    s = re.sub(r". ,","",s)
    # remove all instances of multiple spaces
    s = s.replace("..",".")
    s = s.replace(". .",".")
    s = s.replace("\n", "")
    s = s.strip()
    
    return s

dataset.fillna('', inplace=True)

dataset = dataset.map(normalize_text)

In [12]:
dataset.columns.tolist()

['nome', 'area', 'minibio']

In [13]:
# Combine the text columns into a single column
dataset["combined"] = "Nome: " + dataset["nome"] + ";" + "Area: " + dataset["area"] + "; Minibio: " + dataset["minibio"]
dataset.combined.head()


0    Nome: antonio victor de oliveira matos paiva;A...
1    Nome: barbara rezende neri;Area: tecnologia,; ...
2    Nome: breno keller lie;Area: estudante de adm;...
3    Nome: bruno mateus tizer das chagas;Area: tecn...
4    Nome: daniel marcelo gonzaga de lorena braga;A...
Name: combined, dtype: object

## Embedding

In [14]:
top_n = 1000

encoding = tiktoken.get_encoding(embedding_encoding)

dataset["n_tokens"] = dataset.combined.apply(lambda x: len(encoding.encode(x)))
dataset = dataset[dataset.n_tokens <= max_tokens].tail(top_n)
len(dataset)

52

In [15]:
dataset.head()

Unnamed: 0,nome,area,minibio,combined,n_tokens
0,antonio victor de oliveira matos paiva,engenharia mecatrônica/aviação/educação,sou apaixonado por revoluções tecnológicas e j...,Nome: antonio victor de oliveira matos paiva;A...,77
1,barbara rezende neri,"tecnologia,",Ciência da computação na Federal de São João d...,"Nome: barbara rezende neri;Area: tecnologia,; ...",33
2,breno keller lie,estudante de adm,Colégio suíço brasileiro e PUCPR,Nome: breno keller lie;Area: estudante de adm;...,31
3,bruno mateus tizer das chagas,tecnologia,,Nome: bruno mateus tizer das chagas;Area: tecn...,22
4,daniel marcelo gonzaga de lorena braga,"desenvolvedor de software,","Estudo na Descomplica ADS, trabalhei na Cerc u...",Nome: daniel marcelo gonzaga de lorena braga;A...,70


In [16]:
max(dataset.n_tokens)

245

In [24]:
## Get embeddings
import cohere

api_key = "306PkdzQVOkbODG9TN62g97mb32XspH7b4gO2mt5"

co = cohere.Client(api_key)

response = co.embed(
    texts=dataset.combined.tolist(),
    model="embed-multilingual-v3.0",
    input_type="classification",
)

if hasattr(response, "id"):
    print("Something went wrong")

Something went wrong


In [30]:
dataset["embedding"] = response.embeddings


In [36]:
dataset.to_csv("data/embeddings.csv", index=False)

## Matching

In [65]:
# Create classifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
test_samples = [5,7,13,21]
knn = KNeighborsClassifier(n_neighbors=2)
test_indices = dataset.index.isin(test_samples)
train_dataset = dataset[~test_indices]
test_dataset = dataset[test_indices]
X_train, y_train, X_test, y_test = train_dataset.embedding.tolist(), train_dataset.index, test_dataset.embedding.tolist(), test_dataset.index
knn.fit(X_train, y_train)


In [85]:
def get_neighboors(sample : int, n=3):
    neighbors = knn.kneighbors(np.array(dataset.iloc[sample].embedding).reshape((1, -1)), n_neighbors=3, return_distance=False)
    nearest_samples = dataset.iloc[neighbors[0]]
    person_selected = dataset.iloc[sample]
    print(f"Pessoa com nome {person_selected['nome']} e i={sample} deu match com:")
    display(nearest_samples)

for i in range(len(test_samples)):
    get_neighboors(test_samples[i], n=3)
# # Get the indices of the 3 nearest neighbors
# neighbors = knn.kneighbors(np.array(dataset.iloc[sample].embedding).reshape((1, -1)), n_neighbors=3, return_distance=False)

# # Get the corresponding samples
# nearest_samples = dataset.iloc[neighbors[0]]

# # Display the nearest samples
# display(nearest_samples)


Pessoa com nome davi d'avila versan e i=5 deu match com:


Unnamed: 0,nome,area,minibio,combined,n_tokens,embedding
0,antonio victor de oliveira matos paiva,engenharia mecatrônica/aviação/educação,sou apaixonado por revoluções tecnológicas e j...,Nome: antonio victor de oliveira matos paiva;A...,77,"[0.017227173, 0.04748535, -0.037750244, 0.0078..."
18,igor mattos dos santos varejao,ia e cloud,Faço pesquisa em aprendizado profundo (IA) a 4...,Nome: igor mattos dos santos varejao;Area: ia ...,102,"[-0.0023479462, 0.03024292, -0.050994873, 0.02..."
1,barbara rezende neri,"tecnologia,",Ciência da computação na Federal de São João d...,"Nome: barbara rezende neri;Area: tecnologia,; ...",33,"[-0.009788513, 0.044647217, 0.0030574799, 0.00..."


Pessoa com nome enzo craveiro da costa gomes e i=7 deu match com:


Unnamed: 0,nome,area,minibio,combined,n_tokens,embedding
6,eduardo da silva marcelino,desenvolvedor back-end;,Trabalho há 4 anos no Club&Casa Design e Curso...,Nome: eduardo da silva marcelino;Area: desenvo...,44,"[-0.009529114, 0.026489258, -0.029052734, 0.00..."
3,bruno mateus tizer das chagas,tecnologia,,Nome: bruno mateus tizer das chagas;Area: tecn...,22,"[0.0129470825, 0.033477783, -0.035125732, 0.01..."
9,enzo teodosio portela,tecnologia,Enzo é um empreendedor apaixonado por tecnolog...,Nome: enzo teodosio portela;Area: tecnologia; ...,214,"[0.003250122, 0.026947021, -0.016357422, 0.007..."


Pessoa com nome geovanna vitoria de souza e i=13 deu match com:


Unnamed: 0,nome,area,minibio,combined,n_tokens,embedding
37,marcelo miguel pereira de assis,"ai, dev fullstack","Goiânia-GO, estou no Inteli",Nome: marcelo miguel pereira de assis;Area: ai...,38,"[0.014060974, 0.045715332, -0.050231934, 0.019..."
0,antonio victor de oliveira matos paiva,engenharia mecatrônica/aviação/educação,sou apaixonado por revoluções tecnológicas e j...,Nome: antonio victor de oliveira matos paiva;A...,77,"[0.017227173, 0.04748535, -0.037750244, 0.0078..."
19,joao marcelo de andrade brito,"marketing, trabalho ajudando ecommerces a esca...",UFABC - Ciências da computação,Nome: joao marcelo de andrade brito;Area: mark...,56,"[0.027114868, 0.045043945, -0.0025119781, 0.02..."


Pessoa com nome joao victor moreira da silva e i=21 deu match com:


Unnamed: 0,nome,area,minibio,combined,n_tokens,embedding
5,davi d'avila versan,engenharia de software,Instituto de Tecnologia e Liderança (Inteli),Nome: davi d'avila versan;Area: engenharia de ...,34,"[-0.015052795, 0.022079468, -0.033081055, 0.04..."
18,igor mattos dos santos varejao,ia e cloud,Faço pesquisa em aprendizado profundo (IA) a 4...,Nome: igor mattos dos santos varejao;Area: ia ...,102,"[-0.0023479462, 0.03024292, -0.050994873, 0.02..."
17,ian louzada vancura ede,vendas e ia,Nascido e criado em Belo Horizonte fiz o ensin...,Nome: ian louzada vancura ede;Area: vendas e i...,111,"[0.00027370453, 0.017562866, -0.03781128, 0.02..."


In [82]:
links = pd.read_excel("data/hack_people.xlsx")[["nome", "linkedin"]]


In [91]:
import pandas as pd
# Increase the column display size to 1000
# Increase the display size of the 'minibio' column to 1000
pd.set_option('display.max_colwidth', 1000)

# Your code here to display the DataFrame or column

links.iloc[[21, 5, 18, 17]]

Unnamed: 0,nome,linkedin
21,JOAO VICTOR MOREIRA DA SILVA,https://www.linkedin.com/in/jv-silva
5,DAVI D'AVILA VERSAN,https://www.linkedin.com/in/daviversan
18,IGOR MATTOS DOS SANTOS VAREJAO,https://www.linkedin.com/in/igor-varej%C3%A3o-7859031b0?utm_source=share&utm_campaign=share_via&utm_content=profile&utm_medium=android_app
17,IAN LOUZADA VANCURA EDE,https://www.linkedin.com/in/ian-ede?utm_source=share&utm_campaign=share_via&utm_content=profile&utm_medium=android_app


In [92]:
dataset.iloc[[21, 5, 18, 17]]

Unnamed: 0,nome,area,minibio,combined,n_tokens,embedding
21,joao victor moreira da silva,desenvolvedor frontend,"BTG, Trusted House Sitters, Boticário, Tramontina e Flash Menu","Nome: joao victor moreira da silva;Area: desenvolvedor frontend; Minibio: BTG, Trusted House Sitters, Boticário, Tramontina e Flash Menu",42,"[-0.01725769, 0.01763916, -0.018981934, -0.035095215, 0.036895752, 0.031829834, 0.0076026917, -0.066833496, -0.015434265, 6.264448e-05, 0.016143799, 0.042785645, 0.008804321, 0.017654419, 0.034484863, 0.010032654, -0.003414154, 0.03677368, -0.029800415, -0.05621338, -0.017349243, 0.0055770874, 0.036468506, 0.06036377, 0.01007843, -0.0035896301, 0.0067214966, 0.007003784, 0.008346558, -0.050689697, -0.016052246, 0.0011901855, -0.016738892, 0.0059928894, -0.022140503, -0.013137817, -0.0045051575, -0.0657959, -0.022369385, 0.051605225, -0.022140503, -0.028045654, -0.032043457, 0.031280518, -0.012504578, -0.03527832, 0.042266846, 0.039093018, -0.0016860962, 0.032348633, 0.014389038, -0.00655365, 0.009880066, 0.005634308, 0.001745224, 0.011810303, 0.03845215, 0.02494812, -0.018859863, 0.035125732, -0.008529663, -0.042907715, -0.08532715, 0.011817932, 0.011062622, 0.041748047, 0.0513916, -0.066467285, 0.006832123, -0.013198853, -0.0030651093, -0.0231781, 0.010803223, 0.011276245, -0.0385..."
5,davi d'avila versan,engenharia de software,Instituto de Tecnologia e Liderança (Inteli),Nome: davi d'avila versan;Area: engenharia de software; Minibio: Instituto de Tecnologia e Liderança (Inteli),34,"[-0.015052795, 0.022079468, -0.033081055, 0.041229248, 0.00087690353, -0.009963989, 0.007713318, -0.023422241, -0.0049095154, -0.026748657, 0.0084991455, -0.009643555, -0.012329102, 0.0053596497, -0.02482605, 0.016601562, -0.015388489, 0.0016593933, -0.025482178, -0.0063972473, -0.0149383545, 0.023101807, 0.0049438477, 0.0015239716, -0.015151978, 0.035064697, 0.014823914, -0.03100586, 0.03164673, -0.02168274, -0.007575989, -0.01600647, 0.010108948, 0.014175415, -0.02859497, -0.03451538, 0.013015747, -0.04537964, 0.014015198, 0.014228821, -0.056488037, -0.031280518, -0.03262329, 0.04864502, -0.0030288696, -0.035003662, 0.03189087, 0.04837036, 0.014518738, 0.039794922, -0.0066490173, -0.009529114, 0.021438599, -0.01260376, 0.022979736, 0.0028438568, 0.015319824, -0.011642456, -0.029678345, 0.005065918, -0.0053710938, -0.046966553, -0.0871582, 0.036315918, -0.019958496, 0.0036125183, 0.019699097, 0.019485474, 0.027130127, 0.019210815, -0.0012989044, 0.01600647, 0.0065994263, 0.0236358..."
18,igor mattos dos santos varejao,ia e cloud,Faço pesquisa em aprendizado profundo (IA) a 4 anos no laboratório NINFA na Universidade Federal do Espirito Santo (UFES) - Trabalho atualmente na empresa Olho do Dono uma startup que utiliza IA para pesatem do gado - Estou atualmente cursando Ciência da Computação na UFES já finalizando o curso,Nome: igor mattos dos santos varejao;Area: ia e cloud; Minibio: Faço pesquisa em aprendizado profundo (IA) a 4 anos no laboratório NINFA na Universidade Federal do Espirito Santo (UFES) - Trabalho atualmente na empresa Olho do Dono uma startup que utiliza IA para pesatem do gado - Estou atualmente cursando Ciência da Computação na UFES já finalizando o curso,102,"[-0.0023479462, 0.03024292, -0.050994873, 0.025619507, 0.0051727295, -0.0030231476, 0.0072898865, -0.035614014, -0.01727295, -0.0062446594, -0.014305115, 0.009262085, 0.0016069412, 0.017044067, 0.0037059784, 0.0032043457, 0.017959595, 0.0018053055, -0.018157959, -0.02468872, -0.03390503, 0.011985779, 0.024383545, -0.03289795, -0.02293396, 0.046051025, 0.009086609, -0.03564453, 0.029037476, -0.013053894, -0.0061912537, -0.024749756, 0.05606079, 0.06008911, -0.021484375, -0.018798828, 0.026870728, -0.007297516, 0.011070251, -0.013587952, 0.007575989, -0.005264282, -0.038330078, 0.045928955, -0.012496948, -0.014160156, 0.02671814, 0.055511475, -0.0025043488, 0.039764404, -0.020965576, 0.016204834, 0.020202637, -0.017944336, 0.021408081, 0.010429382, 0.021270752, 0.025466919, -0.04815674, 0.020309448, 0.001991272, -0.046203613, -0.11212158, 0.051757812, 0.02229309, 0.018615723, 0.015838623, 0.008049011, 0.01713562, 0.018356323, -0.0017499924, 0.0067710876, 0.041931152, 0.0030822754, -0..."
17,ian louzada vancura ede,vendas e ia,"Nascido e criado em Belo Horizonte fiz o ensino médio em um colégio técnico de TI, Cotemig. Trabalhei como desenvolvedor Salesforce por 1 ano, depois fui pra área de vendas e mkt. Atualmente tenho desenvolvido minha própria empresa, que aplica IA em vendas e atendimentos de outros negócios","Nome: ian louzada vancura ede;Area: vendas e ia; Minibio: Nascido e criado em Belo Horizonte fiz o ensino médio em um colégio técnico de TI, Cotemig. Trabalhei como desenvolvedor Salesforce por 1 ano, depois fui pra área de vendas e mkt. Atualmente tenho desenvolvido minha própria empresa, que aplica IA em vendas e atendimentos de outros negócios",111,"[0.00027370453, 0.017562866, -0.03781128, 0.026657104, -0.026123047, -0.032958984, 0.0013284683, -0.027816772, 0.003686905, -0.011749268, 0.015563965, 0.030303955, -0.0073051453, -0.0013875961, -0.006931305, 0.0099105835, -0.01852417, 0.0011806488, -0.041748047, -0.017211914, -0.00484848, 0.029525757, -0.0036334991, 0.036193848, 0.023651123, 0.035217285, 0.015052795, -0.0035953522, 0.03375244, -0.054992676, -0.014907837, -0.015281677, 0.064208984, 0.03982544, -0.024871826, -0.059661865, 0.033416748, -0.0069999695, -0.00038933754, -0.026306152, -0.014213562, -0.01171875, -0.0115356445, 0.02116394, -0.01600647, -0.041534424, 0.034576416, 0.02885437, 0.011009216, 0.034179688, -0.001865387, 0.035827637, 0.009017944, -0.011886597, -0.011489868, 0.012260437, 0.06298828, 0.047851562, -0.06512451, 0.02861023, -0.0032310486, -0.05291748, -0.10211182, 0.04083252, -0.024993896, 0.011978149, 0.015541077, -0.024261475, 0.045135498, 0.02468872, -0.02116394, -0.0048980713, 0.0067367554, 0.0163269..."
