In [3]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [4]:
# Load the onboarding CSV (path is relative to the notebook's working
# directory) and show the column names as the cell output.
employees = pd.read_csv("fau_onboarding.csv")
employees.columns

Index(['id', 'teams', 'previous_experience', 'hobbies', 'sports'], dtype='object')

In [5]:
# Preview the first five rows of the loaded frame.
employees.head(n=5)

Unnamed: 0,id,teams,previous_experience,hobbies,sports
0,emp_001,team_02,Expert.,"Hobbies for learning, reading, and curiosity.",tennis
1,emp_002,team_01,Novice.,"Hobbies for arts, creativity, and imagination.",volleyball
2,emp_003,team_03,Advanced beginner.,"Hobbies for fitness, health, and wellbeing.",volleyball
3,emp_004,team_04,Competent.,"Hobbies for money, assets, and content creation.",tennis
4,emp_005,team_05,Proficient.,"Hobbies for fitness, health, and wellbeing.",swimming


In [6]:
def create_soup(x):
    """Combine one employee's text fields into a single string.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row providing the ``previous_experience``, ``hobbies`` and
        ``sports`` entries.

    Returns
    -------
    str
        The three fields, in that order, separated by single spaces.
        Missing entries (``None`` or ``NaN``) are skipped.
    """
    fields = (x['previous_experience'], x['hobbies'], x['sports'])

    def _is_missing(value):
        # NaN is the only float that is not equal to itself; pandas uses it
        # for empty CSV cells.
        return value is None or (isinstance(value, float) and value != value)

    # Join with a space so the last word of one field can never fuse with the
    # first word of the next when a field does not end in punctuation.
    return ' '.join(str(value) for value in fields if not _is_missing(value))
# Build the combined text for every employee: axis='columns' hands each row
# to create_soup, and the results are stored in a new 'soup' column.
employees['soup'] = employees.apply(create_soup, axis='columns')

In [7]:
# Vectorizer that turns text into TF-IDF weights, ignoring scikit-learn's
# built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words = 'english')
# Learn the vocabulary from the 'soup' column and encode every row in one step.
tfidf_matrix = tfidf.fit_transform(employees['soup'])
# Shown as the cell output: (number of rows, number of vocabulary terms).
tfidf_matrix.shape

(33, 27)

In [8]:
# Pairwise dot products between all rows of the TF-IDF matrix; with the second
# argument omitted, linear_kernel compares the matrix against itself.
# TfidfVectorizer L2-normalises rows by default (norm='l2'), so these dot
# products are cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix)

# Lookup table from employee id to the row label in `employees` (equal to the
# row position for the default RangeIndex that read_csv produces).
indices = pd.Series(employees.index, index=employees['id'])
# Keep only the first row for each id. Series.drop_duplicates() would compare
# the *values* (row labels, which are always distinct), so it never removed a
# repeated id; deduplicate on the index labels instead.
indices = indices[~indices.index.duplicated(keep='first')]

In [42]:
def get_recommendations(ID, cosine_sim=cosine_sim, top_n=5):
    """Return the ids of the employees most similar to employee ``ID``.

    Parameters
    ----------
    ID : str
        Employee id to look up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity scores, indexed by row position in ``employees``.
        Defaults to the ``cosine_sim`` matrix that existed when this function
        was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 5).

    Returns
    -------
    pandas.Series
        Entries of ``employees['id']`` for up to ``top_n`` *other* employees,
        ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``ID`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Row position of the employee that matches the given id.
    IDx = indices[ID]
    if isinstance(IDx, pd.Series):
        # A repeated id yields several positions; use the first occurrence.
        IDx = IDx.iloc[0]

    # Score every *other* employee. The queried row is excluded by position
    # rather than by assuming it sorts first: an employee with an identical
    # profile ties with it and could otherwise push it into its own results.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[IDx])
        if position != IDx
    ]

    # Stable sort: among equal scores the lower row position stays first.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the top_n most similar employees.
    employee_indices = [position for position, _ in sim_scores[:top_n]]

    return employees['id'].iloc[employee_indices]

In [43]:
# Example query: the employees most similar to emp_033.
get_recommendations(ID='emp_033', cosine_sim=cosine_sim)

26    emp_027
1     emp_002
2     emp_003
12    emp_013
24    emp_025
Name: id, dtype: object