In [1]:
# importing all necessary modules
import pandas as pd
from gensim.models import Word2Vec
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
# NOTE(review): no category/module filter is given, so this hides every warning
# raised for the rest of the session (deprecation warnings included).
warnings.filterwarnings(action='ignore')

## Get open positions and transformed datasets

In [3]:
# Open positions collected by the LinkedIn web scraping step.
open_positions = pd.read_csv(r".\linkedin_jobs_scraping.csv")

# All companies collected by the web scraping step.
all_companies = pd.read_csv(r".\all_companies_scraping.csv")

# People found similar by the first model; only the first 30 rows are used.
# This must be loaded BEFORE the flag column below is computed: the flag reads
# `list_urls_similar`, and computing it first fails with NameError on a fresh kernel.
profile = pd.read_csv(r".\similar_people.csv")
profile = profile.head(30)

list_urls_similar = list(profile['url'])

# Transformed LinkedIn profiles with the past-job descriptions extracted (from Databricks).
df = pd.read_csv(r".\transformed_df.csv")

# Flag each profile whose URL (compared as a string) is one of the similar people.
df['is_in_similarities'] = df['url'].apply(lambda x: str(x) in list_urls_similar)

# Persist the flagged frame, then continue from the saved file.
df.to_csv('with_column_is_similar.csv', index=False)
df = pd.read_csv('with_column_is_similar.csv')

# Rows of the similar people only, re-indexed from 0.
my_similar_people = df[df['is_in_similarities']].reset_index(drop=True)

## Pre-processing the data for the word2vec model

In [8]:
# Build the training text for word2vec: all non-null 'combined' descriptions
# plus all open-position titles, joined into one string.
s = ' '.join(df.dropna(subset=['combined'])['combined'])

open_p = ' '.join(open_positions['Title'])

# Keep a space between the two parts; without it the last title word and the
# first description word are glued together into a single bogus token.
all_text = open_p + ' ' + s

# Replace newlines with spaces and strip commas and hyphens before tokenizing.
f = all_text.replace("\n", " ").replace(",", "").replace("-", "")

# One list of lower-cased word tokens per sentence, the input shape used for
# training the model in the next cell.
data = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(f)
]

## Create the word2vec model

In [4]:
# Train a skip-gram (sg=1) Word2Vec model on the tokenized sentences:
# 100-dimensional vectors, context window of 5, min_count=1.
model = Word2Vec(
    sentences=data,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
)

# Write the trained model to disk and continue with the copy loaded from that file.
model.save("my_word2vec.model")
model = Word2Vec.load("my_word2vec.model")

## Calculate the similarity between the descriptions and the open positions titles

In [29]:
# Score every (similar-person description, open-position title) pair with the
# mean word2vec similarity over all (title word, description word) pairs.
# The result is keyed by (description text, title); two people with exactly the
# same description text therefore share one entry (the later one overwrites).
#
# NOTE(review): words are obtained here by splitting on single spaces, while the
# training text had commas/hyphens removed and was split with word_tokenize.
# Words that still carry punctuation here may be absent from the vocabulary and
# are then skipped - confirm this is intended.
similarities = {}
len_of_similar = len(my_similar_people)

# Title words do not depend on the person, so tokenize every title only once.
titles_with_words = [
    (title, [word.lower() for word in title.split(' ')])
    for title in open_positions['Title']
]

# run over the descriptions
for i in range(len_of_similar):
    my_similar_people_desc = str(my_similar_people.iloc[i]['combined'])
    words_in_similar_people = [word.lower() for word in my_similar_people_desc.split(' ')]
    # run over the titles from open positions
    for title, words_in_title in titles_with_words:
        counter = 0
        sum_similarities = 0
        # accumulate the similarity of every word pair known to the model
        for word1 in words_in_title:
            for word2 in words_in_similar_people:
                try:
                    sum_similarities += model.wv.similarity(word1, word2)
                except KeyError:
                    # At least one of the two words is not in the model's
                    # vocabulary; skip the pair. Only KeyError is caught so that
                    # real failures (and Ctrl-C) are no longer swallowed.
                    continue
                counter += 1
        # No scorable word pair for this title: leave it out instead of dividing by zero.
        if counter == 0:
            continue
        similarities[my_similar_people_desc, title] = sum_similarities / counter

## Get the most recommended jobs

In [30]:
# Rank all (description, title) pairs by score, highest first, and keep the best three.
top_jobs_similarites = sorted(
    similarities.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:3]
# Every entry looks like ((description, title), score); collect only the titles.
top_jobs = [title for (_description, title), _score in top_jobs_similarites]

In [31]:
# Separate the (description, title) keys from their similarity scores.
keys = [(description, title) for description, title in similarities]
values = list(similarities.values())

# One row per scored pair: description, title and the pair's similarity.
df_similarity = pd.DataFrame(
    {
        'Description': [pair[0] for pair in keys],
        'Title': [pair[1] for pair in keys],
        'Similarity': values,
    }
)

In [32]:
# Average the similarity per title and order the titles from best to worst.
# Only the numeric 'Similarity' column is aggregated: taking the mean of the
# whole frame also tries to average the text column 'Description', which
# raises TypeError on pandas >= 2.0.
similarities_titles = (
    df_similarity
    .groupby('Title')[['Similarity']]
    .mean()
    .sort_values(by='Similarity', ascending=False)
)

# Move 'Title' from the index back into a regular column.
similarities_titles = similarities_titles.reset_index()

In [34]:
# Number of recommended postings to keep.
top_n_jobs = 20

# Attach the posting details to every scored title, keep one row per
# (title, company) and order by similarity, best first.
# NOTE(review): this reassigns `similarities_titles` from itself, so running the
# cell a second time merges the posting columns again.
similarities_titles = (
    similarities_titles
    .merge(open_positions, on='Title')
    .drop_duplicates(subset=['Title', 'Company_Name'])
    .sort_values(by='Similarity', ascending=False)
)

# The top_n_jobs most recommended postings.
top_similarities_titles = similarities_titles.head(top_n_jobs)

In [38]:
# Show the top recommended postings (the cell's last expression is rendered as its output).
top_similarities_titles

Unnamed: 0,Title,Similarity,Company_Name,Company_URL,Job_Location,Seniority_level,Employment_type,Job_function,Industries,About,Job_URL,Posted,Collected
0,Foreman/Project Manager,0.499091,Certified Apartment Staffing,https://www.linkedin.com/company/prolific-staf...,"Arlington, TX",Mid-Senior level,Full-time,Management and Manufacturing,Staffing and Recruiting,Job DescriptionWe are seeking a Foreman/Projec...,https://www.linkedin.com/jobs/view/foreman-pro...,5 hours ago,2024-03-25 15:47:43.813415
3,Admin/Compliance Analyst-Trainee (Korean),0.493735,ecocareers,https://uk.linkedin.com/company/ecocareers?trk...,"New York, NY",Internship,Full-time,Legal,Staffing and Recruiting,Our client is seeking anAdmin/Compliance Analy...,https://www.linkedin.com/jobs/view/admin-compl...,14 hours ago,2024-03-24 12:23:59.259974
4,SENIOR ACCOUNTANT/ACCOUNTING MANAGER,0.493459,Milestone Property Management,https://www.linkedin.com/company/milestone-pro...,"Portland, OR",Mid-Senior level,Full-time,Accounting/Auditing and Finance,Investment Management,Job DescriptionMilestone Property Management i...,https://www.linkedin.com/jobs/view/senior-acco...,1 day ago,2024-03-25 10:01:45.893607
6,SENIOR ACCOUNTANT/ACCOUNTING MANAGER,0.493459,Source 1 Solutions,https://www.linkedin.com/company/source-1-solu...,"Clearwater, FL",Mid-Senior level,Contract,Accounting/Auditing and Finance,Human Resources Services,Job DescriptionABOUT THE ROLE:As the Senior Ac...,https://www.linkedin.com/jobs/view/senior-acco...,1 day ago,2024-03-24 20:24:27.201087
7,HR Coordinator/Recruiter,0.491202,American Pool,https://www.linkedin.com/company/american-pool...,"Miami, FL",Entry level,Full-time,Human Resources,Recreational Facilities,As the Human Resources Coordinator you will su...,https://www.linkedin.com/jobs/view/hr-coordina...,1 week ago,2024-03-24 00:26:35.242434
8,Inside/OSP Manager,0.486677,TekSynap,https://www.linkedin.com/company/teksynap?trk=...,"Arlington, VA",Mid-Senior level,Full-time,Other,Information Technology & Services,We are seeking an Inside/OSP Manager to join o...,https://www.linkedin.com/jobs/view/inside-osp-...,1 week ago,2024-03-25 10:01:45.893607
9,HR/Payroll Specialist,0.486531,Pressed Juicery,https://www.linkedin.com/company/pressedoffici...,"Culver City, CA",Associate,Full-time,"Human Resources, General Business, and Adminis...","Food and Beverage Retail, Food and Beverage Ma...",About Pressed JuiceryPressed Juicery is at the...,https://www.linkedin.com/jobs/view/hr-payroll-...,2 weeks ago,2024-03-24 00:26:35.242434
10,Senior Director-Finance,0.482603,NYU Grossman Long Island School of Medicine,https://www.linkedin.com/school/nyu-long-islan...,"New York, NY",Director,Full-time,Finance and Sales,Higher Education,NYU Grossman School of Medicine is one of the...,https://www.linkedin.com/jobs/view/senior-dire...,2 days ago,2024-03-24 20:24:27.201087
12,Senior Specialist-AML,0.482603,KeyBank,https://www.linkedin.com/company/keybank?trk=p...,United States,Mid-Senior level,Full-time,Information Technology,Banking,"Location:For Those Who Work At Home - Various,...",https://www.linkedin.com/jobs/view/senior-spec...,2 days ago,2024-03-24 20:24:27.201087
14,Bookkeeper/Office Manager,0.482183,Staff Financial Group,https://www.linkedin.com/company/staff-financi...,"Orlando, FL",Mid-Senior level,Full-time,Administrative,Staffing and Recruiting,Bookkeeper/Office ManagerOur client has an imm...,https://www.linkedin.com/jobs/view/bookkeeper-...,2 days ago,2024-03-24 15:59:56.979478


In [39]:
# Export the top recommended postings to an Excel file.
# NOTE(review): reset_index() turns the current index into a column and to_excel
# is called with its default index setting, so the sheet presumably ends up with
# two index-like columns - confirm this is what the downstream report expects.
top_similarities_titles.reset_index().to_excel('Most_similar_jobs_profile.xlsx')

#### Plotting using Power BI