# Notebook 3: Matching between Skills Extracted from Listings and Skills Extracted from Courses

Using Jaccard Similarity Measure with 3-gram tokenizer

#### This notebook produces the following data into the _output_datasets_ folder:
```
(SKILL) NODE						skill__node.csv
skill_id:ID
skill_name
aliases[]
:LABEL = "SKILL"

[TEACHES] RELATION					teaches__relation.csv
:START_ID = course_id
:END_ID = skill_id
:TYPE = "TEACHES"
```

## Imports

In [1]:
# %pip install py_stringmatching
import pandas as pd
import py_stringmatching as sm
import math

## Read Data

In [2]:
# this cell is to support running the notebook in Google Colab

mydrive = ""  # this is when we run locally

# Google Colab:
# from google.colab import drive
# drive.mount('/content/drive')
# mydrive = "/content/drive/MyDrive/DSE 203 — etl/DSE203_Project/"  # this is when we run on COLAB Leslie
# mydrive = "/content/drive/MyDrive/DSE203_Project/"  # this is when we run on COLAB Sergey

input_dir = mydrive+"input_datasets/"
output_dir = mydrive+"output_datasets/"
temp_dir = mydrive+"temp_datasets/"

In [3]:
# read dataset
listing_skills_df = pd.read_csv(temp_dir+'listing_skills_TEMP.csv')
courses_skills_df = pd.read_csv(temp_dir+'courses_skills_TEMP.csv')

listing_skills_df
# courses_skills_df

Unnamed: 0,listing_skill_id,listing_skill_name
0,0,salesforce
1,1,mobilesdk
2,2,salesforce1 platform and lightning components
3,3,lightning connect
4,4,apex
...,...,...
164,164,reactjs
165,165,tdd testdriven
166,166,mvcdesign
167,167,pdf resume pdfwe


In [4]:
# we don't want to throw away NaN, just not to break mapping
courses_skills_df.course_skill_name.fillna('no skill', inplace=True)
courses_skills_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233 entries, 0 to 232
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   course_skill_id    233 non-null    int64 
 1   course_skill_name  233 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.8+ KB


In [5]:
# we don't want to throw away NaN, just not to break mapping
listing_skills_df.listing_skill_name.fillna('no skill', inplace=True)
listing_skills_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   listing_skill_id    169 non-null    int64 
 1   listing_skill_name  169 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.8+ KB


## Matching: Using Jaccard Similarity Measure with 3-gram tokenizer

In [6]:
jaccard = sm.Jaccard()
qval=3
qgram_tokenizer = sm.QgramTokenizer(qval)

def calc_boundaries_jac_qg(s1, t):
    '''
    Calculate threshold boundaries for q-gram tokenizer and Jaccard similarity measure.
    Returns: list of [lower_boundary, upper_boundary] in amount of tokens (words)
    '''
    
    s1_tokens_length = len(qgram_tokenizer.tokenize(str(s1)))
    min_l = t*s1_tokens_length
    max_l = (1/t)*s1_tokens_length
    return [math.floor(min_l), math.ceil(max_l)]

def calc_jaccard_score_qg(s1, s2):
    '''
    Calculate Jaccard similarity score between 2 strings s1 and s2 with white q-gram tokenizer
    '''
    s1_tokens = qgram_tokenizer.tokenize(str(s1).lower())  # lowering just in case we forgot to do it before
    s2_tokens = qgram_tokenizer.tokenize(str(s2).lower())  # lowering just in case we forgot to do it before
    
    score = jaccard.get_sim_score(s1_tokens, s2_tokens)
    return score

In [7]:
# calculate number of tokens for each string in orgNames
courses_skills_df['n_tokens'] = courses_skills_df.course_skill_name.apply(lambda x: len(qgram_tokenizer.tokenize(str(x))))
courses_skills_df.head(5)

Unnamed: 0,course_skill_id,course_skill_name,n_tokens
0,0,statistical analysis,22
1,1,jupyter notebook,18
2,2,data analysis,15
3,3,trading,9
4,4,computer programming,22


In [8]:
listing_skill_id_list = listing_skills_df.listing_skill_id.to_list()
listing_skill_name_list = listing_skills_df.listing_skill_name.to_list()

In [9]:
%%time
t = 0.6
# this df will hold the final results patent_string | list_of_matches
final_df = pd.DataFrame(columns = ['listing_skill_id', 'listing_skill_name', 'course_skill_names', 'course_skill_ids'])

# for current_string in patent_strings_list:
for listing_skill_id, listing_skill_name in zip(listing_skill_id_list, listing_skill_name_list):

    # calculate threshold for count filter:
    boundaries = calc_boundaries_jac_qg(listing_skill_name, t)

    # Pruning: apply filtering based on the boundaries
    df_to_process = courses_skills_df[(courses_skills_df.n_tokens>=boundaries[0]) & (courses_skills_df.n_tokens<=boundaries[1])].copy()
    
    # just print to see the progress
    print(f'\r{listing_skill_id}. {listing_skill_name}    Matching with {len(df_to_process)} records.', end="")
    
    # calculate similarity measure score for each string in filtered df
    df_to_process['sim_score'] = df_to_process.course_skill_name.apply(lambda x: calc_jaccard_score_qg(x, listing_skill_name))

    # this will hold the list of matches for given string
    temp_df = df_to_process[df_to_process.sim_score>t].sort_values('sim_score', ascending=False)
    matched_course_skill_name_list = temp_df.course_skill_name.to_list()
    matched_course_skill_id_list = temp_df.course_skill_id.to_list()
    
    # convert to df with row patent_string | list_of_matches
    int_df = pd.DataFrame(data = [[str(listing_skill_id), listing_skill_name, matched_course_skill_name_list, matched_course_skill_id_list]], 
                          columns = ['listing_skill_id', 'listing_skill_name', 'course_skill_names', 'course_skill_ids'])

    # append to the final df
    final_df = pd.concat([final_df, int_df])

168. skills, knowledge and abilities handson    Matching with 60 records.CPU times: user 465 ms, sys: 40.2 ms, total: 505 ms
Wall time: 435 ms


In [10]:
final_df

Unnamed: 0,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids
0,0,salesforce,[],[]
0,1,mobilesdk,[],[]
0,2,salesforce1 platform and lightning components,[],[]
0,3,lightning connect,[],[]
0,4,apex,[],[]
...,...,...,...,...
0,164,reactjs,[],[]
0,165,tdd testdriven,[],[]
0,166,mvcdesign,[],[]
0,167,pdf resume pdfwe,[],[]


In [11]:
# lets look what was matched:
final_df[final_df.course_skill_names.str.len()>0]

Unnamed: 0,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids
0,18,python,[python],[6]
0,20,javascript,[javascripts],[118]
0,23,java,[java],[34]
0,31,analytics,[analytics],[163]
0,37,selenium,[selenium],[77]
0,118,finance,[finance],[12]
0,150,ui,[ui],[75]


In [12]:
# for aliases attribute, we'll make a list of all matches
final_df['all_skills_matched'] = final_df['listing_skill_name'].apply(lambda x: x.split(':'))+final_df['course_skill_names']
final_df.all_skills_matched = final_df.all_skills_matched.apply((set)).apply(list)
final_df['all_skills_matched_str'] = final_df.all_skills_matched.apply(lambda x: ";".join(x))
# final_df[final_df.course_skill_names.str.len()>0]

In [13]:
final_df.to_csv(temp_dir+'matched_listing_courses_skills__TEMP.csv', index=False)

## Create Nodes table for SKILL

In [14]:
skill_node_df = final_df[['listing_skill_id', 'listing_skill_name', 'all_skills_matched_str']]
skill_node_df.columns = ['skill_id:ID', 'skill_name', 'aliases[]']
skill_node_df[':LABEL']="SKILL"
skill_node_df.to_csv(output_dir+'skill__node.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## Create Relationships table for COURSE (course_id) - TEACHES - SKILL (listing_skill_id, not course_skill_id)

In [15]:
# read courses
# courses_df = pd.read_csv(output_dir+"course__node.csv")
# courses_df.head(5)

In [16]:
# read matched listing_skills with course_skills 
matched_course_skills_df = final_df[final_df.course_skill_names.str.len()>0]
matched_course_skills_df.head(5)

Unnamed: 0,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids,all_skills_matched,all_skills_matched_str
0,18,python,[python],[6],[python],python
0,20,javascript,[javascripts],[118],"[javascript, javascripts]",javascript;javascripts
0,23,java,[java],[34],[java],java
0,31,analytics,[analytics],[163],[analytics],analytics
0,37,selenium,[selenium],[77],[selenium],selenium


In [17]:
matched_course_skills_exploded_df = matched_course_skills_df.explode('course_skill_ids')
matched_course_skills_exploded_df.head(5)

Unnamed: 0,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids,all_skills_matched,all_skills_matched_str
0,18,python,[python],6,[python],python
0,20,javascript,[javascripts],118,"[javascript, javascripts]",javascript;javascripts
0,23,java,[java],34,[java],java
0,31,analytics,[analytics],163,[analytics],analytics
0,37,selenium,[selenium],77,[selenium],selenium


In [18]:
# read relation between courses and course_skills
courses_skills_relation_df = pd.read_csv(temp_dir+"courses_skills_relationship_TEMP.csv")
courses_skills_relation_df.head(5)

Unnamed: 0,course_id,course_skill_id
0,0,0
1,0,1
2,0,2
3,6,2
4,19,2


In [19]:
# we need to merge matched_course_skills_exploded_df and courses_skills_relation_df to get connection between course_id and listing_skill_id
course_listing_skill_relation_df = courses_skills_relation_df.merge(matched_course_skills_exploded_df, 
                                                                    how='outer', 
                                                                    left_on='course_skill_id', 
                                                                    right_on='course_skill_ids')

# final = JobListing.merge(skills, how='outer', left_on='listing_skills', right_on='job_listing_skill_name')

course_listing_skill_relation_df

Unnamed: 0,course_id,course_skill_id,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids,all_skills_matched,all_skills_matched_str
0,0,0,,,,,,
1,0,1,,,,,,
2,0,2,,,,,,
3,6,2,,,,,,
4,19,2,,,,,,
...,...,...,...,...,...,...,...,...
269,19,228,,,,,,
270,19,229,,,,,,
271,19,230,,,,,,
272,19,231,,,,,,


In [20]:
course_listing_skill_relation_df = course_listing_skill_relation_df[['course_id', 'listing_skill_id']]
course_listing_skill_relation_df.columns = [':START_ID', ':END_ID']
course_listing_skill_relation_df[':TYPE']='TEACHES'
course_listing_skill_relation_df.dropna(inplace=True)
course_listing_skill_relation_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,:START_ID,:END_ID,:TYPE
12,0,18,TEACHES
13,3,18,TEACHES
14,4,18,TEACHES
22,0,118,TEACHES
23,18,118,TEACHES


In [21]:
course_listing_skill_relation_df.to_csv(output_dir+"teaches__relation.csv", index=False)

In [22]:
len(courses_skills_df)

233