# Notebook 3: Matching between Skills Extracted from Listings and Skills Extracted from Courses

Using Jaccard Similarity Measure with 3-gram tokenizer

#### This notebook writes the following files to the _output_datasets_ folder:
```
(SKILL) NODE						skill__node.csv
skill_id:ID
skill_name
aliases[]
:LABEL = "SKILL"

[TEACHES] RELATION					teaches__relation.csv
:START_ID = course_id
:END_ID = skill_id
:TYPE = "TEACHES"
```

## Imports

In [None]:
# %pip install py_stringmatching
import pandas as pd
import py_stringmatching as sm
import math

## Read Data

In [None]:
# Resolve the project root so the notebook runs both locally and in Google Colab.

mydrive = ""  # local run: dataset folders are relative to the working directory

try:
    # google.colab can only be imported inside a Colab runtime; importing it
    # unconditionally made this cell fail on every local run
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')
    # Per-user project locations on Google Drive; keep exactly one of them active.
    # mydrive = "/content/drive/MyDrive/DSE 203 — etl/DSE203_Project/"  # this is when we run on COLAB Leslie
    mydrive = "/content/drive/MyDrive/DSE203_Project/"  # this is when we run on COLAB Sergey

input_dir = mydrive+"input_datasets/"
output_dir = mydrive+"output_datasets/"
temp_dir = mydrive+"temp_datasets/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the two intermediate skill tables from temp_dir
listing_skills_path = temp_dir + 'listing_skills_TEMP.csv'
courses_skills_path = temp_dir + 'courses_skills_TEMP.csv'
listing_skills_df = pd.read_csv(listing_skills_path)
courses_skills_df = pd.read_csv(courses_skills_path)

# preview the listing skills (swap in courses_skills_df to inspect the other table)
listing_skills_df

Unnamed: 0,listing_skill_id,listing_skill_name
0,0,ecommerceretail qa
1,1,lan
2,2,peoplesoft
3,3,bourne shell scripting
4,4,groovy
...,...,...
29418,29418,nosqldatabase
29419,29419,programmingdevelopment
29420,29420,programming on win xp788.1
29421,29421,skills win32 programming expertcc++ programming


In [None]:
# We don't want to throw away rows with a NaN skill name (that would break the mapping),
# so they get a placeholder instead. The result is assigned back to the column:
# fillna(inplace=True) on a column accessor is chained assignment, which newer pandas
# versions warn about and which does not modify the DataFrame under copy-on-write.
courses_skills_df['course_skill_name'] = courses_skills_df['course_skill_name'].fillna('no skill')
courses_skills_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16761 entries, 0 to 16760
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   course_skill_id    16761 non-null  int64 
 1   course_skill_name  16761 non-null  object
dtypes: int64(1), object(1)
memory usage: 262.0+ KB


In [None]:
# We don't want to throw away rows with a NaN skill name (that would break the mapping),
# so they get a placeholder instead. The result is assigned back to the column:
# fillna(inplace=True) on a column accessor is chained assignment, which newer pandas
# versions warn about and which does not modify the DataFrame under copy-on-write.
listing_skills_df['listing_skill_name'] = listing_skills_df['listing_skill_name'].fillna('no skill')
listing_skills_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29423 entries, 0 to 29422
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   listing_skill_id    29423 non-null  int64 
 1   listing_skill_name  29423 non-null  object
dtypes: int64(1), object(1)
memory usage: 459.9+ KB


## Matching: Using Jaccard Similarity Measure with 3-gram tokenizer

In [None]:
# q-gram size used by the tokenizer (3 -> 3-grams)
qval = 3
# tokenizer and similarity measure used by the two helper functions in this cell
qgram_tokenizer = sm.QgramTokenizer(qval=qval)
jaccard = sm.Jaccard()

def calc_boundaries_jac_qg(s1, t):
    '''
    Calculate the count-filter boundaries for the q-gram tokenizer and Jaccard similarity measure.

    s1: value whose q-gram tokens are counted (converted with str() first)
    t:  similarity threshold; must be greater than 0

    Returns: list of [lower_boundary, upper_boundary], expressed as a number of
             q-gram tokens (not words): floor(t * n) and ceil(n / t), where n is
             the token count of s1.
    Raises:  ValueError if t is not greater than 0.
    '''
    if not t > 0:
        # t == 0 would divide by zero below; a negative t gives an inverted, meaningless range
        raise ValueError(f"threshold t must be greater than 0, got {t!r}")

    s1_tokens_length = len(qgram_tokenizer.tokenize(str(s1)))
    min_l = t*s1_tokens_length
    max_l = (1/t)*s1_tokens_length
    return [math.floor(min_l), math.ceil(max_l)]

def calc_jaccard_score_qg(s1, s2):
    '''
    Jaccard similarity score between two strings s1 and s2, compared on their q-gram tokens.

    Both arguments go through str() and are lowercased before tokenizing
    (in case they were not lowercased earlier).
    '''
    token_lists = [qgram_tokenizer.tokenize(str(text).lower()) for text in (s1, s2)]
    return jaccard.get_sim_score(*token_lists)

In [None]:
# number of q-gram tokens in every course skill name
courses_skills_df['n_tokens'] = courses_skills_df['course_skill_name'].map(
    lambda name: len(qgram_tokenizer.tokenize(str(name)))
)
courses_skills_df.head(5)

Unnamed: 0,course_skill_id,course_skill_name,n_tokens
0,0,dialogue,10
1,1,celtx,7
2,2,creative writing,18
3,3,peering,9
4,4,film,6


In [None]:
# plain Python lists of the listing skill ids and names, in row order
listing_skill_id_list = listing_skills_df['listing_skill_id'].tolist()
listing_skill_name_list = listing_skills_df['listing_skill_name'].tolist()

In [None]:
%%time
t = 0.6
# this df will hold the final results patent_string | list_of_matches
final_df = pd.DataFrame(columns = ['listing_skill_id', 'listing_skill_name', 'course_skill_names', 'course_skill_ids'])

# for current_string in patent_strings_list:
for listing_skill_id, listing_skill_name in zip(listing_skill_id_list, listing_skill_name_list):

    # calculate threshold for count filter:
    boundaries = calc_boundaries_jac_qg(listing_skill_name, t)

    # Pruning: apply filtering based on the boundaries
    df_to_process = courses_skills_df[(courses_skills_df.n_tokens>=boundaries[0]) & (courses_skills_df.n_tokens<=boundaries[1])].copy()
    
    # just print to see the progress
    print(f'\r{listing_skill_id}. {listing_skill_name}    Matching with {len(df_to_process)} records.', end="")
    
    # calculate similarity measure score for each string in filtered df
    df_to_process['sim_score'] = df_to_process.course_skill_name.apply(lambda x: calc_jaccard_score_qg(x, listing_skill_name))

    # this will hold the list of matches for given string
    temp_df = df_to_process[df_to_process.sim_score>t].sort_values('sim_score', ascending=False)
    matched_course_skill_name_list = temp_df.course_skill_name.to_list()
    matched_course_skill_id_list = temp_df.course_skill_id.to_list()
    
    # convert to df with row patent_string | list_of_matches
    int_df = pd.DataFrame(data = [[str(listing_skill_id), listing_skill_name, matched_course_skill_name_list, matched_course_skill_id_list]], 
                          columns = ['listing_skill_id', 'listing_skill_name', 'course_skill_names', 'course_skill_ids'])

    # append to the final df
    final_df = pd.concat([final_df, int_df])

25659. windows      29422. cc    Matching with 1270 records.CPU times: user 1h 8min 37s, sys: 36.1 s, total: 1h 9min 13s
Wall time: 1h 9min 41s


In [None]:
# inspect the match results (one row per listing skill)
final_df

Unnamed: 0,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids
0,0,ecommerceretail qa,[],[]
0,1,lan,[lan],[7652]
0,2,peoplesoft,[],[]
0,3,bourne shell scripting,[],[]
0,4,groovy,[],[]
...,...,...,...,...
0,29418,nosqldatabase,[],[]
0,29419,programmingdevelopment,[program development],[14122]
0,29420,programming on win xp788.1,[],[]
0,29421,skills win32 programming expertcc++ programming,[],[]


In [None]:
# let's look at what was matched: rows whose list of matched course skills is non-empty
has_match = final_df['course_skill_names'].str.len() > 0
final_df[has_match]

Unnamed: 0,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids
0,1,lan,[lan],[7652]
0,6,selenium,[selenium],[70]
0,7,unix,[unix],[1720]
0,10,development,"[developmental, redevelopment, drug development]","[4405, 14793, 196]"
0,11,relational databases,"[relational database, relational database syst...","[936, 3409]"
...,...,...,...,...
0,29381,slack,[slack],[7456]
0,29398,reactive programming,"[reactive programming, interactive programming]","[4038, 15762]"
0,29414,digital media design,[digital design],[2099]
0,29419,programmingdevelopment,[program development],[14122]


In [None]:
# for aliases attribute, we'll make a list of all matches: the listing skill name
# (split on ':') followed by the matched course skill names
final_df['all_skills_matched'] = final_df['listing_skill_name'].apply(lambda x: x.split(':'))+final_df['course_skill_names']
# De-duplicate while keeping first-seen order. Going through set() made the alias order
# depend on string hashing, which differs between interpreter runs, so the exported
# aliases were not reproducible.
final_df['all_skills_matched'] = final_df['all_skills_matched'].apply(lambda x: list(dict.fromkeys(x)))
final_df['all_skills_matched_str'] = final_df['all_skills_matched'].apply(lambda x: ";".join(x))
# final_df[final_df.course_skill_names.str.len()>0]

In [None]:
# save the full match table to temp_dir (row index is not written)
final_df.to_csv(temp_dir+'matched_listing_courses_skills__TEMP.csv', index=False)

## Create Nodes table for SKILL

In [None]:
# Build the SKILL node table. The explicit copy() makes skill_node_df an independent frame,
# so adding the label column below no longer writes into a slice of final_df
# (which raised SettingWithCopyWarning).
skill_node_df = final_df[['listing_skill_id', 'listing_skill_name', 'all_skills_matched_str']].copy()
skill_node_df.columns = ['skill_id:ID', 'skill_name', 'aliases[]']
skill_node_df[':LABEL']="SKILL"
skill_node_df.to_csv(output_dir+'skill__node.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  skill_node_df[':LABEL']="SKILL"


## Create Relationships table for COURSE (course_id) - TEACHES - SKILL (listing_skill_id, not course_skill_id)

In [None]:
# read courses
# courses_df = pd.read_csv(output_dir+"course__node.csv")
# courses_df.head(5)

In [None]:
# keep only the listing skills that matched at least one course skill
matched_mask = final_df['course_skill_names'].str.len() > 0
matched_course_skills_df = final_df[matched_mask]
matched_course_skills_df.head(5)

Unnamed: 0,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids,all_skills_matched,all_skills_matched_str
0,1,lan,[lan],[7652],[lan],lan
0,6,selenium,[selenium],[70],[selenium],selenium
0,7,unix,[unix],[1720],[unix],unix
0,10,development,"[developmental, redevelopment, drug development]","[4405, 14793, 196]","[developmental, drug development, development,...",developmental;drug development;development;red...
0,11,relational databases,"[relational database, relational database syst...","[936, 3409]","[relational database systems, relational datab...",relational database systems;relational databas...


In [None]:
# one row per (listing skill, matched course skill id) pair
matched_course_skills_exploded_df = matched_course_skills_df.explode(column='course_skill_ids')
matched_course_skills_exploded_df.head(5)

Unnamed: 0,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids,all_skills_matched,all_skills_matched_str
0,1,lan,[lan],7652,[lan],lan
0,6,selenium,[selenium],70,[selenium],selenium
0,7,unix,[unix],1720,[unix],unix
0,10,development,"[developmental, redevelopment, drug development]",4405,"[developmental, drug development, development,...",developmental;drug development;development;red...
0,10,development,"[developmental, redevelopment, drug development]",14793,"[developmental, drug development, development,...",developmental;drug development;development;red...


In [None]:
# load the relation table between courses and course skills
relation_path = temp_dir + "courses_skills_relationship_TEMP.csv"
courses_skills_relation_df = pd.read_csv(relation_path)
courses_skills_relation_df.head(5)

Unnamed: 0,course_id,course_skill_id
0,0,0
1,330,0
2,1906,0
3,2424,0
4,2445,0


In [None]:
# Join the course -> course_skill relation with the exploded matches on the course skill id;
# this connects each course_id to a listing_skill_id. The outer join also keeps rows that
# have no counterpart on the other side (their missing columns are NaN).
course_listing_skill_relation_df = pd.merge(
    courses_skills_relation_df,
    matched_course_skills_exploded_df,
    how='outer',
    left_on='course_skill_id',
    right_on='course_skill_ids',
)

course_listing_skill_relation_df

Unnamed: 0,course_id,course_skill_id,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids,all_skills_matched,all_skills_matched_str
0,0,0,20261,dialogue,[dialogue],0,[dialogue],dialogue
1,330,0,20261,dialogue,[dialogue],0,[dialogue],dialogue
2,1906,0,20261,dialogue,[dialogue],0,[dialogue],dialogue
3,2424,0,20261,dialogue,[dialogue],0,[dialogue],dialogue
4,2445,0,20261,dialogue,[dialogue],0,[dialogue],dialogue
...,...,...,...,...,...,...,...,...
68181,3518,16756,,,,,,
68182,3519,16757,,,,,,
68183,3520,16758,,,,,,
68184,3520,16759,,,,,,


In [None]:
# Build the TEACHES relation table. Every step in the chain returns a new DataFrame, so
# nothing is assigned into, or dropped in place from, a slice of the merged frame
# (that is what raised the SettingWithCopyWarning messages).
# NOTE(review): a course can reach the same listing skill through several course skills,
# which yields repeated (:START_ID, :END_ID) rows — consider drop_duplicates() if
# duplicate relations are not wanted.
course_listing_skill_relation_df = (
    course_listing_skill_relation_df[['course_id', 'listing_skill_id']]
    .dropna()  # drop outer-join rows that lack a course_id or a listing_skill_id
    .rename(columns={'course_id': ':START_ID', 'listing_skill_id': ':END_ID'})
    .assign(**{':TYPE': 'TEACHES'})
)
course_listing_skill_relation_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  course_listing_skill_relation_df[':TYPE']='TEACHES'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,:START_ID,:END_ID,:TYPE
0,0,20261,TEACHES
1,330,20261,TEACHES
2,1906,20261,TEACHES
3,2424,20261,TEACHES
4,2445,20261,TEACHES


In [None]:
# write the TEACHES relation table to output_dir (row index is not written)
course_listing_skill_relation_df.to_csv(output_dir+"teaches__relation.csv", index=False)

In [None]:
# number of rows in the course skills table
len(courses_skills_df)

16761