# Notebook 3: Matching between Skills Extracted from Listings and Skills Extracted from Courses

Using Jaccard Similarity Measure with 3-gram tokenizer

#### This notebook produces the following data into the _output_datasets_ folder:
```
(SKILL) NODE						skill__node.csv
skill_id:ID
skill_name
aliases[]
:LABEL = "SKILL"

[TEACHES] RELATION					teaches__relation.csv
:START_ID = course_id
:END_ID = skill_id
:TYPE = "TEACHES"
```

## Imports

In [1]:
import pandas as pd
import py_stringmatching as sm
import math

## Read Data

In [2]:
mydrive = ""   # this is when we run locally

# Google Colab:
# from google.colab import drive
# drive.mount('/content/drive')
# mydrive = "/content/drive/MyDrive/DSE 203 — etl/DSE203_Project/"  # this is when we run on COLAB

input_dir = mydrive+"input_datasets/"
output_dir = mydrive+"output_datasets/"
temp_dir = mydrive+"temp_datasets/"

In [3]:
# read dataset
listing_skills_df = pd.read_csv(temp_dir+'listing_skills_TEMP.csv')
courses_skills_df = pd.read_csv(temp_dir+'courses_skills_TEMP.csv')

listing_skills_df
# courses_skills_df

Unnamed: 0,listing_skill_id,listing_skill_name
0,0,salesforce
1,1,salesforce1 platform and lightning components
2,2,mobilesdk
3,3,apex
4,4,lightning connect
...,...,...
164,164,pdf resume
165,165,css
166,166,mvcdesign
167,167,essential responsibilitiesfunctions research


In [4]:
courses_skills_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233 entries, 0 to 232
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   course_skill_id    233 non-null    int64 
 1   course_skill_name  233 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.8+ KB


In [5]:
listing_skills_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   listing_skill_id    169 non-null    int64 
 1   listing_skill_name  169 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.8+ KB


## Matching: Using Jaccard Similarity Measure with 3-gram tokenizer

In [6]:
jaccard = sm.Jaccard()
qval=3
qgram_tokenizer = sm.QgramTokenizer(qval)

def calc_boundaries_jac_qg(s1, t):
    '''
    Calculate threshold boundaries for q-gram tokenizer and Jaccard similarity measure.
    Returns: list of [lower_boundary, upper_boundary] in amount of tokens (words)
    '''
    
    s1_tokens_length = len(qgram_tokenizer.tokenize(s1))
    min_l = t*s1_tokens_length
    max_l = (1/t)*s1_tokens_length
    return [math.floor(min_l), math.ceil(max_l)]

def calc_jaccard_score_qg(s1, s2):
    '''
    Calculate Jaccard similarity score between 2 strings s1 and s2 with white q-gram tokenizer
    '''
    s1_tokens = qgram_tokenizer.tokenize(s1.lower())  # lowering just in case we forgot to do it before
    s2_tokens = qgram_tokenizer.tokenize(s2.lower())  # lowering just in case we forgot to do it before
    
    score = jaccard.get_sim_score(s1_tokens, s2_tokens)
    return score

In [7]:
# calculate number of tokens for each string in orgNames
courses_skills_df['n_tokens'] = courses_skills_df.course_skill_name.apply(lambda x: len(qgram_tokenizer.tokenize(x)))
courses_skills_df.head(5)

Unnamed: 0,course_skill_id,course_skill_name,n_tokens
0,0,trading,9
1,1,financial analysis business finance,37
2,2,financial modeling,20
3,3,data analysis,15
4,4,python,8


In [8]:
listing_skill_id_list = listing_skills_df.listing_skill_id.to_list()
listing_skill_name_list = listing_skills_df.listing_skill_name.to_list()

In [9]:
%%time
t = 0.4
# this df will hold the final results patent_string | list_of_matches
final_df = pd.DataFrame(columns = ['listing_skill_id', 'listing_skill_name', 'course_skill_names', 'course_skill_ids'])

# for current_string in patent_strings_list:
for listing_skill_id, listing_skill_name in zip(listing_skill_id_list, listing_skill_name_list):

    # calculate threshold for count filter:
    boundaries = calc_boundaries_jac_qg(listing_skill_name, t)

    # Pruning: apply filtering based on the boundaries
    df_to_process = courses_skills_df[(courses_skills_df.n_tokens>=boundaries[0]) & (courses_skills_df.n_tokens<=boundaries[1])].copy()
    
    # just print to see the progress
    print(f'\r{listing_skill_id}. {listing_skill_name}    Matching with {len(df_to_process)} records.', end="")
    
    # calculate similarity measure score for each string in filtered df
    df_to_process['sim_score'] = df_to_process.course_skill_name.apply(lambda x: calc_jaccard_score_qg(x, listing_skill_name))

    # this will hold the list of matches for given string
    temp_df = df_to_process[df_to_process.sim_score>t].sort_values('sim_score', ascending=False)
    matched_course_skill_name_list = temp_df.course_skill_name.to_list()
    matched_course_skill_id_list = temp_df.course_skill_id.to_list()
    # course_ids_list = temp_df.course_id.to_list()

    # course_names_list = temp_df.course_name.to_list()
    
    # convert to df with row patent_string | list_of_matches
    int_df = pd.DataFrame(data = [[str(listing_skill_id), listing_skill_name, matched_course_skill_name_list, matched_course_skill_id_list]], 
                          columns = ['listing_skill_id', 'listing_skill_name', 'course_skill_names', 'course_skill_ids'])

    # append to the final df
    final_df = pd.concat([final_df, int_df])

168. html5    Matching with 134 records.CPU times: total: 797 msth 110 records.ds.ds.
Wall time: 789 ms


In [10]:
final_df

Unnamed: 0,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids
0,0,salesforce,[],[]
0,1,salesforce1 platform and lightning components,[],[]
0,2,mobilesdk,[],[]
0,3,apex,[],[]
0,4,lightning connect,[],[]
...,...,...,...,...
0,164,pdf resume,[],[]
0,165,css,[],[]
0,166,mvcdesign,[],[]
0,167,essential responsibilitiesfunctions research,[],[]


In [11]:
# lets look what was matched:
final_df[final_df.course_skill_names.str.len()>0]

Unnamed: 0,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids
0,19,web development,"[product development, android development]","[152, 198]"
0,20,javascript,"[javascripts, javascript syntax]","[118, 68]"
0,21,python,[python],[4]
0,22,analyst,[analysis],[9]
0,24,java,[java],[39]
0,28,testing,"[testng, ab testing]","[92, 110]"
0,33,analytics,[analytics],[168]
0,36,documentation,[documents],[145]
0,40,ecommerceretail qa,[ecommerce],[116]
0,50,selenium,"[selenium, basic selenium, selenium java]","[74, 72, 114]"


In [12]:
# for aliases attribute, we'll make a list of all matches
final_df['all_skills_matched'] = final_df['listing_skill_name'].apply(lambda x: x.split(':'))+final_df['course_skill_names']
final_df.all_skills_matched = final_df.all_skills_matched.apply((set)).apply(list)
final_df['all_skills_matched_str'] = final_df.all_skills_matched.apply(lambda x: ";".join(x))
final_df[final_df.course_skill_names.str.len()>0]

Unnamed: 0,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids,all_skills_matched,all_skills_matched_str
0,19,web development,"[product development, android development]","[152, 198]","[android development, product development, web...",android development;product development;web de...
0,20,javascript,"[javascripts, javascript syntax]","[118, 68]","[javascript syntax, javascript, javascripts]",javascript syntax;javascript;javascripts
0,21,python,[python],[4],[python],python
0,22,analyst,[analysis],[9],"[analysis, analyst]",analysis;analyst
0,24,java,[java],[39],[java],java
0,28,testing,"[testng, ab testing]","[92, 110]","[ab testing, testng, testing]",ab testing;testng;testing
0,33,analytics,[analytics],[168],[analytics],analytics
0,36,documentation,[documents],[145],"[documents, documentation]",documents;documentation
0,40,ecommerceretail qa,[ecommerce],[116],"[ecommerce, ecommerceretail qa]",ecommerce;ecommerceretail qa
0,50,selenium,"[selenium, basic selenium, selenium java]","[74, 72, 114]","[basic selenium, selenium, selenium java]",basic selenium;selenium;selenium java


In [13]:
# final_df.to_csv(temp_dir+'matched_listing_courses_skills__TEMP.csv', index=False)

## Create Nodes table for SKILL

In [14]:
skill_node_df = final_df[['listing_skill_id', 'listing_skill_name', 'all_skills_matched_str']]
skill_node_df.columns = ['skill_id:ID', 'skill_name', 'aliases[]']
skill_node_df[':LABEL']="SKILL"
skill_node_df.to_csv(output_dir+'skill__node.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  skill_node_df[':LABEL']="SKILL"


## Create Relationships table for COURSE (course_id) - TEACHES - SKILL (listing_skill_id, not course_skill_id)

In [15]:
# read courses
# courses_df = pd.read_csv(output_dir+"course__node.csv")
# courses_df.head(5)

In [16]:
# read matched listing_skills with course_skills 
matched_course_skills_df = final_df[final_df.course_skill_names.str.len()>0]
matched_course_skills_df.head(5)

Unnamed: 0,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids,all_skills_matched,all_skills_matched_str
0,19,web development,"[product development, android development]","[152, 198]","[android development, product development, web...",android development;product development;web de...
0,20,javascript,"[javascripts, javascript syntax]","[118, 68]","[javascript syntax, javascript, javascripts]",javascript syntax;javascript;javascripts
0,21,python,[python],[4],[python],python
0,22,analyst,[analysis],[9],"[analysis, analyst]",analysis;analyst
0,24,java,[java],[39],[java],java


In [17]:
matched_course_skills_exploded_df = matched_course_skills_df.explode('course_skill_ids')
matched_course_skills_exploded_df.head(5)

Unnamed: 0,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids,all_skills_matched,all_skills_matched_str
0,19,web development,"[product development, android development]",152,"[android development, product development, web...",android development;product development;web de...
0,19,web development,"[product development, android development]",198,"[android development, product development, web...",android development;product development;web de...
0,20,javascript,"[javascripts, javascript syntax]",118,"[javascript syntax, javascript, javascripts]",javascript syntax;javascript;javascripts
0,20,javascript,"[javascripts, javascript syntax]",68,"[javascript syntax, javascript, javascripts]",javascript syntax;javascript;javascripts
0,21,python,[python],4,[python],python


In [18]:
# read relation between courses and course_skills
courses_skills_relation_df = pd.read_csv(temp_dir+"courses_skills_relationship_TEMP.csv")
courses_skills_relation_df.head(5)

Unnamed: 0,course_id,course_skill_id
0,0,0
1,0,1
2,0,2
3,0,3
4,6,3


In [19]:
# we need to merge matched_course_skills_exploded_df and courses_skills_relation_df to get connection between course_id and listing_skill_id
course_listing_skill_relation_df = courses_skills_relation_df.merge(matched_course_skills_exploded_df, 
                                                                    how='outer', 
                                                                    left_on='course_skill_id', 
                                                                    right_on='course_skill_ids')

# final = JobListing.merge(skills, how='outer', left_on='listing_skills', right_on='job_listing_skill_name')

course_listing_skill_relation_df

Unnamed: 0,course_id,course_skill_id,listing_skill_id,listing_skill_name,course_skill_names,course_skill_ids,all_skills_matched,all_skills_matched_str
0,0,0,,,,,,
1,0,1,,,,,,
2,0,2,,,,,,
3,0,3,,,,,,
4,6,3,,,,,,
...,...,...,...,...,...,...,...,...
275,19,228,,,,,,
276,19,229,,,,,,
277,19,230,,,,,,
278,19,231,,,,,,


In [20]:
course_listing_skill_relation_df = course_listing_skill_relation_df[['course_id', 'listing_skill_id']]
course_listing_skill_relation_df.columns = [':START_ID', ':END_ID']
course_listing_skill_relation_df[':TYPE']='TEACHES'
course_listing_skill_relation_df.dropna(inplace=True)
course_listing_skill_relation_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  course_listing_skill_relation_df[':TYPE']='TEACHES'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  course_listing_skill_relation_df.dropna(inplace=True)


Unnamed: 0,:START_ID,:END_ID,:TYPE
6,0,21,TEACHES
7,3,21,TEACHES
8,4,21,TEACHES
9,0,111,TEACHES
10,18,111,TEACHES


In [21]:
course_listing_skill_relation_df.to_csv(output_dir+"teaches__relation.csv", index=False)