-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Recompute job keywords and scores on skills update #62
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import assert from 'assert'; | ||
import { forEachAsync } from 'foreachasync'; | ||
|
||
import { Skills, Jobs } from '../schema'; | ||
|
||
/**
 * Maintains the collection-wide list of skills (the `skills` array of the
 * Skills document) and triggers recomputation of job keyword counts and
 * scores when skills are added.
 */
class AllSkills { | ||
/**
 * @param {Object} jobAnalyzer analyzer whose computeJobKeywordCount() and
 * computeJobScores() methods are invoked by update()
 */
constructor(jobAnalyzer) { | ||
this.jobAnalyzer = jobAnalyzer; | ||
} | ||
|
||
/**
 * Seeds the Skills collection with an empty document when it holds none.
 * Asserts that the collection never contains more than one document.
 */
static async setup() { | ||
// Make sure there is always one and only one entry | ||
const count = await Skills.countDocuments(); | ||
assert(count === 0 || count === 1); | ||
|
||
if (count === 0) { | ||
await Skills.create({}); | ||
} | ||
} | ||
|
||
/* Gets and returns the skills array holding the collective skills of all the users */ | ||
// NOTE(review): findOne resolves to null when no document matches, in which
// case doc.skills throws; presumably setup() must have run first - confirm.
static async getAll() { | ||
const doc = await Skills.findOne({}); | ||
return doc.skills; | ||
} | ||
|
||
/** | ||
* Adds the given skills to all skills, then recomputes job keyword counts | ||
* and scores for the skills that were actually new | ||
* Returns early, without touching any job, when nothing new was added | ||
* @param {Array<String>} skills new skills to add to all skills | ||
*/ | ||
async update(skills) { | ||
// No `new` option is passed, so this resolves with the document as it was
// before the update; orFail() rejects when no Skills document matched.
const oldSkillsData = await Skills.findOneAndUpdate({}, { | ||
$addToSet: { | ||
skills, | ||
}, | ||
}).orFail(); | ||
|
||
const updatedSkills = await AllSkills.getAll(); | ||
// Find newly added skills: everything past the previous length | ||
// NOTE(review): assumes $addToSet appended the new values at the end and
// that no other writer changed the document between the two queries - confirm.
const newSkills = updatedSkills.slice(oldSkillsData.skills.length); | ||
|
||
if (newSkills.length === 0) { | ||
return; | ||
} | ||
|
||
// Update keyword counts of each job | ||
const jobs = await Jobs.find({}); | ||
await forEachAsync(jobs, async (_, jobIdx) => { | ||
this.jobAnalyzer.computeJobKeywordCount(jobs[jobIdx], newSkills); | ||
await jobs[jobIdx].save(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can make this synchronous and save after the for loop (may have better performance, especially when jobs is much larger) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You'd still need to loop through all jobs and save each, so don't see the difference? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I spent some time googling about how There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Doing
shouldn't have any performance difference compared to
|
||
}); | ||
|
||
// Computes tf idf for newly added skills, starting at the index of the first new skill | ||
await this.jobAnalyzer.computeJobScores(oldSkillsData.skills.length); | ||
} | ||
} | ||
|
||
export default AllSkills; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
import AllSkills from './AllSkills'; | ||
|
||
export default AllSkills; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,8 @@ | ||
import Logger from 'js-logger'; | ||
import { forEachAsync } from 'foreachasync'; | ||
|
||
import User from '../user'; | ||
import Response from '../types'; | ||
import AllSkills from '../all_skills'; | ||
import { Jobs, Users } from '../schema'; | ||
import { JOBS_PER_SEND } from '../constants'; | ||
|
||
|
@@ -18,37 +18,56 @@ class JobAnalyzer { | |
}); | ||
} | ||
|
||
async computeJobScores() { | ||
/** | ||
* Counts how many times each of the given keywords appears in the job's | ||
* lower-cased description and appends a { name, count } entry per keyword | ||
* to job.keywords, modifying the job in-place | ||
* @param {Job} job | ||
* @param {Array<String>} keywords | ||
*/ | ||
computeJobKeywordCount(job, keywords) { | ||
// Add the number of occurrences of each keyword to the job's keywords | ||
const description = job.description.toLowerCase(); | ||
keywords.forEach((keyword) => { | ||
wchang22 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// TODO: matches "java" with "javascript" from description | ||
// NOTE: if you map with spaces around it, problems such as "java," arise | ||
// NOTE(review): keyword is used as the regex source as-is and is not | ||
// lower-cased here, while description is; keywords with regex | ||
// metacharacters (e.g. "c++") or upper-case letters may throw or never | ||
// match - confirm whether callers normalise skills first | ||
const re = new RegExp(keyword, 'g'); | ||
job.keywords.push({ | ||
name: keyword, | ||
// match() yields null when there is no occurrence, hence the [] fallback | ||
count: (description.match(re) || []).length, | ||
}); | ||
}); | ||
} | ||
|
||
/** | ||
* Computes tf-idf scores for all jobs using all user skills | ||
* Optionally specify a range of skills to use | ||
* | ||
* @param {Number} skillsStart Index of first skill to use | ||
* @param {Number} skillsEnd One past the index of the last skill to use | ||
*/ | ||
async computeJobScores(skillsStart, skillsEnd) { | ||
this.logger.info('Starting to compute job scores...'); | ||
|
||
const jobs = await Jobs.find({}); | ||
const skills = await User._getAllSkills(); | ||
const offset = skillsStart || 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this does There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If skillsStart is not passed in, this evaluates to undefined || 0 = 0 instead of undefined. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. assert statement? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will it ever be undefined? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The idea is that you can call this with or without an argument. i.e. computeJobScores() // computes for all skills
computeJobScores(5, 12) // computes skills 5 to 11 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see what you are trying to do now. Could we set the default of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is no such thing as optional arguments in JS.
|
||
const allSkills = await AllSkills.getAll(); | ||
const newKeywords = offset > 0 ? allSkills.slice(offset, skillsEnd) : allSkills; | ||
|
||
await forEachAsync(skills, async (skill, skillIdx) => { | ||
const keyword = skill.replace(/[-/\\^$*+?.()|[\]{}]/g, '\\$&'); | ||
const docCount = jobs.reduce((sum, posting) => sum | ||
+ Number(posting.keywords[skillIdx].count > 0), 0); | ||
await forEachAsync(newKeywords, async (_, newKeywordIdxBase) => { | ||
const allKeywordIdx = newKeywordIdxBase + offset; | ||
// Count the number of jobs with the given skill | ||
const docCount = jobs.reduce((sum, job) => sum | ||
+ Number(job.keywords[allKeywordIdx].count > 0), 0); | ||
|
||
const jobsLen = jobs.length; | ||
// calculate tf_idf each doc and save it | ||
await forEachAsync(jobs, async (job, i) => { | ||
const keywordOccurrences = job.keywords[skillIdx].count; // TODO: what if new keyword? | ||
await forEachAsync(jobs, async (job, jobIdx) => { | ||
const keywordOccurrences = job.keywords[allKeywordIdx].count; | ||
const wordCount = job.description.split(' ').length; | ||
const tf = keywordOccurrences / wordCount; | ||
const idf = docCount !== 0 ? Math.log(jobsLen / docCount) : 0; | ||
const tfidf = tf * idf; | ||
const idf = docCount !== 0 ? Math.log(jobs.length / docCount) : 0; | ||
|
||
// add name and tf_idf score to each job's keywords the first time | ||
// replace tf_idf score for a keyword for each job | ||
const keywordIdx = job.keywords.findIndex(elem => elem.name === keyword); | ||
if (keywordIdx === -1) { | ||
job.keywords.push({ | ||
name: keyword, | ||
tfidf, | ||
}); | ||
} else { | ||
jobs[i].keywords[keywordIdx].tfidf = tfidf; | ||
} | ||
jobs[jobIdx].keywords[allKeywordIdx].tfidf = tf * idf; | ||
|
||
await job.save(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can save all jobs outside the async block There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we could prob do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tried, got |
||
}); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
import Jobs from './job_schema'; | ||
import Users from './user_schema'; | ||
import Skills from './skills_schema'; | ||
|
||
export { Jobs, Users }; | ||
export { Jobs, Users, Skills }; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
import mongoose from 'mongoose'; | ||
|
||
|
||
// Schema of a Skills document: a flat array of skill strings.
// versionKey: false turns off Mongoose's version key on these documents.
const skillsSchema = new mongoose.Schema({ | ||
skills: [String], | ||
}, | ||
{ | ||
versionKey: false, | ||
}); | ||
|
||
// Model registered under the name 'Skills'
const Skills = mongoose.model('Skills', skillsSchema); | ||
|
||
export default Skills; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
as a side note: if new jobs are added while new skills are added, this is still broken lol
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Very
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Create an issue, author?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#71