In [12]:
import numpy as np
import pandas as pd
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

In [13]:
class PotentialTalents:
    """Rank candidate profiles by how well their job title matches a search phrase.

    Workflow:
        1. ``PotentialTalents(filepath)`` loads a CSV and cleans the job titles.
        2. ``genInitialRankings(searchTerm=...)`` scores every row with the
           TF-IDF weight of the phrase, nudged upward by the connection count.
        3. ``starEntry(starID=...)`` marks a row as a good match; its cleaned
           job title becomes an additional search phrase whose score is added
           to the running fit, and starred rows are pinned to the top.

    The CSV must provide the columns ``id``, ``job_title``, ``location`` and
    ``connection``.

    Attributes:
        rawData: the DataFrame exactly as read from the CSV.
        processedData: copy of ``rawData`` plus the intermediate text columns
            (``tokenized``, ``lower``, ``no_punc``, ``stopwords_removed``,
            ``jobTitleString``) and ``connectionsInt``.
        stop_words: set of tokens dropped from job titles.
        rankedData: current rankings (``None`` until ``genInitialRankings``).
        searchTerms: normalized phrases scored so far, initial query first.
    """

    # Connection count that doubles a row's TF-IDF score in the fit formula.
    CONNECTION_SCALE = 500.0
    # Number of top-ranked rows shown by the display helpers.
    DISPLAY_ROWS = 15

    def __init__(self, filepath):
        """Load the CSV at ``filepath`` and run the text pre-processing."""
        self.rawData = pd.read_csv(filepath)
        self.processedData = self.rawData.copy()
        self.stop_words = set(stopwords.words('english'))
        self.addDefaultStopWords()
        self.rankedData = None
        self.searchTerms = []
        self.textProcessing()

    def addStopWord(self, word):
        """Add ``word`` to the stop-word set.

        Stop words are applied inside ``textProcessing``; call it again after
        adding words if the data has already been processed.
        """
        self.stop_words.add(word)

    def addDefaultStopWords(self):
        """Register the extra stop words this class always adds ('area', 'greater')."""
        self.addStopWord('area')
        self.addStopWord('greater')

    @staticmethod
    def _parseConnections(connections):
        """Convert a connection column such as ``'500+ '`` into integers.

        Non-digit characters are stripped with an explicit regex replace
        (pandas >= 2.0 treats the pattern literally unless ``regex=True``).
        Values without any digit become 0.
        """
        if pd.api.types.is_numeric_dtype(connections):
            return connections.fillna(0).astype(int)
        digits = connections.astype(str).str.replace(r'\D', '', regex=True)
        return pd.to_numeric(digits, errors='coerce').fillna(0).astype(int)

    @staticmethod
    def _normalizeTerm(term):
        """Lowercase ``term`` and collapse whitespace so it can match a TF-IDF feature name."""
        return ' '.join(str(term).lower().split())

    @staticmethod
    def _show(frame):
        """Render ``frame`` with IPython's ``display`` when available, else ``print``."""
        try:
            render = display  # injected into builtins by IPython/Jupyter
        except NameError:
            render = print
        render(frame)

    def _requireRankings(self):
        """Raise a clear error when rankings have not been generated yet."""
        if self.rankedData is None:
            raise RuntimeError("No rankings yet: call genInitialRankings() first.")

    def textProcessing(self):
        """Derive the cleaned text columns and ``connectionsInt`` on ``processedData``."""
        frame = self.processedData

        # Only the job title feeds the text pipeline (merging in `location`
        # is currently disabled).
        frame['mergeColumns'] = frame['job_title'].astype(str)

        # Tokenize, then lowercase every token.
        frame['tokenized'] = frame['mergeColumns'].apply(word_tokenize)
        frame['lower'] = frame['tokenized'].apply(
            lambda tokens: [token.lower() for token in tokens])

        # Drop tokens made up solely of punctuation characters. A per-character
        # test is required: `token in string.punctuation` is a substring check
        # and lets tokens such as '``' or '...' through.
        punctuation = set(string.punctuation)
        frame['no_punc'] = frame['lower'].apply(
            lambda tokens: [token for token in tokens
                            if not all(ch in punctuation for ch in token)])

        # Remove stop words and rebuild a plain string; hyphens become spaces
        # so hyphenated words are split into separate terms.
        stopWords = self.stop_words
        frame['stopwords_removed'] = frame['no_punc'].apply(
            lambda tokens: [token for token in tokens if token not in stopWords])
        frame['jobTitleString'] = [
            ' '.join(map(str, tokens)).replace('-', ' ')
            for tokens in frame['stopwords_removed']
        ]

        # Convert connections (e.g. '500+ ') to integers.
        frame['connectionsInt'] = self._parseConnections(frame['connection'])

    def getTFIDF(self, data=None, searchTermSize=1):
        """Return a DataFrame of TF-IDF weights, one column per n-gram.

        Args:
            data: iterable of documents; defaults to the cleaned job titles.
            searchTermSize: n-gram length (only n-grams of exactly this size
                are produced).
        """
        if data is None:
            data = self.processedData['jobTitleString']
        vectorizer = TfidfVectorizer(max_df=0.9, min_df=1, max_features=1000, use_idf=True,
                                     ngram_range=(searchTermSize, searchTermSize))
        tfidf = vectorizer.fit_transform(data)
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on old releases.
        if hasattr(vectorizer, 'get_feature_names_out'):
            tfidfNames = vectorizer.get_feature_names_out()
        else:
            tfidfNames = vectorizer.get_feature_names()
        return pd.DataFrame(tfidf.toarray(), columns=list(tfidfNames))

    def _computeFit(self, searchTerm, strict=True):
        """Score every row of ``processedData`` against a normalized phrase.

        The fit is the phrase's TF-IDF weight plus a bonus proportional to the
        connection count, scaled so the best row scores 1.0 (left at 0 when
        nothing matches). The result is a NumPy array aligned positionally
        with ``processedData``.

        Args:
            searchTerm: phrase already passed through ``_normalizeTerm``.
            strict: when True a phrase absent from the TF-IDF vocabulary raises
                ``KeyError``; when False it scores 0 everywhere.
        """
        data = self.processedData
        termSize = len(searchTerm.split(' ')) or 1
        tfidf = self.getTFIDF(data=data['jobTitleString'], searchTermSize=termSize)

        # Look the phrase up in the TF-IDF frame itself rather than after a
        # concat, so vocabulary words like 'id' cannot shadow data columns.
        if searchTerm in tfidf.columns:
            score = tfidf[searchTerm].to_numpy(dtype=float)
        elif strict:
            raise KeyError(
                "search term %r is not in the TF-IDF vocabulary of the job titles" % searchTerm)
        else:
            score = np.zeros(len(data), dtype=float)

        connections = data['connectionsInt'].to_numpy(dtype=float)
        fit = score + score * connections / self.CONNECTION_SCALE
        best = fit.max() if len(fit) else 0.0
        if best > 0:  # avoid 0/0 -> NaN when nothing matches
            fit = fit / best
        return fit

    def genInitialRankings(self, searchTerm='Business', passThrough=False):
        """Rank all rows against ``searchTerm`` and reset any starred state.

        Args:
            searchTerm: phrase to search for; matched case-insensitively.
            passThrough: when True the top rows are not displayed.

        Raises:
            KeyError: if the phrase does not occur in the job-title vocabulary.
        """
        term = self._normalizeTerm(searchTerm)
        fit = self._computeFit(term, strict=True)

        # Build the result locally so a failure above leaves state untouched.
        ranked = self.processedData[['id', 'job_title', 'location', 'connection']].copy()
        ranked['fit'] = fit
        ranked['starred'] = 0.0
        ranked = ranked.sort_values(by='fit', ascending=False).reset_index(drop=True)

        self.rankedData = ranked
        self.searchTerms = [term]
        if not passThrough:
            self._show(self.rankedData.set_index('id').head(self.DISPLAY_ROWS))

    def starEntry(self, starID=None):
        """Star the row with id ``starID``, re-rank, and display the top rows.

        The starred row's cleaned job title is scored as an extra phrase and
        added to the current fit; every starred row is then raised to the
        maximum fit so it sorts first. With ``starID=None`` the current
        rankings are simply displayed.

        Raises:
            RuntimeError: if ``genInitialRankings`` has not been called.
            IndexError: if no row has id ``starID``.
        """
        self._requireRankings()
        if starID is not None:
            titles = self.processedData.loc[self.processedData['id'] == starID, 'jobTitleString']
            if titles.empty:
                raise IndexError("no entry with id %r" % (starID,))
            newSearchTerm = self._normalizeTerm(titles.iloc[0])

            # Score only the new phrase: rankedData already carries the
            # accumulated fit of every earlier phrase. A title that is not a
            # vocabulary n-gram contributes 0 instead of aborting the star.
            newFits = pd.DataFrame({
                'id': self.processedData['id'].to_numpy(),
                'fit': self._computeFit(newSearchTerm, strict=False),
            })
            merged = pd.merge(self.rankedData, newFits, on='id', how='left')
            merged['fit'] = merged['fit_x'] + merged['fit_y']
            merged = merged.drop(['fit_x', 'fit_y'], axis=1)

            merged.loc[merged['id'] == starID, 'starred'] = 1.0

            # Adjust rankings so starred entries are at top.
            isStarred = merged['starred'] == 1.0
            merged.loc[isStarred, 'fit'] = merged['fit'].max()
            topFit = merged['fit'].max()
            if topFit > 0:
                merged['fit'] = merged['fit'] / topFit
            merged = merged.sort_values(by=['fit', 'starred'], ascending=False)
            merged = merged.reset_index(drop=True)

            self.rankedData = merged
            self.searchTerms.append(newSearchTerm)
        self._show(self.rankedData.set_index('id').head(self.DISPLAY_ROWS))

    def getRankings(self):
        """Return a copy of the full rankings indexed by ``id``.

        Raises:
            RuntimeError: if ``genInitialRankings`` has not been called.
        """
        self._requireRankings()
        return self.rankedData.set_index('id').copy()

## USAGE INSTRUCTIONS
##### 1) Load and preprocess the data --> `foo = PotentialTalents('<filepath>')`
##### 2) Generate the initial rankings --> `foo.genInitialRankings(searchTerm='<desired search term>')` (use lowercase words, e.g. `'aspiring human resources'`)
##### 3) Star entries one at a time --> `foo.starEntry(starID=<id of entry to star>)` (pass the numeric id without quotes, e.g. `starID=97`)
##### Note: Only the top 15 ranked entries are displayed; to explore the rest, use `bar = foo.getRankings()` to extract the full rankings DataFrame.

In [14]:
# Build the ranker from the candidate CSV, then show the first ranking for the query.
test = PotentialTalents(
    filepath='potential-talents - Aspiring human resources - seeking human resources.csv'
)
test.genInitialRankings(searchTerm='aspiring human resources')



Unnamed: 0_level_0,job_title,location,connection,fit,starred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,1.0,0.0
33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.952715,0.0
21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.952715,0.0
17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.952715,0.0
46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.952715,0.0
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.952715,0.0
58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.952715,0.0
6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.696114,0.0
24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.696114,0.0
36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.696114,0.0


In [15]:
# Star candidate 97: its cleaned job title is scored as an extra search phrase,
# the entry is pinned to the top, and the top-ranked rows are displayed.
test.starEntry(starID=97)

Unnamed: 0_level_0,job_title,location,connection,starred,fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,1.0,1.0
33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.0,0.952715
21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.0,0.952715
17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.0,0.952715
46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.0,0.952715
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.0,0.952715
58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.0,0.952715
82,Aspiring Human Resources Professional | An ene...,"Austin, Texas Area",174,0.0,0.34815
6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.0,0.348057
24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.0,0.348057


In [16]:
# Star a second candidate (id 6); its title's score is added to the fit
# accumulated so far before the rows are re-sorted and displayed.
test.starEntry(starID=6)

Unnamed: 0_level_0,job_title,location,connection,starred,fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,1.0,1.0
6,Aspiring Human Resources Specialist,Greater New York City Area,1,1.0,1.0
24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.0,1.0
36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.0,1.0
60,Aspiring Human Resources Specialist,Greater New York City Area,1,0.0,1.0
49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.0,1.0
33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.0,0.706732
21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.0,0.706732
17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.0,0.706732
46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.0,0.706732
