In [1]:
import pandas as pd
import numpy as np

from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

In [3]:
# Load the jobs table and collect the distinct values of its search_term column
jobs = pd.read_csv( 'unique_jobs.csv', encoding = 'utf-8' )
search_terms = jobs['search_term'].unique()

### Combine job descriptions with skills

In [4]:
def bind_desc_skill( job ):
    """Merge the text fields of one job row into a single-line string.

    Parameters
    ----------
    job : iterable of str
        The fields to merge, in order (the caller passes a row holding the
        job description followed by the skills). Entries that are missing
        (None or NaN) are skipped.

    Returns
    -------
    str
        The present fields joined by single spaces, with every newline
        replaced by a space. An empty string if every field is missing.
    """
    # pd.isnull recognises every missing-value flavour (None, any float NaN);
    # an identity test against np.nan only matches that one singleton object.
    # Iterating the row also avoids positional [] indexing on a labelled row.
    present = [ field for field in job if not pd.isnull( field ) ]
    return ' '.join( present ).replace( '\n', ' ' )

In [5]:
# Row-wise (axis = 1): merge each job's description and skills into one text column
jobs['desc_skill'] = (
    jobs[ ['job_description', 'skills'] ]
    .apply( bind_desc_skill, axis = 1 )
)

### Separate jobs by search term

In [7]:
# Map each distinct search term to the subset of rows carrying that term
jobs_by_term = {}
for term in jobs['search_term'].unique():
    jobs_by_term[term] = jobs.loc[ jobs['search_term'] == term ]

# Top N-grams for job descriptions

In [2]:
class unigram( object ):
    """Cleaned token list and token-frequency table for one document.

    The document is lower-cased, split into runs of word characters
    (which drops punctuation), stripped of NLTK's English stopwords plus
    'nbsp' and 'amp', and optionally Porter-stemmed.

    Parameters
    ----------
    doc : str
        Raw text of the document.
    stem : bool, optional
        When true (the default), every remaining token is reduced to its
        Porter stem; otherwise tokens are kept as they are.

    Attributes
    ----------
    tokens : list of str
        The cleaned tokens, in document order.
    freq : nltk.FreqDist
        Occurrence count of each token in `tokens`.
    total_n : int
        Number of distinct tokens (the size of `freq`), not the total
        token count.
    """
    def __init__(self, doc, stem = True ):
        # Lower-case, then keep only runs of word characters (drops punctuation)
        tokenizer = RegexpTokenizer( r'\w+' )
        tokens = tokenizer.tokenize( doc.lower() )
        # Stopword removal; a set makes each membership test O(1) instead of
        # scanning the whole stopword list for every token.
        # 'nbsp' / 'amp' are presumably leftovers of HTML entities - TODO confirm
        custom_stopwords = set( stopwords.words( 'english' ) + ['nbsp', 'amp'] )
        tokens = [ token for token in tokens if token not in custom_stopwords ]
        # Optional stemming
        if stem:
            stemmer = PorterStemmer()
            tokens = [ stemmer.stem( token ) for token in tokens ]
        # ====================================
        self.tokens  = tokens
        self.freq    = nltk.FreqDist( tokens )
        self.total_n = len( self.freq )

    def top_n( self, n ):
        """Return the `n` most frequent tokens as (token, count) pairs, most frequent first."""
        return self.freq.most_common( n )

In [51]:
def bind_string( string_list ):
    """Concatenate strings into one, putting a single space before each.

    Parameters
    ----------
    string_list : iterable of str
        The strings to concatenate, in order.

    Returns
    -------
    str
        Every input string prefixed with one space and concatenated, so a
        non-empty result starts with a space; '' for an empty input.
    """
    # One join instead of repeated += (which re-copies the growing string).
    # The u'' literal replaces the Python-2-only unicode('') call, so this
    # runs on both Python 2 and Python 3 and still yields a text string.
    return u''.join( u' ' + s for s in string_list )

# Map each search term to one string holding the desc_skill text of all its jobs
desc_by_term = {}
for term in jobs['search_term'].unique():
    desc_by_term[term] = bind_string( jobs_by_term[term]['desc_skill'] )

In [82]:
# How many top entries to report, and which search term to analyse
top_n = 10
term = 'Data+Scientist'

# Clean and stem the combined text of every job found under this term
uni_gram_stem = unigram( desc_by_term[ term ], stem = True )

# Frequency tables of adjacent token pairs (n = 2) and triples (n = 3)
bi_gram = nltk.FreqDist( nltk.ngrams( uni_gram_stem.tokens, 2 ) )
tri_gram = nltk.FreqDist( nltk.ngrams( uni_gram_stem.tokens, 3 ) )

In [83]:
# Top uni-/bi-/tri-grams side by side, one row per rank; each cell is an (n-gram, count) pair
pd.DataFrame(
    {
        'Uni': uni_gram_stem.top_n( top_n ),
        'Bi' : bi_gram.most_common( top_n ),
        'Tri': tri_gram.most_common( top_n ),
    },
    columns = ['Uni', 'Bi', 'Tri'],
)

Unnamed: 0,Uni,Bi,Tri
0,"(data, 5057)","((big, data), 454)","((equal, opportun, employ), 93)"
1,"(experi, 2823)","((year, experi), 359)","((new, york, citi), 89)"
2,"(work, 2213)","((data, scienc), 319)","((5, year, experi), 85)"
3,"(team, 1938)","((new, york), 291)","((without, regard, race), 78)"
4,"(busi, 1912)","((comput, scienc), 289)","((degre, comput, scienc), 78)"
5,"(develop, 1821)","((machin, learn), 275)","((race, color, religion), 75)"
6,"(manag, 1751)","((commun, skill), 251)","((regard, race, color), 74)"
7,"(analyt, 1639)","((e, g), 201)","((2, year, experi), 70)"
8,"(skill, 1208)","((data, scientist), 194)","((sexual, orient, gender), 69)"
9,"(product, 1146)","((experi, work), 189)","((orient, gender, ident), 69)"
