In [1]:
import urllib2
from bs4 import BeautifulSoup
import itertools
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import time
import nltk
from nltk.stem import WordNetLemmatizer

import gensim
import pickle
from gensim import corpora, models, similarities

import psycopg2
import pandas as pd
import numpy as np
import sqlalchemy as sa
from sqlalchemy_utils import database_exists, create_database
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 8, 5  # plotsize
plt.rcParams['lines.linewidth'] = 2
plt.rcParams['lines.color'] = 'r'
plt.rcParams['font.size'] = 14

pd.set_option('display.max_columns', 500)



### load the resumes

In [2]:
path = '/home/hxzheng/Insight_DS_Fellowship/Project/JobRecomm/Resume_text/'

texts = pd.read_pickle(path + 'texts_b4tfidf.txt')
dictionary_b4tfidf = corpora.Dictionary(texts)

In [3]:
def tfidf_generator(texts, dictionary, corpus_only=False):
    class MyCorpus(object):
        def __iter__(self):
            for text in texts:
                yield dictionary.doc2bow(text)

    my_corpus = MyCorpus()
    tfidfModel = models.TfidfModel(my_corpus) # initialize a model
    if corpus_only:
        return my_corpus, tfidfModel
    else:
        corpus_tfidf = tfidfModel[my_corpus]

        return my_corpus, corpus_tfidf   

In [4]:
def tfidf_filter(texts,corpus_tfidf, threshold): # filter out all words with tfidf<threshold
    new_texts=[] 
    t=time.time()
    for i, doc in enumerate(corpus_tfidf):
        text=[]
        for j, (_,tfidf) in enumerate(doc):
            if tfidf>=threshold:
                text.append(texts[i][j])
        new_texts.append(text)  

    return new_texts

### perform tfidf-fitlering

In [5]:
_, corpus_tfidf = tfidf_generator(texts, dictionary_b4tfidf)
minthresh=[0.0,0.024, 0.043, 0.076, 0.12, 0.16]  # threshold to filter out high-freq/low-freq words
for i, threshold in enumerate(minthresh):
    
    t=time.time()
    new_texts = tfidf_filter(texts,corpus_tfidf, threshold)
    new_dict =corpora.Dictionary(new_texts)  
    my_corpus, tfidfModel =  tfidf_generator(new_texts, new_dict, corpus_only=True)
    
    tfidfModel.save(path + 'tfidfmodel'+str(i)+'.tfidf') # store the tfidf-model
    new_dict.save(path + 'dict'+str(i)+'.dict') # store the dictionary
    corpora.MmCorpus.serialize(path + 'corpus'+str(i)+'.mm', my_corpus) # store to disk, for later us
    
    elapsed = (time.time() - t)/60 
    print "Done with tfidf %.3f with %d unique words left in %.2f minutes"%(threshold,len(new_dict),elapsed)

Done with tfidf 0.000 with 20532 unique words left in 0.31 minutes
Done with tfidf 0.024 with 17899 unique words left in 0.30 minutes
Done with tfidf 0.043 with 14720 unique words left in 0.21 minutes
Done with tfidf 0.076 with 10071 unique words left in 0.17 minutes
Done with tfidf 0.120 with 6239 unique words left in 0.13 minutes
Done with tfidf 0.160 with 4202 unique words left in 0.11 minutes


### example

In [8]:
dict4 = gensim.corpora.Dictionary.load(path + 'dict5'+'.dict')
mm = gensim.corpora.MmCorpus(path + 'corpus5'+'.mm')
tfidfModel4 = models.TfidfModel.load(path + 'tfidfmodel5'+'.tfidf')
topwords = dict4.token2id.keys()
print len(topwords)
#print topwords

4202


In [9]:
p=0
for doc in tfidfModel4[mm]:
    if doc==[]:
        p=p+1
    print doc

print "Number of empty documents: %d"%p    

[(0, 0.3810751421222164), (1, 0.18487150087536086), (2, 0.23912772632449078), (3, 0.368106202818228), (4, 0.461645552160682), (5, 0.3977950128713343), (6, 0.461645552160682), (7, 0.20843683514547573)]
[(8, 0.5141574268758274), (9, 0.3846595324576298), (10, 0.19867311994250264), (11, 0.4854022492375205), (12, 0.38141109920361665), (13, 0.26512215636955366), (14, 0.31117301928608837)]
[(15, 0.49725510008552587), (16, 0.26138749988015236), (17, 0.38543456392278685), (18, 0.3077362861828488), (19, 0.27318375374084525), (20, 0.33157120306436877), (21, 0.40234570657864865), (22, 0.3077362861828488)]
[(23, 0.15359874321125097), (24, 0.3347483521550141), (25, 0.21673054594515137), (26, 0.36950996097360794), (27, 0.4749446999228797), (28, 0.5203529703949188), (29, 0.42953642945084053)]
[(23, 0.13470957195177094), (30, 0.22717822277379626), (31, 0.37671316404870425), (32, 0.45636132458014494), (33, 0.6015459998224868), (34, 0.37671316404870425), (35, 0.27621463015517167)]
[(36, 0.463397349669641