In [1]:
import urllib2
from bs4 import BeautifulSoup
import itertools
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.stem import WordNetLemmatizer

import psycopg2
import pandas as pd
import numpy as np
import sqlalchemy as sa
from sqlalchemy_utils import database_exists, create_database

import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide plotting defaults, set per rc group.
plt.rc('figure', figsize=(12, 8))      # plotsize
plt.rc('lines', linewidth=2, color='r')
plt.rc('font', size=14)

# Let wide DataFrames show up to 500 columns before truncating the display.
pd.set_option('display.max_columns', 500)

In [2]:
import re

def text_cleaner(html, Oneline=True):
    '''
    Download a web page and reduce it to a single line of lower-cased text.

    Processing steps:
      1. Fetch the page at the given URL.
      2. Parse it with BeautifulSoup (lxml) and drop <script>/<style> elements.
      3. Take the text of <body>, strip every line, split on double spaces and
         join the non-empty pieces, each followed by one space.
      4. Drop non-ASCII characters and replace every character that is not a
         letter, '+' or '3' with a space (keeps "C++" and the 3 of "d3.js").
      5. Lower-case the text and, when the Indeed marker 'advanced search' is
         present, keep only the part between it and 'save resume'.

    Inputs:
        html    -- despite the name, the URL of the page to fetch.
        Oneline -- unused; kept so existing callers keep working.
    Outputs:
        The cleaned text, or None when the page cannot be loaded, has no
        <body>, or cannot be converted to ASCII.
    '''
    try:
        response = urllib2.urlopen(html)  # Connect to the job posting / resume page
        try:
            site = response.read()
        finally:
            response.close()  # always release the connection
    except Exception:
        # Website is gone or some other connection problem. `except Exception`
        # (not a bare except) lets Ctrl-C still interrupt a long scraping run.
        print("Error loading %s" % (html,))
        return None

    soup_obj = BeautifulSoup(site, "lxml")  # Get the html from the site

    for tag in soup_obj(["script", "style"]):
        tag.extract()  # Remove these two elements from the BS4 object

    body = soup_obj.body
    if body is None:
        # Nothing to extract; report it like any other unusable page.
        print("No <body> found in %s" % (html,))
        return None

    text = body.get_text('\n')  # Get the text from this

    lines = (line.strip() for line in text.splitlines())  # break into lines
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

    # Drop blank pieces and line ends; the trailing space keeps words apart.
    text = ''.join(chunk + ' ' for chunk in chunks if chunk).encode('utf-8')

    # Now clean out all of the unicode junk. Some websites aren't formatted
    # in a way that this works, so it can occasionally throw an exception.
    try:
        text = text.decode('unicode_escape').encode('ascii', 'ignore')
    except Exception:
        return None

    # Get rid of anything that isn't a letter, '+' (C++) or '3' (d3.js), and
    # lower-case unconditionally so both return paths yield the same casing.
    text = re.sub("[^a-zA-Z+3]", " ", text).lower()

    # Remove the junk from the beginning and end parts of Indeed pages.
    # Pages without the leading marker are returned whole.
    parts = text.split('advanced search')
    if len(parts) > 1:
        text = parts[1].split('save resume')[0]

    return text

## Connect to the database to access resume links

In [3]:
# Open a psycopg2 connection to the PostgreSQL database 'indeed'.
# NOTE(review): host='/tmp/' presumably points at a local Unix-socket directory — confirm.
con = psycopg2.connect(database = 'indeed', user ='hxzheng',host='/tmp/')

# Load every row and column of resume_table2 into the `resumes` DataFrame.
sql_query = """
SELECT * FROM resume_table2;
"""
resumes = pd.read_sql_query(sql_query,con)
# Preview the first rows of the loaded table.
resumes.head()

Unnamed: 0,index,resume_url,job_class
0,0,http://www.indeed.com/r/Vikas-Patil/d9dc3f1bd6...,Data Scientist
1,1,http://www.indeed.com/r/Robert-Sousek/f78d89b0...,Data Scientist
2,2,http://www.indeed.com/me/Kris_I_Ford?sp=0,Data Scientist
3,3,http://www.indeed.com/r/Joydeep-Singh/ca6d4c0d...,Data Scientist
4,4,http://www.indeed.com/r/Ahmad-Nahhas/b9b9970cc...,Data Scientist


In [4]:
# Total number of rows (resume links) in the table.
nresumes = len(resumes.index)

In [5]:
# Array of the distinct labels found in the job_class column.
jobclass = resumes['job_class'].unique()
print(jobclass)

['Data Scientist' 'Software Engineer' 'Consultant' 'Strategy Manager']


In [6]:
# Show the full (untruncated) URL of the first five rows.
for i in resumes.index[:5]:
    print(resumes.loc[i, 'resume_url'])

http://www.indeed.com/r/Vikas-Patil/d9dc3f1bd6af7cbe?sp=0
http://www.indeed.com/r/Robert-Sousek/f78d89b06989a308?sp=0
http://www.indeed.com/me/Kris_I_Ford?sp=0
http://www.indeed.com/r/Joydeep-Singh/ca6d4c0d86888cb1?sp=0
http://www.indeed.com/r/Ahmad-Nahhas/b9b9970cc92db421?sp=0


In [7]:
# Label-based lookup of one URL; `.loc` replaces the deprecated `.ix` indexer.
print(resumes.loc[2, 'resume_url'])

http://www.indeed.com/me/Kris_I_Ford?sp=0


In [8]:
#print text_cleaner(resumes.ix[0]['resume_url'])

In [9]:
# Inspect the first job-class label.
jobclass[0]

'Data Scientist'

In [10]:
# Preview the rows whose job_class equals the first label.
resumes[resumes['job_class'] == jobclass[0]].head()

Unnamed: 0,index,resume_url,job_class
0,0,http://www.indeed.com/r/Vikas-Patil/d9dc3f1bd6...,Data Scientist
1,1,http://www.indeed.com/r/Robert-Sousek/f78d89b0...,Data Scientist
2,2,http://www.indeed.com/me/Kris_I_Ford?sp=0,Data Scientist
3,3,http://www.indeed.com/r/Joydeep-Singh/ca6d4c0d...,Data Scientist
4,4,http://www.indeed.com/r/Ahmad-Nahhas/b9b9970cc...,Data Scientist


In [11]:
Listdict = []
# Listindex[k] holds the row labels of all resumes whose job_class is jobclass[k].
Listindex = []

for i in range(len(jobclass)):
    Listindex += [resumes[resumes['job_class'] == jobclass[i]].index]

In [12]:
# Number of per-class index lists collected in Listindex.
len(Listindex)

4

In [13]:
# Row labels of the resumes whose job_class equals jobclass[0].
Listindex[0]

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            707, 708, 709, 710, 711, 712, 713, 714, 715, 716],
           dtype='int64', length=717)

In [14]:
# Row labels of the resumes whose job_class equals jobclass[1].
Listindex[1]

Int64Index([ 717,  718,  719,  720,  721,  722,  723,  724,  725,  726,
            ...
            1515, 1516, 1517, 1518, 1519, 1520, 1521, 1522, 1523, 1524],
           dtype='int64', length=808)

## Download the resumes for each job class (Data Scientist, Software Engineer, etc.)

In [15]:
import time

# One output text file per job class, in the same order as `jobclass`.
resume_file=['Resumes_DS2.txt','Resumes_SE2.txt', 'Resumes_Con2.txt','Resumes_SM2.txt']

# Directory the cleaned resume text is written to (same for every class).
path = '/home/hxzheng/Insight_DS_Fellowship/Project/JobRecomm/Resume_text/'

for jc in range(len(jobclass)):
    p = 0  # number of resumes successfully saved for this job class
    t = time.time()

    # `with` closes the file even if the run is interrupted part-way through.
    with open(path + resume_file[jc], 'w') as f:
        for i in Listindex[jc]:
            link = resumes.loc[i, 'resume_url']  # label-based; replaces deprecated .ix
            try:
                text = text_cleaner(link)
                if text is not None:
                    f.write(text + '\n')  # one cleaned resume per line
                    p = p + 1
                    # Report progress only when the counter actually advanced,
                    # so a failed download cannot repeat the previous message
                    # or print "Done with 0 resumes".
                    if p % 100 == 0:
                        print('Done with %d resumes' % p)
            except Exception:
                # Best effort: skip this resume but let Ctrl-C stop the run.
                print('Error in accessing resume %d' % i)

            time.sleep(0.1)  # short pause between requests

    elapsed = (time.time() - t) / 60  # minutes spent on this job class
    print(elapsed)
    print('Done with downloading %d %s resumes!' % (p, jobclass[jc]))
    time.sleep(120) # to prevent us from overwhelming the Indeed server

Done with 100 resumes
Done with 200 resumes
Done with 300 resumes
Done with 400 resumes
Done with 500 resumes
Done with 600 resumes
Done with 700 resumes
18.8086180528
Done with downloading 717 Data Scientist resumes!
Done with 100 resumes
Done with 200 resumes
Done with 300 resumes
Error loading http://www.indeed.com/r/Justin-Towles/be7a202f1b57f08a?sp=0
Done with 400 resumes
Done with 500 resumes
Done with 600 resumes
Done with 700 resumes
Error loading http://www.indeed.com/r/HALUK-APAYDIN/833435e22e75c26a?sp=0
Error loading http://www.indeed.com/r/Kasturi-Hariharasubramanian/014ba91622c04630?sp=0
Done with 800 resumes
24.7797529141
Done with downloading 804 Software Engineer resumes!
