# Objective: 
### Scrape Indeed.com for positions for Data Scientists and try to create a model that can predict whether they will be well paying positions.

In [1]:
# Scraping Imports
import requests
import bs4
from bs4 import BeautifulSoup
import urllib

import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

from sklearn.preprocessing import OneHotEncoder

#### Indeed Extracting Target Data

Company

Position Title

Location

Salary

URL

In [6]:
# Scraper being used 
def sky_scraper2(url):

    city = requests.get(url)
    city_data = city.content
    soup_city = BeautifulSoup(city_data)
    
    

    for item in soup_city.findAll("div",{"class":" row result"}):
        # Gets the position title
        title = item.find("a").get("title")
        #Gets the Companies names
        try:
            company = item.find(name='span', attrs={'itemprop':'name'}).text
            company = re.sub('\s+','',company)
            company.strip()
        except AttributeError:
            company = 'none'
        # Gets the positions location and only appends the city name and state abbreviation.     
        location = item.find("span", {"itemprop" : "addressLocality"}).text
        location = re.sub(" \d+", " ", location)
        location = re.sub(r'\([^)]*\)', '', location)
        location = re.sub('\s+','',location)
        
        #Gets the positions salary, if the position actually comes with one. 
        try:
            salary = item.find(name ="nobr").text
        except AttributeError:
            salary = 'none'
        
        # Gets the short description.
        description = item.find("span",{"class":"summary"}).text
        
        # Gets the url for the link to the actual job posting. 
        # This is used later to get the positions full description (after cleaning)
        direction = item.find("a",{"rel":"nofollow"}).get("href")
        url = 'http://www.indeed.com'+direction
        
        # appends all the findings to a dataframe.
        jobs_df.loc[len(jobs_df)]=[title, company, location, salary, description, url] 
        

I tried many things to try to get the full position description using the main scraper, however I encountered many road blocks.

1. SSL Errors -  These are thrown when a website believes that whoever is trying to access their site is a bot.  This can be the result of an IP hitting webpages faster than humaly possible or the website detecting that the request is not coming from a browser, in thise case they were coming from python.

2.  To avoid the possibility of being denied by the website while using python I set up a Headless Browser using Selenium.  I set Selenium to essentially run through Google Chrome so the requests coming from my function would look like they were coming from Google Chrome rather than Python.
>- a.) Selenium actually has to open web pages for every request which made it extreamly slow.
>- b.) if a script to close the page once finished was not included, then Selenium could open hundred of pages and                eventually crash your computer.
>- c.)  Websites that were not secure often cause the script to freeze without notice or explanation.
    (I though i left it running for 2 hours but it froze on a page 5 minutes in)
    
Eventually, once I dropped all the unusable data a ran another scraper that would just pass up those websites that would not accept requests from Python.

In [7]:
#list of cities used
cities = ['New+York', 'Chicago', 'San+Francisco', 'Austin', 'Seattle', 
   'Los+Angeles', 'Philadelphia', 'Atlanta', 'Dallas', 'Pittsburgh', 
   'Portland', 'Phoenix', 'Denver', 'Houston', 'Miami', 'Washington']

In [8]:
# defining the dataframe our results will append to.
cols = ['title','company','location','salary','description','url']
jobs_df = pd.DataFrame(columns = cols)

In [11]:
# Including results for both data scientist and data analyst as broadening my search will help me accumulate more results.
scientist_template = "http://www.indeed.com/jobs?q=data+scientist&l=%s&start=%s&limit=10"
analyst_template = "http://www.indeed.com/jobs?q=data+analyst&l=%s&start=%s&limit=10"
# how many results you want per each city
max_results_per_city = 10


# for loop to scrape all cities desired that are Data Scientists or Data Analysts
for city in cities:
    for start in range(0, max_results_per_city, 10):
        for template_url in [scientist_template, analyst_template]:
            sky_scraper2(template_url % (city, start))
        # Grab the results from the request (as above)
        # Append to the full set of results
        


#### As the dataset is rather large and takes a while to aquire it is recommended to save it as a CSV once the scraper finishes. 

In [None]:
# how to save to csv.
jobs_df.to_csv('indeed_jobs.csv',encoding='utf-8')

In [None]:
# import the save csv, if you need to start over for whatever reason.
jsp_df = pd.read_csv('indeed_jobs.csv')

### General Data Cleaning
1. Remove Duplicates and null
2. Remove all rows where Salary has a 'none' value

In [18]:
# Jobs pure has dropped all duplicate values
jobs_df.drop_duplicates(inplace = True)

In [20]:
# gets rid of the positions with a salary value of 'none'
jobs_df = jobs_df[jobs_df['salary'] != 'none'] 
# JSP = Jobs Salary Pure dataframe

### Salary Column Cleaning

1. Remove Dollar sign 
2. Remove everything after dashes.  (I elected to take the lower value in a range as thats what employers are going to try to pay you and try to trick you with a high end of the range.)
3. Convert hourly pay to a yearly salary ( 2000 standard work hours in a year),
4. Remove all word values ('a month','a year','an hour').
5. Remove commas so numbers can be converted to floats.
6. Convert Column type to Float.
7. drop all values less than $15,000 as that is pretty much the eqivalent of annual pay for minimum wage.

In [None]:
# Removing the dollar signs
jobs_df['salary'] = jobs_df.salary.str.replace('$' , '') 

# Remove everything after dashes
jobs_df['salary'] = jobs_df['salary'].apply(lambda x: x.split('-')[0])

In [None]:
# Function for converting hourly pay to annual salary
def day_to_year(item):
    if 'an hour' in item:
        item = item.replace('an hour', '')
        item = float(item)*2000
        return item
    else:
        return item
    
# Actually converting all the hourly positions to yearly salary
jobs_df['salary'] = jobs_df['salary'].apply(day_to_year)

In [None]:
#removing all intances of 'a day', 'a month', 'a year' and commas so i can convert to ints
jobs_df['salary'] = jobs_df.salary.str.replace('a year' , '')
jobs_df['salary'] = jobs_df.salary.str.replace('a month' , '')
jobs_df['salary'] = jobs_df.salary.str.replace('a day' , '')
jobs_df['salary'] = jobs_df.salary.str.replace(',' , '')

In [None]:
# Converting dtype to float
jobs_df['salary'] = jobs_df['salary'].convert_objects(convert_numeric=True)

# Dropping all values below wage criteria.  
jobs_df = jobs_df[jobs_df.salary > 15000]

# Check what I am left with
jobs_df.shape

In [28]:
# Just having a peak at the current dataframes structure
jobs_df.head(1)

Unnamed: 0,title,company,location,salary,description,url
0,Senior Machine Learning Data Scientist,All-InAnalytics,"Austin,TX",140000,\nMachine Learning Data Scientist. Proficient ...,http://www.indeed.com/rc/clk?jk=b87633408f9b73...


### Dataframe Cleaning: Part II
our index is all messy after all the dropped values.
1. Reset the index
2. Remove the column 'Unnamed: 0'

In [27]:
# Reseting index and dropping the 'index' made column that results.  
jobs_df.reset_index(inplace = True)
jobs_df.drop('index', axis =1, inplace=True)

### Re-Scraping to get the descriptions of the positions left in my cleaned dataframe.  
1. Build sub scraper
2. Build vanishing dataframe

This subscraper will only work on Indeed.com pages( or those that have the exact same java layout).

In [31]:
description = []

def sub_scraper(url):
    submarine = requests.get(url)
    if submarine.status_code == 200:
        sub_data = submarine.content
        sub_soup = BeautifulSoup(sub_data)
        post = 'none'
        for element in sub_soup.findAll("span",{"id":"job_summary"}):
            post = element.text
    else:
        post = 'none'
    description.append(post)

In [32]:
# running the subscraper
for link in jobs_df['url']:
    sub_scraper(link)

In [41]:
# this dataframe has the position descriptions concatted to it now.  
description_df =pd.DataFrame(description)
jobs_df_whole = pd.concat([jobs_df, description_df],axis =1)
jobs_df_whole = jobs_df_whole.rename(columns = {0:'full_description'})

### Modelling

We want to predict a binary variable - whether the salary was low or high. 

Compute the median salary and create a new binary variable that is true when the salary is high (above the median)¶
We could also perform Linear Regression (or any regression) to predict the salary value here. Instead, we are going to convert this into a binary classification problem, by predicting two classes, HIGH vs LOW salary.

While performing regression may be better, performing classification may help remove some of the noise of the extreme salaries. We don't have to choice the median as the splitting point - we could also split on the 75th percentile or any other reasonable breaking point.

In fact, the ideal scenario may be to predict many levels of salaries (which i have prepared for)

In [35]:
# lets find the quartile ranges to identify high, low, and inbtween positions.  
jobs_df_whole['salary'].describe()
# 62500 will be out delimiter, the median

count        20.000000
mean      80545.050000
std       46286.859594
min       32900.000000
25%       40000.000000
50%       62500.000000
75%      125000.000000
max      160000.000000
Name: salary, dtype: float64

In [38]:
# Set my y in my field.  
jobs_df_whole['pay_grade'] = np.where(jobs_df_whole['salary']>=62500, 1, 0)

### Multiple Categories 
outliers are 1.5*IQR away from IQR.

iqr = 125000 - 40000 = 85000.

Lower outliers =  32900 - 85000 = -52100. (dont need to worry about this)

Upper Outliers = 160000 + 85000 = 245000.

Very Low < -52100 (not possible with this dataset)

low < 40000

medium low < 62500

Medium High < 125000

High < 245000

Very High > 245000 ( also not possible with this sample

In [37]:
def pay_scaler(row):
        if row['salary'] < 40000:
            value = 'low'
        elif row['salary'] < 62500:
            value = 'med_low'
        elif row['salary'] < 125000:
            value = 'med_high'
        elif row['salary'] < 245000:
            value = 'high'
        else:
            value = 'very_high'
        return value
    
jobs_df_whole['pay_scale']= jobs_df_whole.apply(pay_scaler, axis =1)

In [None]:
# Now we got the complete data that we can work with, we may want to save it incase anything happens. 
jobs_df_whole.to_csv('indeed_jobs_ideal.csv',encoding='utf-8')
jobs_df_whole = pd.read_csv('indeed_jobs_ideal.csv')

In [43]:
# Need to create a sub dataset for all the rows i have that dont have a null value in the full description row.
jobs_df_pure = jobs_df_whole[~jobs_df_whole.full_description.str.contains("none")]

In [44]:
#Checking the shape of whats left
jobs_df_pure.shape

(5, 9)

Pretty tiny, I only scraped 100 results to begin with for times sake

We want to predict a binary variable - whether the salary was low or high. Compute the median salary and create a new binary variable that is true when the salary is high (above the median)
We could also perform Linear Regression (or any regression) to predict the salary value here. Instead, we are going to convert this into a binary classification problem, by predicting two classes, HIGH vs LOW salary.
While performing regression may be better, performing classification may help remove some of the noise of the extreme salaries. We don't have to choice the median as the splitting point - we could also split on the 75th percentile or any other reasonable breaking point.
In fact, the ideal scenario may be to predict many levels of salaries.

#### 1. Separate Necessary Features into x and y

In [47]:
X = jobs_df_pure[['full_description']]
# for binary
y1 = jobs_df_pure['pay_grade']
# for multi-class
y2 = jobs_df_pure['pay_scale']

#### 2. Apply TFIDF Vectorizer.
Term Frequency Inverse Document Frequency 

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvec = TfidfVectorizer(stop_words='english')

X_TFIDF = tvec.fit_transform(X['full_description'])

#### 3. Train-Test Split

In [49]:
from sklearn.cross_validation import train_test_split

# set 1 is for the boolean y (pay_grade)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_TFIDF, y1, test_size = 0.25, random_state = 21)

# set 2 is for the categorical y (pay_scale)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_TFIDF, y2, test_size = 0.25, random_state = 21)

### 4a. Random Forest
Binary and multi-class classification

Create a Random Forest model to predict High/Low salary using statsmodel. Start by ONLY using the location as a feature.

In [50]:
# Necessary imports
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import RandomForestRegressor

In [None]:
# This can be put in the cross validation parameters.  
cv1 = StratifiedKFold(y1, n_folds=3, shuffle=True, random_state=41)
cv2 = StratifiedKFold(y2, n_folds=3, shuffle=True, random_state=41)

### Decision Tree for Binary Classifier

In [52]:
dt1 = DecisionTreeClassifier(class_weight='balanced')
c_val_score1 = cross_val_score(dt1, X_TFIDF, y1, cv=cv1, n_jobs=-1)
# using original values as CV automatically does a train-test split
print "{} Score:\t{:0.3} ± {:0.3}".format("Decision Tree", c_val_score1.mean().round(3), c_val_score1.std().round(3))

Decision Tree Score:	0.667 ± 0.236


### Decision Tree for Multi-Class Classifier

In [53]:
dt2 = DecisionTreeClassifier(class_weight='balanced')
c_val_score2 = cross_val_score(dt2, X_TFIDF, y2, cv=cv2, n_jobs=-1)
# using original values as CV automatically does a train-test split
print "{} Score:\t{:0.3} ± {:0.3}".format("Decision Tree", c_val_score2.mean().round(3), c_val_score2.std().round(3))

Decision Tree Score:	0.111 ± 0.157


### Random Forest Regressor

In [55]:
rfr = RandomForestRegressor()
rfr.fit(X_train1, y_train1)
y_pred1 = rfr.predict(X_test1)
rfr.score(X_test1, y_test1)

-0.040000000000000036

### Random Forest Classifier

In [59]:
rfc = RandomForestClassifier()

rfc.fit(X_train1, y_train1)
y_pred1 = rfc.predict(X_test1)
# convert x test to an array before scoring. np.to array opr ravel
X_test1_arr = X_test1.toarray()
rfc.score(X_test1_arr, y_test1)


0.5

### Random Forest Classifier for Multi-Class Classifier

In [None]:
rfc = RandomForestClassifier()

rfc.fit(X_train2, y_train2)

y_pred2 = rfc.predict(X_test2)

X_test2_arr = X_test2.toarray()

rfc.score(X_test2, y_test2)

In [None]:
rfc = RandomForestClassifier()

cvs2 = cross_val_score(rfc, X_train2, y_train2, cv = 12)

cvs2

### 4b. Logistic Regression Model
Binary Classification only

In [62]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

X = X_TFIDF # Term Frequency Inverser Document Frequency
y1 = jobs_df_pure['pay_grade']

X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X_TFIDF, y1, test_size = 0.25, random_state = 21)

In [63]:
lr.fit(X_train_lr, y_train_lr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [64]:
y_pred_lr = lr.predict(X_test_lr)

In [65]:
lr.score(X_test_lr,y_test_lr)

0.5