In [1]:
import pandas as pd
import numpy as np
import glob
import pickle
import string
import re

# Web Scrapper

<div style="width:900px;background:#D9D9D9;border:1px solid black;text-align:left;padding:8px;">



<p>
<b>Step 1:</b> URL = https://www.indeed.com
<br><b>Step 2:</b> Keyword search parameter - "Data"
<br><b>Step 3:</b> Location choice - USA (more data is avaliable)
<br><b>Step 4:</b> Loop through 6 salary search parameter - "25000","35000","50000","65000","80000"
<br><b>Step 5:</b> Each salary parameter scrape 100 pages (1000 jobs)
<br><b>Step 6:</b> Scrape information: <b>job title, location, salary, ratings, reviews, summary</b>
<br><b>Step 7:</b> Save data into pickle file

</div>

# Data Cleaning

<div style="width:900px;background:#D9D9D9;border:1px solid black;text-align:left;padding:8px;">



<p>
<b>Job title, Companies, Summaries, Locations</b>
<br>1. Change data type from unicode to string. 
<br>2. Remove punctuations and special characters and lower case all text (except location)
<br>
<br><b> Location </b> 
<br>1. Replace with state abbrevations (e.g. AL, SF)
<br>
<br><b>Reviews, Ratings</b>
<br>1. Fill NaN with 0.0
<br><br><b>Salaries</b>
<br>1. If information is avaiable, scale into annual salary. 
<br>2. If salary range is provided, obtain mean of range. 
<br>3. If information not avaliable, impute with estimated salary (what was input in search bar during scrapping) 

</div>

In [6]:
def salary_func(x):
    
    """Extracts salary information if avaliable - and scale them into annual salaries
    if salary range is provided, take mean of range"""

    try:
        salary = np.mean([float(s) for s in re.findall(r'\d+',a.replace(',','').replace('$',''))])

        if 'hour' in x:
            salary = salary * 2080
        
        elif 'day' in x:
            salary = salary * 260
            
        elif 'week' in x:
            salary = salary * 52
        
        
    except:
        salary = np.nan
        
    return salary

In [7]:
jobs_file = pd.DataFrame()

for ind, filename in enumerate(glob.iglob('./*.pickle')):   # Iterates through all pickle file in folder
    
    # Open each file as data
    with open(filename) as inputfile:
        
        data = pickle.load(inputfile)  # Load pickle file

        # Clean text features
        
        data.job_titles = data.job_titles.map(lambda x : x.encode('ascii','ignore')\
                                              .replace('\n','')\
                                              .translate(None, string.punctuation)\
                                              .lower())
        
        data.companies = data.companies.map(lambda x : x.encode('ascii','ignore')\
                                              .replace('\n','')\
                                              .translate(None, string.punctuation)\
                                              .lower())
        
        data.locations = data.locations.map(lambda x : x.encode('ascii','ignore')\
                                              .replace('\n','')\
                                              .translate(None, string.punctuation))
        
        data.summaries = data.summaries.map(lambda x : x.encode('ascii','ignore')\
                                              .replace('\n','')\
                                              .translate(None, string.punctuation)
                                              .lower())

        # Impute NaN values with 0.0
        
        data.reviews = data.reviews.fillna(0.0)

        data.ratings = data.ratings.fillna(0.0)
        
        # Extract salaries

        data.salaries = data.salaries.map(salary_func)

        # If exact salary is not provided, impute with salary that was searched on
        
        base_salary = [25000,35000,50000,65000,80000,100000]

        for indx, base in enumerate(base_salary):

            if ind == indx:
                
                data.salaries = data.salaries.fillna(base)

        
    jobs_file = jobs_file.append(data)
        
        

# EDA

<div style="width:900px;background:#D9D9D9;border:1px solid black;text-align:left;padding:8px;">

<p>
Check for NaN values
<br>
Investigate outliers
<br>
Visualise correlation matrix and distributions

</div>

In [8]:
# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
jobs_file.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8960 entries, 0 to 1499
Data columns (total 7 columns):
job_titles    8960 non-null object
companies     8960 non-null object
locations     8960 non-null object
reviews       8960 non-null float64
salaries      8960 non-null float64
ratings       8960 non-null float64
summaries     8960 non-null object
dtypes: float64(3), object(4)
memory usage: 560.0+ KB


In [10]:
jobs_file.head()

Unnamed: 0,job_titles,companies,locations,reviews,salaries,ratings,summaries
0,vice president data integration manager firm...,jp morgan chase,Jersey City NJ 07310 Downtown area,14863.0,25000.0,44.4,the organizations objectives are to explain th...
1,jr research analyst,wells fargo,San Francisco CA,19547.0,25000.0,43.8,identifies potential data sources for statisti...
2,data specialist solutions,facebook,Austin TX,203.0,25000.0,52.2,effectively plan and collaborate with team mem...
3,market data analyst manager vice president,state street,New York NY,1356.0,25000.0,42.0,o automate ec risk data enrichment and data go...
4,product data analyst personalization,hulu,Seattle WA,28.0,25000.0,44.4,passion for turning data into insights and hel...


# Data Cleaning (Locations)

<div style="width:900px;background:#D9D9D9;border:1px solid black;text-align:left;padding:8px;">



<p>
Locations are in different format, I will be standardising them with abbrevations

</div>

In [11]:
# Load state abbrevations data

states_ = pd.read_csv('./us_states.csv').iloc[:,:3]

In [12]:
states_['Capital'] = states_['Capital'].str.lower()
states_['State Name'] = states_['State Name'].str.lower()

In [13]:
states_dict = states_.set_index('Abbreviation').T.to_dict('list')

In [14]:
states = []

for state in jobs_file.locations:
    
    if state == 'Remote':
        states.append('Remote')
        
    else:
        abv = re.search(r'([A-Q][A-Q])',state)
        
        if abv is None:
            
            abb = 0
            for k,v in states_dict.items():
                if state.lower() in v:
                    abb = k
                else:
                    abb = 'US'
                
            states.append(abb)
            
        else:
            
            states.append(abv.group())


In [15]:
jobs_file.locations = states

# Feature Engineering

<div style="width:900px;background:#D9D9D9;border:1px solid black;text-align:left;padding:8px;">



<p>
Based on the companies, I will engineer a new binary column on whether the company is in the Forbes US 500 list (2016 & 2017)

</div>

In [16]:
fortune16 = pd.read_csv('./fortune500_16.csv')['Company Name']     # Load data
fortune17 = pd.read_csv('./fortune500_17.csv')['Company Name']

In [17]:
fortune = pd.concat([fortune16,fortune17]).drop_duplicates().reset_index(drop=True).dropna()  # Clean data
fortune = fortune.str.translate(None, string.punctuation).str.lower()

In [18]:
fortune500 = [1 if company in list(fortune) else 0 for company in jobs_file.companies] # 1 = company in Fortune500 list

In [19]:
jobs_file['fortune500'] = fortune500

# Pre-processing

<div style="width:900px;background:#D9D9D9;border:1px solid black;text-align:left;padding:8px;">



<p>
Scale numerical data - <b>Reviews, Ratings</b>
<Br>
Get dummy variables for categorical - <b>Locations</b>

</div>

In [20]:
from sklearn.preprocessing import RobustScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

In [21]:
jobs_file.head()

Unnamed: 0,job_titles,companies,locations,reviews,salaries,ratings,summaries,fortune500
0,vice president data integration manager firm...,jp morgan chase,NJ,14863.0,25000.0,44.4,the organizations objectives are to explain th...,1
1,jr research analyst,wells fargo,CA,19547.0,25000.0,43.8,identifies potential data sources for statisti...,1
2,data specialist solutions,facebook,US,203.0,25000.0,52.2,effectively plan and collaborate with team mem...,1
3,market data analyst manager vice president,state street,US,1356.0,25000.0,42.0,o automate ec risk data enrichment and data go...,0
4,product data analyst personalization,hulu,US,28.0,25000.0,44.4,passion for turning data into insights and hel...,0


In [24]:
rs = RobustScaler()
jobs_file.reviews = rs.fit_transform(jobs_file.reviews.values.reshape(-1, 1))

In [26]:
jobs_file.ratings = rs.fit_transform(jobs_file.ratings.values.reshape(-1, 1))

In [28]:
jobs_nocomp = jobs_file.drop('companies',axis=1) 

In [29]:
X = jobs_nocomp.drop('salaries',axis=1)
y = jobs_nocomp['salaries']

# Train Test Split

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=2,stratify=y)

# Natural Language Processing

<div style="width:900px;background:#D9D9D9;border:1px solid black;text-align:left;padding:8px;">



<p>
<b>Step 1:</b> Add customized stopwords 
<br><b>Step 2:</b> Count vectorizer on job_titles, summaries and companies

</div>

In [40]:
import locale
locale.getdefaultlocale()

ValueError: unknown locale: UTF-8

In [43]:
from nltk.corpus import stopwords

#stop = stopwords.words('english')

ValueError: unknown locale: UTF-8

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cvec = CountVectorizer(stopwords = stopwords)
cvec.fit()

In [None]:
df  = pd.DataFrame(cvec.transform([FEATURE]).todense(),
             columns=cvec.get_feature_names())

# Feature Selection

<div style="width:900px;background:#D9D9D9;border:1px solid black;text-align:left;padding:8px;">



<p>
<b>Variance Inflation Factor</b> on numerical data to check for multi-colinearity
<br><b>ANOVA Test</b> on binary data
<br>Since there is little data avaliable, I will use <b>PCA</b> or <b>LDA</b> to reduce dimensionality

</div>

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Variance Inflation Factor

class VIF(BaseEstimator, TransformerMixin):
    """Variance Inflation Factor"""
    
    def __init__(self,threshold=5):
        self.threshold = threshold
    
    
    def vif(self,df):
        
        vif = [variance_inflation_factor(df.iloc[:,:-4].values,i) \
               for i in range(df.iloc[:,:-4].shape[1])]
        
        vif_df = pd.DataFrame(df.iloc[:,:-4].columns,columns=['Features'])
        vif_df['VIF'] = vif    # VIF values in dataframe
        
        remove_col = list(vif_df[vif_df['VIF']>self.threshold]['Features'])   # Choose only features with VIF < 5
        selected_df = df.drop(remove_col,axis=1)
        
        return selected_df
    
    def transform(self, df, *args):
        
        selected_df = self.vif(df)
        
        return selected_df


    def fit(self, df, *args):
        return self

# Modelling & Evaluation

In [None]:
# Models
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV

In [None]:
# Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline


In [None]:
# Learning Curve
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

In [None]:
title = 'Learning Curve Random Forest'
plot_learning_curve(ESTIMATOR_GS.best_estimator_, title, X, y, ylim=None, cv=None,\
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5))