In [None]:
#import your libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures,StandardScaler, OrdinalEncoder, LabelEncoder,KBinsDiscretizer,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,ElasticNetCV,RidgeCV
from sklearn.model_selection import cross_validate,RandomizedSearchCV
import xgboost as xgb
import lightgbm as lgb

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


__author__ = "Jared Teerlink"
__email__ = "jteerlink@gmail.com"

### Load and Prep Data for Feature Engineering

In [None]:


# Read the two raw training tables (salaries and features) from CSV into
# pandas DataFrames. Paths are relative to the notebook's working directory.
train_salaries = pd.read_csv('../Data/train_salaries.csv')
train_features = pd.read_csv('../Data/train_features.csv')



In [None]:
# Attach each row's salary to its features via 'jobId', then keep only rows
# with a strictly positive salary. Because this is a left join, a jobId with
# no salary record gets NaN, and NaN > 0 is False, so those rows are removed
# by the same filter.
# NOTE(review): duplicate rows are not checked for or removed here.
train_combined = (
    train_features
    .merge(train_salaries, how='left', on='jobId')
    .loc[lambda joined: joined['salary'] > 0]
)




In [None]:
# Column names of the numeric predictors: every int64/float64 column except
# the target column 'salary'.
numeric_features = (
    train_combined
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=['salary'])
    .columns
)

# Column names of the object-dtype columns kept for feature engineering;
# 'jobId', 'companyId' and 'major' are excluded.
categorical_features = (
    train_combined
    .select_dtypes(include=['object'])
    .drop(columns=['jobId', 'companyId', 'major'])
    .columns
)


In [None]:
# Show which columns landed in each feature group (numeric first).
for feature_group in (numeric_features, categorical_features):
    print(feature_group)

In [None]:
# Engineer group-level salary statistics: for every categorical feature,
# attach the mean / max / min / median / std of 'salary' within the row's
# category as new numeric columns named '<feature>_<stat>'.
#
# NOTE(review): these statistics are derived from the target over the whole
# data set, before the train/test split made later in the notebook, so the
# held-out rows contribute to their own features -- confirm this is intended.
salary_stats = ('mean', 'max', 'min', 'median', 'std')

for cat in categorical_features:
    # Output column name -> aggregation function, e.g. 'degree_mean' -> 'mean'.
    stat_columns = {f'{cat}_{stat}': stat for stat in salary_stats}

    # Remove stat columns left behind by an earlier run of this cell so that
    # re-running it does not create '_x' / '_y' suffixed duplicates.
    train_combined = train_combined.drop(columns=list(stat_columns), errors='ignore')

    # Named aggregation (output_name='func'). Passing a renaming dict to
    # SeriesGroupBy.agg raises SpecificationError on pandas >= 1.0.
    cat_stats = (
        train_combined
        .groupby(cat)['salary']
        .agg(**stat_columns)
        .reset_index()
    )

    # Join on the category column itself rather than mixing a column label
    # with an Index object as merge keys.
    train_combined = train_combined.merge(cat_stats, how='left', on=cat)

In [None]:
# Define the model inputs (features) and the prediction target.
target = train_combined['salary']

# Predictors are every non-object column except the target. Leaving 'salary'
# in this frame would hand the answer to any model step that consumes all
# columns.
# NOTE(review): every object column is removed here, while the preprocessor
# defined later applies ordinal encoders (for `degrees` / `titles`) to the
# columns at positions 1 and 0 -- confirm those positions hold the intended
# columns.
features = train_combined.drop(
    columns=[*train_combined.select_dtypes(include=['object']).columns, 'salary']
)


In [None]:
# Hold out 25% of the rows for evaluation. The split is seeded so that a
# fresh Restart-and-Run-All produces the same partition, and therefore
# comparable scores, every time.
train_X, test_X, train_y, test_y = train_test_split(
    features, target, test_size=.25, random_state=42
)

In [None]:
# Category orderings for the ordinal encoders. A value's position in its list
# is the integer code the encoder assigns to it. Each ordering is wrapped in
# an outer list because OrdinalEncoder's `categories` argument expects one
# list of categories per encoded column.

degrees = [[
    'NONE',
    'HIGH_SCHOOL',
    'BACHELORS',
    'MASTERS',
    'DOCTORAL',
]]

titles = [[
    'JANITOR',
    'JUNIOR',
    'SENIOR',
    'MANAGER',
    'VICE_PRESIDENT',
    'CFO',
    'CTO',
    'CEO',
]]


In [None]:
#create transformer class to work with pandas df and ordinal encoder

from sklearn.base import TransformerMixin, BaseEstimator

class PandasToNumpy(TransformerMixin, BaseEstimator):
    """Stateless transformer that converts its input to a one-column NumPy array.

    Used as the first pipeline step in front of OrdinalEncoder so the encoder
    receives a 2-D ``(n, 1)`` array instead of a pandas object.
    """

    def fit(self, x, y=None):
        """Learn nothing; return the transformer itself so it can be chained."""
        return self

    def transform(self, data_inputs):
        """Return ``data_inputs`` as an ndarray reshaped to ``(-1, 1)``.

        Every element of the input ends up in the single output column.
        """
        as_array = np.asarray(data_inputs)
        return np.reshape(as_array, (-1, 1))


# Build the preprocessing pipelines and combine them per column.

def _ordinal_pipeline(category_order):
    """Return a fresh convert-then-ordinal-encode pipeline for one column."""
    return Pipeline(steps=[
        ('convert', PandasToNumpy()),
        ('ordinal', OrdinalEncoder(categories=category_order)),
    ])


# Numeric inputs are standardised.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# One independent encoder pipeline per ordered categorical.
ordinal_transformer_degree = _ordinal_pipeline(degrees)
ordinal_transformer_title = _ordinal_pipeline(titles)

# Columns not named in `transformers` are dropped (ColumnTransformer's
# default remainder).
# NOTE(review): only the first entry of numeric_features is scaled
# (`numeric_features[:1]`) -- confirm the remaining numeric columns are
# meant to be left out.
# NOTE(review): the degree/title encoders select columns by position (1 and
# 0) rather than by name -- confirm these positions match the frame that is
# passed to fit/predict.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features[:1]),
        ('degree_ordinal', ordinal_transformer_degree, [1]),
        ('title_ordinal', ordinal_transformer_title, [0]),
    ]
)

### Establish Baseline

In [None]:
# Baseline model: the shared preprocessor followed by ordinary least squares
# (n_jobs=-1 lets LinearRegression use all available cores).
baseline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression(n_jobs=-1)),
]
lm = Pipeline(steps=baseline_steps)


In [None]:
# Fit the baseline on the training split and predict the held-out rows.
# Pipeline.fit returns the fitted pipeline, so the calls can be chained.
y_pred = lm.fit(train_X, train_y).predict(test_X)

In [None]:
# Held-out mean squared error of the baseline; shown as the cell's output.
mean_squared_error(y_true=test_y, y_pred=y_pred)

##### MSE for linear regression
1285.4

### Prep Light GBM Model pipelines and parameter search

In [None]:
# Search space for the LightGBM regressor. Every key carries the
# 'regressor__' prefix so the value is routed to the pipeline step of that
# name. Each value is the list of candidates the randomized search samples
# from.
param_grid = {
    'regressor__n_estimators': list(range(300, 600, 50)),
    'regressor__num_leaves': list(range(5, 50, 10)),
    # disabled: 'regressor__max_depth': [x for x in range(20,80,10)],
    'regressor__learning_rate': list(np.arange(.2, .35, .03)),
    'regressor__max_bin': list(range(100, 400, 10)),
    'regressor__reg_alpha': [.1, .2, .3, .4, .5],
    'regressor__reg_lambda': [.1, .2, .3, .4, .5, .6, .7],
    'regressor__boosting_type': ['dart'],
    'regressor__subsample_for_bin': list(range(200000, 350000, 10000)),
    'regressor__min_split_gain': list(np.arange(.01, .5, .05)),
    'regressor__min_child_weight': list(np.arange(.05, .5, .05)),
    'regressor__min_child_samples': list(range(5, 30, 5)),
    # disabled: 'regressor__colsample_bytree': [x for x in np.arange(1.0,2.5,.5)]
}

In [None]:
# LightGBM model: the shared preprocessor followed by an LGBMRegressor with
# default settings; the search supplies the hyper-parameters to try.
lgb_steps = [
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor()),
]
lgb_reg = Pipeline(steps=lgb_steps)

In [None]:
# Randomized hyper-parameter search over `param_grid`: 3 sampled candidates,
# each scored by 5-fold cross-validation on negated MSE (scikit-learn
# maximises scores, so a higher, i.e. less negative, value is better).
# random_state fixes which candidates are drawn so that re-running the
# notebook evaluates the same configurations.
lgb_random = RandomizedSearchCV(
    estimator=lgb_reg,
    param_distributions=param_grid,
    n_iter=3,
    cv=5,
    verbose=0,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    random_state=42,
)

In [None]:
# Run the randomized search on the training split, then report the winner so
# the recorded score can be checked against the notebook's own output.
lgb_random.fit(train_X, train_y)

# The search is scored with 'neg_mean_squared_error', so best_score_ is the
# negated cross-validated MSE; flip the sign to print a plain MSE.
print(-lgb_random.best_score_)
print(lgb_random.best_params_)

##### Best cross-validated MSE from the randomized search
355.1

In [None]:
# Take the best pipeline found by the search and score it on the held-out
# split.
lgb_best = lgb_random.best_estimator_
ypred = lgb_best.predict(test_X)
holdout_mse = mean_squared_error(y_true=test_y, y_pred=ypred)
print(holdout_mse)

##### Test-set MSE for the LightGBM model
356.5

In [None]:
# Persist the best pipeline from the search to disk so it can be reloaded
# later without re-running the search.
import joblib

joblib.dump(value=lgb_random.best_estimator_, filename='lightgbm.pkl')




In [None]:
# Read the saved pipeline back from disk.
# The pipeline contains PandasToNumpy steps, so that class must already be
# defined in this session for unpickling to succeed.

import joblib

lgbm = joblib.load(filename='lightgbm.pkl')

In [None]:

# Score the reloaded pipeline on the held-out split.
ypred = lgbm.predict(test_X)
reloaded_mse = mean_squared_error(y_true=test_y, y_pred=ypred)
print(reloaded_mse)