In [1]:
#import your libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures,StandardScaler, OrdinalEncoder, LabelEncoder,KBinsDiscretizer,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,ElasticNetCV,RidgeCV
from sklearn.model_selection import cross_validate,RandomizedSearchCV
import xgboost as xgb
import lightgbm as lgb

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


__author__ = "Jared Teerlink"
__email__ = "jteerlink@gmail.com"

In [2]:
#load the data into a Pandas dataframe
train_features = pd.read_csv('../Data/train_features.csv')
train_salaries = pd.read_csv('../Data/train_salaries.csv')
# test_features = pd.read_csv('../Data/test_features.csv')


In [9]:
#look for duplicate data, invalid data (e.g. salaries <=0), or corrupt data and remove it

train_combined = train_features.merge(train_salaries, on = 'jobId', how = 'left')
train_combined = train_combined[train_combined.salary > 0]


In [57]:
numeric_features = train_combined.select_dtypes(include=['int64', 'float64']).drop(['salary'],axis = 1).columns
categorical_features = train_combined.select_dtypes(include=['object']).columns

categorical_features

Index(['jobType', 'degree', 'major', 'industry'], dtype='object')

In [93]:
# train_combined.drop(labels = ['jobId','companyId'],inplace=True,axis=1)
                      
# train = train_combined.drop(['salary'],axis=1)
train = train_combined.drop(categorical_features,axis=1)
train = train.drop(['salary'],axis=1)
target = train_combined['salary']


In [92]:
train.head()

Unnamed: 0,yearsExperience,milesFromMetropolis,salary,jobType_mean,jobType_max,jobType_min,jobType_median,jobType_std,degree_mean,degree_max,...,major_mean,major_max,major_min,major_median,major_std,industry_mean,industry_max,industry_min,industry_median,industry_std
0,10,83,130,135.458547,301,51,132,33.069203,130.505647,301,...,133.322042,280,39,130,35.539246,115.73554,275,28,114,36.865179
1,3,73,101,145.311425,298,55,142,34.423846,101.921085,260,...,102.583864,270,17,100,36.136268,121.645362,290,31,119,38.165659
2,10,38,137,125.36863,272,44,122,31.956295,135.490979,301,...,130.372436,293,38,127,34.956981,115.73554,275,28,114,36.865179
3,8,17,142,115.368518,270,37,112,30.833865,135.490979,301,...,129.072085,282,40,126,34.832396,109.435222,264,25,108,36.09159
4,8,16,163,125.36863,272,44,122,31.956295,125.454663,294,...,130.372436,293,38,127,34.956981,130.747659,294,36,128,38.31982


In [23]:
#engineer potential features
for cat in categorical_features:
    cat_stats =train_combined['salary'].groupby(train_combined[cat]).agg({f'{cat}_mean':'mean',f'{cat}_max':'max',f'{cat}_min':'min',f'{cat}_median':'median',f'{cat}_std':'std'})
    train_combined = train_combined.merge(cat_stats,how = 'left',left_on = cat, right_on=cat_stats.index)

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  This is separate from the ipykernel package so we can avoid doing imports until


In [60]:
# set up pipelines
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# dummy_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(drop  = 'first'))])

# ordinal_transformer = Pipeline(steps = [
#     'ordinal'])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
#         ('cat', dummy_transformer, categorical_features)
    ])

In [81]:
lm = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', LinearRegression(n_jobs=-1))])


In [94]:
train_X, test_X, train_y, test_y = train_test_split(train,target, test_size = .25)

In [95]:
lm.fit(train_X,train_y)
y_pred = lm.predict(test_X)

In [96]:
mean_squared_error(test_y,y_pred)

385.59400641710795

In [66]:
param_grid = {'regressor__n_estimators': [x for x in range(300,600,50)],
          'regressor__num_leaves':[x for x in range(5,50,10)],
#           'regressor__max_depth':[x for x in range(20,80,10)],
          'regressor__learning_rate':[x for x in np.arange(.2,.35,.03)],
          'regressor__max_bin': [x for x in range(100,400,10)],
          'regressor__reg_alpha': [.1, .2, .3, .4, .5],
          'regressor__reg_lambda': [.1, .2, .3, .4, .5, .6, .7],
          'regressor__boosting_type': ['dart'],
          'regressor__subsample_for_bin':[x for x in range(200000,350000,10000)],
               'regressor__min_split_gain' : [x for x in np.arange(.01,.5,.05)],
              'regressor__min_child_weight'  : [x for x in np.arange(.05,.5,.05)],
              'regressor__min_child_samples' : [x for x in range(5,30,5)]
#               'regressor__colsample_bytree'  : [x for x in np.arange(1.0,2.5,.5)]
             }

In [85]:
lgb_reg = Pipeline(steps=[#('preprocessor', preprocessor),
                      ('regressor', lgb.LGBMRegressor())])

In [86]:
lgb_random = RandomizedSearchCV(estimator = lgb_reg, 
                               param_distributions = param_grid, 
                               n_iter = 3, 
                               cv = 5, 
                               verbose=0, 
                               n_jobs = 3,
                               scoring = 'neg_mean_squared_error')

In [98]:
lgb_random.fit(train_X,train_y)
print(lgb_random.best_score_)
print(lgb_random.best_params_)

-355.49015651214535
{'regressor__subsample_for_bin': 300000, 'regressor__reg_lambda': 0.3, 'regressor__reg_alpha': 0.1, 'regressor__num_leaves': 35, 'regressor__n_estimators': 400, 'regressor__min_split_gain': 0.36000000000000004, 'regressor__min_child_weight': 0.3, 'regressor__min_child_samples': 20, 'regressor__max_bin': 220, 'regressor__learning_rate': 0.2, 'regressor__boosting_type': 'dart'}


In [99]:
lgb_best = lgb_random.best_estimator_
ypred = lgb_best.predict(test_X)
print(mean_squared_error(test_y,ypred))

356.51022132131624


In [100]:
#save model for later import
import joblib
joblib.dump(lgb_random.best_estimator_, 'lightgbm.pkl')




['lightgbm.pkl']

In [82]:
#read in model to avoid retraining every time

import joblib
lgbm = joblib.load('lightgbm.pkl')

In [91]:
# lgb_best = lgb_random.best_estimator_
# lgb_best.predict(X_test)
ypred = lgbm.predict(test_X)
print(mean_squared_error(test_y,ypred))

355.15625277015005
