In [1]:
#import your libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures,StandardScaler, OrdinalEncoder, LabelEncoder,KBinsDiscretizer,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,ElasticNetCV,RidgeCV
from sklearn.model_selection import cross_validate,RandomizedSearchCV
import xgboost as xgb
import lightgbm as lgb

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


__author__ = "Jared Teerlink"
__email__ = "jteerlink@gmail.com"

In [2]:
#load the data into a Pandas dataframe
train_features = pd.read_csv('../Data/train_features.csv')
train_salaries = pd.read_csv('../Data/train_salaries.csv')
# test_features = pd.read_csv('../Data/test_features.csv')


In [3]:
#look for duplicate data, invalid data (e.g. salaries <=0), or corrupt data and remove it



train_combined = train_features.merge(train_salaries, on = 'jobId', how = 'left')
train_combined = train_combined[train_combined.salary > 0]


Unnamed: 0,jobId,companyId,jobType,degree,major,industry,yearsExperience,milesFromMetropolis,salary
0,JOB1362684407687,COMP37,CFO,MASTERS,MATH,HEALTH,10,83,130
1,JOB1362684407688,COMP19,CEO,HIGH_SCHOOL,NONE,WEB,3,73,101
2,JOB1362684407689,COMP52,VICE_PRESIDENT,DOCTORAL,PHYSICS,HEALTH,10,38,137
3,JOB1362684407690,COMP38,MANAGER,DOCTORAL,CHEMISTRY,AUTO,8,17,142
4,JOB1362684407691,COMP7,VICE_PRESIDENT,BACHELORS,PHYSICS,FINANCE,8,16,163


In [11]:
# train_combined.drop(labels = ['jobId','companyId','major'],inplace=True,axis=1)
                      
train = train_combined.drop(['salary'],axis=1)


target = train_combined['salary']

In [12]:
train.head()

Unnamed: 0,jobType,degree,industry,yearsExperience,milesFromMetropolis
0,CFO,MASTERS,HEALTH,10,83
1,CEO,HIGH_SCHOOL,WEB,3,73
2,VICE_PRESIDENT,DOCTORAL,HEALTH,10,38
3,MANAGER,DOCTORAL,AUTO,8,17
4,VICE_PRESIDENT,BACHELORS,FINANCE,8,16


In [25]:
numeric_features = train_combined.select_dtypes(include=['int64', 'float64']).drop(['salary'],axis = 1).columns
categorical_features = train_combined.select_dtypes(include=['object']).columns

In [30]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

dummy_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop  = 'first'))])

#TODO - add pipeline for ordinal tranformations
ordinal_transformer = Pipeline(steps = [
    'ordinal'
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [31]:
lm = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', LinearRegression(n_jobs=-1))])


In [32]:
train_X, test_X, train_y, test_y = train_test_split(train,target, test_size = .25)

In [51]:
lm.fit(train_X,train_y)
y_pred = lm.predict(test_X)

In [52]:
mean_squared_error(test_y,y_pred)

398.3303562200163

In [39]:
param_grid = {'regressor__n_estimators': [x for x in range(300,600,50)],
          'regressor__num_leaves':[x for x in range(5,50,10)],
#           'regressor__max_depth':[x for x in range(20,80,10)],
          'regressor__learning_rate':[x for x in np.arange(.2,.35,.03)],
          'regressor__max_bin': [x for x in range(100,400,10)],
          'regressor__reg_alpha': [.1, .2, .3, .4, .5],
          'regressor__reg_lambda': [.1, .2, .3, .4, .5, .6, .7],
          'regressor__boosting_type': ['dart'],
          'regressor__subsample_for_bin':[x for x in range(200000,350000,10000)],
               'regressor__min_split_gain' : [x for x in np.arange(.01,.5,.05)],
              'regressor__min_child_weight'  : [x for x in np.arange(.05,.5,.05)],
              'regressor__min_child_samples' : [x for x in range(5,30,5)]
#               'regressor__colsample_bytree'  : [x for x in np.arange(1.0,2.5,.5)]
             }

In [40]:
lgb_reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', lgb.LGBMRegressor())])

In [41]:
lgb_random = RandomizedSearchCV(estimator = lgb_reg, 
                               param_distributions = param_grid, 
                               n_iter = 3, 
                               cv = 5, 
                               verbose=3, 
                               n_jobs = -1,
                               scoring = 'neg_mean_squared_error')

In [42]:
lgb_random.fit(train_X,train_y)
print(lgb_random.best_score_)
print(lgb_random.best_params_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 21.2min finished


-378.61767141213267
{'regressor__subsample_for_bin': 240000, 'regressor__reg_lambda': 0.3, 'regressor__reg_alpha': 0.1, 'regressor__num_leaves': 15, 'regressor__n_estimators': 400, 'regressor__min_split_gain': 0.26, 'regressor__min_child_weight': 0.2, 'regressor__min_child_samples': 20, 'regressor__max_bin': 210, 'regressor__learning_rate': 0.32, 'regressor__boosting_type': 'dart'}


In [43]:
lgb_best = lgb_random.best_estimator_
# lgb_best.predict(X_test)
ypred = lgb_best.predict(test_X)
print(mean_squared_error(test_y,ypred))

380.61510889534287


ValueError: DataFrame constructor not properly called!