In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import os
import warnings
import sys

In [3]:
# Add a project-local source directory to the import path (presumably where
# `custom_estimator`, imported in the next cell, lives — confirm).
# NOTE(review): absolute, machine-specific path; make it relative/configurable before sharing.
sys.path.append('/home/mondzi259/Apna_Assignment/src/ml_auto/')

In [4]:
from custom_estimator import Estimator
from lightgbm import LGBMRegressor
from sklearn.preprocessing import OneHotEncoder

In [5]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [6]:
# Widen pandas' display limits (rows, columns, console width) in a single call;
# set_option accepts alternating (option, value) pairs.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

In [7]:
# Directory containing the input workbook that is read in the next cell.
# NOTE(review): absolute, machine-specific path.
DATA_DIR = '/home/mondzi259/Apna_Assignment/data/'

In [8]:
# Read the first sheet of data.xlsx (under DATA_DIR) into df, then display its (rows, columns) shape.
df = pd.read_excel(os.path.join(DATA_DIR,'data.xlsx'),sheet_name=0)
df.shape

(30000, 19)

In [9]:
# Remove the 'feebackgiven' column from df in place.
# NOTE(review): not idempotent — running this cell a second time on the same df
# raises KeyError because the column no longer exists.
df.drop('feebackgiven',axis=1,inplace=True)

In [10]:
df.head()

Unnamed: 0,shift,gender,education,created_on,dow,employer_type,applicant_location,city,area,organization,deposit,category,english,num_openings,max_salary,min_salary,is_part_time,leads_per_opening
0,0.0,0.0,3.0,413.0,3.0,0.0,0.0,1.0,4119.0,31278.0,0.0,55.0,2.0,22.0,21000.0,,0.0,0.090909
1,0.0,0.0,3.0,529.0,0.0,4.0,0.0,2.0,2017.0,32006.0,0.0,54.0,2.0,2.0,80000.0,,0.0,31.5
2,0.0,0.0,0.0,457.0,0.0,4.0,0.0,3.0,1500.0,32029.0,0.0,42.0,3.0,1.0,3500.0,,1.0,9.0
3,0.0,0.0,4.0,450.0,3.0,4.0,0.0,5.0,638.0,9925.0,0.0,60.0,4.0,4.0,11500.0,,0.0,0.0
4,0.0,0.0,0.0,480.0,0.0,1.0,0.0,2.0,4115.0,8400.0,0.0,16.0,3.0,25.0,25000.0,,0.0,1.4


### Ideas:

3. Categorical encodings: one-hot (OHE), label (LE), frequency (FE), target (TE)
6. Missing `min_salary`; `applicant_location` = 0; compare with and without imputation
8. API
9. Explainability
10. Local/Global feature importance
11. Outliers/score

### Column groups

In [11]:
# Split df's columns into numeric vs. categorical, and the categoricals into
# ordinal vs. non-ordinal groups. Column order of df is preserved in the
# derived lists.
num_cols = [
    'leads_per_opening',
    'max_salary',
    'min_salary',
    'deposit',
    'num_openings',
    'created_on',
]
cat_ordinal_cols = ['applicant_location', 'english', 'education']
# Every column not declared numeric above is treated as categorical.
cat_cols = [name for name in df.columns.tolist() if name not in num_cols]
cat_non_ord_cols = [name for name in cat_cols if name not in cat_ordinal_cols]

In [12]:
cat_non_ord_cols,cat_ordinal_cols

(['shift',
  'gender',
  'dow',
  'employer_type',
  'city',
  'area',
  'organization',
  'category',
  'is_part_time'],
 ['applicant_location', 'english', 'education'])

In [13]:
df[cat_cols].nunique()

shift                     2
gender                    3
education                 6
dow                       7
employer_type             5
applicant_location        4
city                      7
area                   2740
organization          14886
category                 67
english                   4
is_part_time              2
dtype: int64

In [14]:
class FreqEnc:
    """Frequency encoder for categorical columns.

    For each column ``c`` in ``cat_freq_cols`` the encoder learns, at fit time,
    the share of rows holding each distinct value, and at transform time writes
    that share into a new column named ``c + "_fe"``.
    """

    def __init__(self,cat_freq_cols):
        # Names of the columns to frequency-encode.
        self.cat_freq_cols = cat_freq_cols

    def fit(self,df):
        """Learn the relative frequency of every value in each configured column.

        Stores ``{column: {value: share}}`` in ``self.freq_encoding_dict`` and
        returns ``self`` so calls can be chained.
        """
        self.freq_encoding_dict = {
            col: df[col].value_counts(normalize=True).to_dict()
            for col in self.cat_freq_cols
        }
        return self

    def transform(self,df):
        """Return a copy of ``df`` with one ``<col>_fe`` column per encoded column.

        Values that were not seen during ``fit`` (including missing values) are
        encoded as 0. The frame passed in is left untouched, so re-running a
        cell never leaves stale ``_fe`` columns on the caller's object.
        """
        encoded = df.copy()
        for col in self.cat_freq_cols:
            encoded[col+"_fe"] = encoded[col].map(self.freq_encoding_dict[col]).fillna(0)
        return encoded

    def fit_transform(self,df):
        """Fit on ``df`` and return its encoded copy."""
        return self.fit(df).transform(df)

In [15]:
# Order the rows by created_on and renumber the index 0..n-1.
df=df.sort_values('created_on').reset_index(drop=True)

In [16]:
df.shape

(30000, 18)

In [17]:
# Modelling target: log(1 + leads_per_opening).
df['target'] = np.log1p(df.leads_per_opening)

In [18]:
# Keep only rows whose created_on is at or below its 90th percentile; rows above
# that cut-off are dropped from df from here on. The index is renumbered so
# labels equal row positions again.
df = df[df.created_on<=df.created_on.quantile(0.9)].reset_index(drop=True)

In [19]:
def _time_window_folds(frame):
    """Build three (train, validation) splits of ``frame`` as lists of index labels.

    Windows are defined on quantiles of ``created_on``:
      1. train <= q0.7            , validation in (q0.7, q0.8]
      2. train in [q0.1, q0.8]    , validation in (q0.8, q0.9]
      3. train in [q0.2, q0.9]    , validation in (q0.9, q1]
    """
    stamp = frame.created_on
    # Each quantile cut-off is computed once and reused for every window edge.
    cut = {q: stamp.quantile(q) for q in (0.1, 0.2, 0.7, 0.8, 0.9, 1)}

    def labels(mask):
        # Index labels of the rows selected by a boolean mask.
        return frame[mask].index.tolist()

    return [
        (
            labels(stamp <= cut[0.7]),
            labels((stamp > cut[0.7]) & (stamp <= cut[0.8])),
        ),
        (
            labels((stamp >= cut[0.1]) & (stamp <= cut[0.8])),
            labels((stamp > cut[0.8]) & (stamp <= cut[0.9])),
        ),
        (
            labels((stamp >= cut[0.2]) & (stamp <= cut[0.9])),
            labels((stamp > cut[0.9]) & (stamp <= cut[1])),
        ),
    ]


folds = _time_window_folds(df)

In [20]:
[(df.iloc[i].shape,df.iloc[j].shape) for i,j in folds]

[((19194, 19), (2464, 19)),
 ((18964, 19), (2778, 19)),
 ((19081, 19), (2597, 19))]

In [21]:
target = df.target.values

In [22]:
# Feature columns passed to the model via df[cols]: raw columns plus engineered
# frequency-encoded (*_fe), one-hot (*_ohe) and group-aggregate (*_grpby_and_*)
# columns. The engineered columns are only added to df by the cells further down,
# so this list must not be used to index df before those cells have run.
cols = ['applicant_location', 'applicant_location_mean_grpby_and_category_city', 
        'applicant_location_min_grpby_and_organization',
        'applicant_location_std_grpby_and_category_dow', 'applicant_location_std_grpby_and_organization', 
        'area_fe', 'category_area_fe', 'category_fe', 'category_organization_fe', 'created_on', 'education',
        'education_fe', 'education_mean_grpby_and_category', 'education_median_grpby_and_city',
        'education_min_grpby_and_area', 'education_std_grpby_and_area', 'education_std_grpby_and_organization', 
        'employer_type_3.0_ohe', 'english', 'gender_fe', 'max_salary', 'max_salary_mean_grpby_and_category',
        'max_salary_min_grpby_and_city', 'max_salary_std_grpby_and_category', 
        'max_salary_std_grpby_and_category_city', 'max_salary_std_grpby_and_category_dow', 
        'min_salary_max_grpby_and_category', 'min_salary_max_grpby_and_category_city', 
        'min_salary_max_grpby_and_organization', 'min_salary_mean_grpby_and_area',
       'min_salary_mean_grpby_and_category_city', 'min_salary_std_grpby_and_category_dow', 'num_openings', 
        'num_openings_max_grpby_and_area', 'num_openings_mean_grpby_and_category_dow', 
        'num_openings_mean_grpby_and_city', 'num_openings_median_grpby_and_category_dow', 
        'num_openings_median_grpby_and_city', 'num_openings_std_grpby_and_area', 
        'num_openings_std_grpby_and_category_city', 'num_openings_std_grpby_and_organization', 'organization_fe']

### Adding cat-cat interactions

In [23]:
def get_derived_cat(df,a,b):
    """Add an interaction column ``<a>_<b>`` to ``df`` and return ``df``.

    The new column holds the string form of column ``a``, an underscore, and
    the string form of column ``b``. ``df`` itself is modified (the column is
    added in place) and the same object is returned.
    """
    left_text = df[a].astype('str')
    right_text = df[b].astype('str')
    df['_'.join((a, b))] = left_text + '_' + right_text
    return df

In [24]:
def gen_cat_cat(df):
    """Add the category x {organization, area, city, dow} interaction columns.

    Delegates to ``get_derived_cat`` once per partner column, in that order,
    and returns the resulting frame.
    """
    for partner in ('organization', 'area', 'city', 'dow'):
        df = get_derived_cat(df, 'category', partner)
    return df

In [25]:
df = gen_cat_cat(df)

In [26]:
# Columns that get a frequency-encoded "<name>_fe" counterpart.
cat_freq_cols = [
    'area', 'category_area', 'category', 'category_organization',
    'education', 'gender', 'organization',
]

In [27]:
fe = FreqEnc(cat_freq_cols=cat_freq_cols)

df = fe.fit_transform(df)

In [28]:
ohe_cols = ['employer_type']

In [29]:
class OHE:
    """One-hot encoder wrapper that appends ``<col>_<category>_ohe`` columns to a frame.

    The categories are learned once in ``fit``; ``transform`` only applies them,
    so the set and order of output columns is the same for every frame.
    """

    def __init__(self,ohe_cols):
        # Names of the columns to one-hot encode.
        self.ohe_cols = ohe_cols
        # Newer scikit-learn releases spell the dense-output flag ``sparse_output``
        # and no longer accept ``sparse``; try the new name first and fall back
        # to the old one so the class works on both.
        try:
            self.ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        except TypeError:
            self.ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")

    def fit(self,df):
        """Learn the categories of ``ohe_cols`` from ``df``; returns ``self``."""
        self.ohe.fit(df[self.ohe_cols])
        return self

    def transform(self,df):
        """Return ``df`` with the one-hot columns appended on the right.

        Uses the categories learned in ``fit`` (the encoder is NOT refitted
        here); categories unseen at fit time encode as all zeros because the
        encoder was built with ``handle_unknown="ignore"``.
        """
        encoded = self.ohe.transform(df[self.ohe_cols])
        encoded = pd.DataFrame(
            encoded,
            columns=[
                self.ohe_cols[i] + '_' + str(j) + '_ohe'
                for i in range(len(self.ohe_cols))
                for j in self.ohe.categories_[i]
            ],
            # Reuse the caller's index so the column-wise concat lines rows up
            # even when ``df`` does not carry a default 0..n-1 index.
            index=df.index,
        )
        return pd.concat([df, encoded], axis=1)

    def fit_transform(self,df):
        """Fit on ``df`` and return its encoded version."""
        return self.fit(df).transform(df)

In [30]:
ohe = OHE(ohe_cols=ohe_cols)
df = ohe.fit_transform(df)

In [32]:
# Group-aggregate feature specification consumed by CatNumAgg:
#   {group-by column: {column to aggregate: [aggregation function names]}}
# Each (group, column, function) triple becomes one feature named
# "<column>_<function>_grpby_and_<group>".
cat_num_agg_dict = {'area':{'education':['min','std'],
         'num_openings':['max','std'],
         'min_salary':['mean']},
 'organization':{'applicant_location':['min','std'],
                'education':['std'],
                'num_openings':['std'],
                'min_salary':['max']},
 'city':{'num_openings':['mean','median'],
        'education':['median'],
        'max_salary':['min']},
 'category_city':{'applicant_location':['mean'],
                'max_salary':['std'],
                'min_salary':['max','mean'],
                'num_openings':['std']},
 'category_dow':{'applicant_location':['std'],
                'max_salary':['std'],
                'min_salary':['std'],
                'num_openings':['mean','median']},
 'category':{'education':['mean'],
             'max_salary':['mean','std'],
             'min_salary':['max']}
}

In [33]:
class CatNumAgg:
    """Group-wise aggregate features.

    ``cat_num_agg_dict`` maps a group-by column to ``{column: [agg names]}``.
    ``fit`` computes one lookup table per group-by column; ``transform``
    left-joins every table onto the frame. Feature columns are named
    ``<column>_<agg>_grpby_and_<group>``.
    """

    def __init__(self,cat_num_agg_dict):
        self.cat_num_agg_dict = cat_num_agg_dict

    def fit(self,df):
        """Compute and store the per-group aggregate tables (as plain dicts)."""
        self.encoding_dict = {}
        for key, spec in self.cat_num_agg_dict.items():
            pieces = []
            names = [key]
            for target, funcs in spec.items():
                pieces.append(df.groupby(key)[target].agg(funcs))
                for func in funcs:
                    names.append('_'.join((target, func, 'grpby_and', key)))
            # One row per group value; the group key becomes an ordinary column.
            table = pd.concat(pieces, axis=1).reset_index()
            table.columns = names
            self.encoding_dict[key] = table.to_dict()

    def transform(self,df):
        """Left-join every stored aggregate table onto ``df`` and return the result."""
        merged = df
        for key, table in self.encoding_dict.items():
            lookup = pd.DataFrame(table)
            merged = merged.merge(lookup, how="left", on=key)
        return merged

    def fit_transform(self,df):
        """Fit on ``df`` and return ``df`` with the aggregate features attached."""
        self.fit(df)
        return self.transform(df)

In [34]:
catnumagg = CatNumAgg(cat_num_agg_dict=cat_num_agg_dict)

In [None]:
# (Stray, never-executed fragment removed: the incomplete attribute access on
# `os` that stood here would raise AttributeError under Restart & Run All.)

In [35]:
df = catnumagg.fit_transform(df)

In [36]:
df.shape

(27033, 63)

In [37]:
# Keyword arguments for LGBMRegressor (unpacked with ** when the model is built).
params = dict(
    boosting_type='gbdt',
    colsample_bytree=0.4,
    learning_rate=0.1,
    min_child_samples=110,
    n_estimators=10000,
    n_jobs=-1,
    num_leaves=16,
    objective='regression',
    subsample=1.0,
    subsample_freq=10,
)

In [38]:
# Wrap a LightGBM regressor (built from `params`) in the project's Estimator,
# passing the predefined `folds` as the validation scheme and
# early_stopping_rounds=100.
# NOTE(review): shuffle=True is passed together with an explicit
# validation_scheme — confirm in custom_estimator whether it has any effect here.
est = Estimator(model=LGBMRegressor(**params),
               early_stopping_rounds=100,
               validation_scheme=folds,
               shuffle=True)

In [40]:
est.get_repeated_out_of_folds(df[cols].values,target)

[100]	valid_0's rmse: 0.830725	valid_0's l2: 0.690104	valid_1's rmse: 0.828755	valid_1's l2: 0.686835
[200]	valid_0's rmse: 0.817524	valid_0's l2: 0.668346	valid_1's rmse: 0.781996	valid_1's l2: 0.611518
[300]	valid_0's rmse: 0.815007	valid_0's l2: 0.664237	valid_1's rmse: 0.750422	valid_1's l2: 0.563133
[100]	valid_0's rmse: 0.901607	valid_0's l2: 0.812894	valid_1's rmse: 0.821344	valid_1's l2: 0.674606
[200]	valid_0's rmse: 0.892298	valid_0's l2: 0.796196	valid_1's rmse: 0.773749	valid_1's l2: 0.598688
[100]	valid_0's rmse: 0.829818	valid_0's l2: 0.688598	valid_1's rmse: 0.826618	valid_1's l2: 0.683297
[200]	valid_0's rmse: 0.81782	valid_0's l2: 0.668829	valid_1's rmse: 0.777239	valid_1's l2: 0.604101
[300]	valid_0's rmse: 0.813138	valid_0's l2: 0.661194	valid_1's rmse: 0.744903	valid_1's l2: 0.55488
[400]	valid_0's rmse: 0.812868	valid_0's l2: 0.660754	valid_1's rmse: 0.719336	valid_1's l2: 0.517444
[500]	valid_0's rmse: 0.814631	valid_0's l2: 0.663624	valid_1's rmse: 0.696551	valid

{'cv_scores': [0.8146425043465393, 0.892241078046095, 0.812297027913692],
 'avg_cv_score': 0.8397268701021088,
 'std_scores': 0.0371454962897023,
 'overall_cv_score': 1.7173698631833838,
 'eval_score': 0.8397268701021088}

In [41]:
# Feature-importance table from the fitted estimator, labelled with `cols`,
# plus a running total of feature_importance (cum_imp) in the row order the
# estimator returned.
feat_imps = est.feature_importances(columns = cols) 
feat_imps['cum_imp']=feat_imps.feature_importance.cumsum()
feat_imps

Unnamed: 0,column,feature_importance,rank,cum_imp
20,max_salary,0.054911,1,0.054911
9,created_on,0.047953,2,0.102864
32,num_openings,0.045247,3,0.148111
1,applicant_location_mean_grpby_and_category_city,0.044281,4,0.192392
15,education_std_grpby_and_area,0.043473,5,0.235865
29,min_salary_mean_grpby_and_area,0.043134,6,0.279
40,num_openings_std_grpby_and_organization,0.040218,7,0.319218
7,category_fe,0.03739,8,0.356608
38,num_openings_std_grpby_and_area,0.035057,9,0.391665
24,max_salary_std_grpby_and_category_city,0.034403,10,0.426067


In [60]:
est.save_model(file_name='model.pkl')

'model.pkl'

In [61]:
import pickle

In [63]:
# Read every pickled object stored back-to-back in model.pkl until the file is exhausted.
# NOTE(review): pickle.load executes arbitrary code — only use on files you created yourself.
objects = []
with open("model.pkl", "rb") as openfile:
    try:
        while True:
            objects.append(pickle.load(openfile))
    except EOFError:
        # End of file reached: all objects have been read.
        pass

In [67]:
# Load the first pickled object from model.pkl into model_obj.
# NOTE(review): pickle.load executes arbitrary code — only use on trusted files.
with (open("model.pkl", "rb")) as openfile:
    model_obj = pickle.load(openfile)

In [78]:
from custom_estimator import scoring_metric

In [110]:
# Ensemble prediction: element-wise mean of the predictions of every model in
# model_obj['fitted_models'], computed over all rows of df.
np.mean([i.predict(df[cols].values) for i in model_obj['fitted_models']],axis=0)

array([2.06355231, 2.90850499, 2.06281921, ..., 0.42775672, 0.44966054,
       2.49591249])

In [80]:
# Score each fitted model separately against `target` over ALL rows of df.
# NOTE(review): presumably each model was trained on its fold's train rows, which
# are part of df — so these scores are partly in-sample and not a held-out
# estimate; compare with the cv_scores reported by get_repeated_out_of_folds.
[scoring_metric(target,i.predict(df[cols].values)) for i in model_obj['fitted_models']]

[0.7812609945440517, 0.8047984200980185, 0.7550112757796115]

In [72]:
model_obj['params'].keys()

dict_keys(['model', 'n_splits', 'random_state', 'shuffle', 'n_jobs', 'early_stopping_rounds', 'variance_penalty', 'validation_scheme', 'cv_group_col'])

In [73]:
est = Estimator(**model_obj['params'])

In [118]:
est.get_repeated_out_of_folds(df[cols].values,target)

[100]	valid_0's rmse: 0.830725	valid_0's l2: 0.690104	valid_1's rmse: 0.828755	valid_1's l2: 0.686835
[200]	valid_0's rmse: 0.817524	valid_0's l2: 0.668346	valid_1's rmse: 0.781996	valid_1's l2: 0.611518
[300]	valid_0's rmse: 0.815007	valid_0's l2: 0.664237	valid_1's rmse: 0.750422	valid_1's l2: 0.563133
[100]	valid_0's rmse: 0.901607	valid_0's l2: 0.812894	valid_1's rmse: 0.821344	valid_1's l2: 0.674606
[200]	valid_0's rmse: 0.892298	valid_0's l2: 0.796196	valid_1's rmse: 0.773749	valid_1's l2: 0.598688
[100]	valid_0's rmse: 0.829818	valid_0's l2: 0.688598	valid_1's rmse: 0.826618	valid_1's l2: 0.683297
[200]	valid_0's rmse: 0.81782	valid_0's l2: 0.668829	valid_1's rmse: 0.777239	valid_1's l2: 0.604101
[300]	valid_0's rmse: 0.813138	valid_0's l2: 0.661194	valid_1's rmse: 0.744903	valid_1's l2: 0.55488
[400]	valid_0's rmse: 0.812868	valid_0's l2: 0.660754	valid_1's rmse: 0.719336	valid_1's l2: 0.517444
[500]	valid_0's rmse: 0.814631	valid_0's l2: 0.663624	valid_1's rmse: 0.696551	valid

{'cv_scores': [0.8146425043465393, 0.892241078046095, 0.812297027913692],
 'avg_cv_score': 0.8397268701021088,
 'std_scores': 0.0371454962897023,
 'overall_cv_score': 1.7173698631833838,
 'eval_score': 0.8397268701021088}

In [105]:
# Two dummy input rows (one value per row for every raw feature) used to try out
# DataFrame construction: the first row is 5 everywhere except is_part_time (10),
# the second row is 100 everywhere.
item = {
    name: [10 if name == 'is_part_time' else 5, 100]
    for name in (
        'shift', 'gender', 'education', 'created_on', 'dow', 'employer_type',
        'applicant_location', 'city', 'area', 'organization', 'deposit',
        'category', 'english', 'num_openings', 'max_salary', 'min_salary',
        'is_part_time',
    )
}

In [106]:
pd.DataFrame(item)

Unnamed: 0,shift,gender,education,created_on,dow,employer_type,applicant_location,city,area,organization,deposit,category,english,num_openings,max_salary,min_salary,is_part_time
0,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,10
1,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100


In [109]:
model_obj

{'fitted_models': [LGBMRegressor(colsample_bytree=0.4, min_child_samples=110, n_estimators=10000,
                num_leaves=16, objective='regression', subsample_freq=10),
  LGBMRegressor(colsample_bytree=0.4, min_child_samples=110, n_estimators=10000,
                num_leaves=16, objective='regression', subsample_freq=10),
  LGBMRegressor(colsample_bytree=0.4, min_child_samples=110, n_estimators=10000,
                num_leaves=16, objective='regression', subsample_freq=10)],
 'params': {'model': ('LGBMRegressor',
   {'boosting_type': 'gbdt',
    'class_weight': None,
    'colsample_bytree': 0.4,
    'importance_type': 'split',
    'learning_rate': 0.1,
    'max_depth': -1,
    'min_child_samples': 110,
    'min_child_weight': 0.001,
    'min_split_gain': 0.0,
    'n_estimators': 10000,
    'n_jobs': -1,
    'num_leaves': 16,
    'objective': 'regression',
    'random_state': None,
    'reg_alpha': 0.0,
    'reg_lambda': 0.0,
    'silent': 'warn',
    'subsample': 1.0,
    'subsam

In [99]:
os.path.join('.','model/model.pkl')

'./model/model.pkl'

In [None]:
est.save_model('../LeadsPredictor/model/')

In [None]:
fe, ohe, catnumagg

In [None]:
len(cols)

In [111]:
import yaml

In [116]:
{'model_cols':cols}

{'model_cols': ['applicant_location',
  'applicant_location_mean_grpby_and_category_city',
  'applicant_location_min_grpby_and_organization',
  'applicant_location_std_grpby_and_category_dow',
  'applicant_location_std_grpby_and_organization',
  'area_fe',
  'category_area_fe',
  'category_fe',
  'category_organization_fe',
  'created_on',
  'education',
  'education_fe',
  'education_mean_grpby_and_category',
  'education_median_grpby_and_city',
  'education_min_grpby_and_area',
  'education_std_grpby_and_area',
  'education_std_grpby_and_organization',
  'employer_type_3.0_ohe',
  'english',
  'gender_fe',
  'max_salary',
  'max_salary_mean_grpby_and_category',
  'max_salary_min_grpby_and_city',
  'max_salary_std_grpby_and_category',
  'max_salary_std_grpby_and_category_city',
  'max_salary_std_grpby_and_category_dow',
  'min_salary_max_grpby_and_category',
  'min_salary_max_grpby_and_category_city',
  'min_salary_max_grpby_and_organization',
  'min_salary_mean_grpby_and_area',
  'mi

In [117]:
# Write the model's feature-column list to data.yml (in the current working
# directory) under the key 'model_cols'.
with open('data.yml', 'w') as outfile:
    yaml.dump({'model_cols':cols},outfile,default_flow_style=None)

In [None]:
# with open("./config/config.yaml", "r") as yaml_file:
#     yaml_read = yaml.safe_load(yaml_file)

# model_path = os.path.join(".", yaml_read["model_path"])
# feature_transformers_path = os.path.join(".", yaml_read["feature_transformers_path"])
# model_cols = yaml_read["model_cols"]

# logger.info("num model cols: {}".format(len(model_cols)))
# logger.info("model cols: {}".format("".join(model_cols)))

In [124]:
# Make the LeadsPredictor source directory importable; the transformers
# unpickled in the next cell are instances of classes from its `data_utils`
# module (see their repr below). Path is relative to the notebook's working directory.
sys.path.append('../LeadsPredictor/src/')

In [125]:
# Load the pickled feature transformers saved for the LeadsPredictor service.
# NOTE(review): pickle.load executes arbitrary code — only use on trusted files.
with (open('../LeadsPredictor/feature_transformers/feat_trans.pkl', "rb")) as openfile:
    feat_trans = pickle.load(openfile)

In [128]:
feat_trans

{'fe': <data_utils.FreqEnc at 0x7f0a068d4130>,
 'ohe': <data_utils.OHE at 0x7f0a068d4d90>,
 'catnumagg': <data_utils.CatNumAgg at 0x7f0a06b41880>}

In [129]:
ohe = feat_trans['ohe']

In [131]:
# Try the unpickled one-hot encoder on the first two rows of df.
# NOTE(review): the saved cell called transform() with no argument, which raises
# TypeError; df.head(2) is inferred from the two-row output recorded for this
# cell — confirm the intended input.
ohe.transform(df.head(2))

Unnamed: 0,shift,gender,education,created_on,dow,employer_type,applicant_location,city,area,organization,deposit,category,english,num_openings,max_salary,min_salary,is_part_time,leads_per_opening,target,category_organization,category_area,category_city,category_dow,area_fe,category_area_fe,category_fe,category_organization_fe,education_fe,gender_fe,organization_fe,employer_type_0.0_ohe,employer_type_1.0_ohe,employer_type_2.0_ohe,employer_type_3.0_ohe,employer_type_4.0_ohe,education_min_grpby_and_area,education_std_grpby_and_area,num_openings_max_grpby_and_area,num_openings_std_grpby_and_area,min_salary_mean_grpby_and_area,applicant_location_min_grpby_and_organization,applicant_location_std_grpby_and_organization,education_std_grpby_and_organization,num_openings_std_grpby_and_organization,min_salary_max_grpby_and_organization,num_openings_mean_grpby_and_city,num_openings_median_grpby_and_city,education_median_grpby_and_city,max_salary_min_grpby_and_city,applicant_location_mean_grpby_and_category_city,max_salary_std_grpby_and_category_city,min_salary_max_grpby_and_category_city,min_salary_mean_grpby_and_category_city,num_openings_std_grpby_and_category_city,applicant_location_std_grpby_and_category_dow,max_salary_std_grpby_and_category_dow,min_salary_std_grpby_and_category_dow,num_openings_mean_grpby_and_category_dow,num_openings_median_grpby_and_category_dow,education_mean_grpby_and_category,max_salary_mean_grpby_and_category,max_salary_std_grpby_and_category,min_salary_max_grpby_and_category,employer_type_0.0_ohe.1,employer_type_4.0_ohe.1
0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,4.0,1974.0,19589.0,0.0,54.0,3.0,7.0,20000.0,,0.0,46.857143,3.86822,54.0_19589.0,54.0_1974.0,54.0_4.0,54.0_2.0,0.001369,7.4e-05,0.03492,3.7e-05,0.245589,0.941183,3.7e-05,1.0,0.0,0.0,0.0,0.0,0.0,1.235122,250.0,58.102801,10750.0,0.0,,,,,27.315584,4.0,1.0,1000.0,0.121827,13835.688697,40000.0,14878.571429,41.00497,0.408896,10118.856233,4210.791996,9.84058,2.0,1.651483,19525.777542,12349.2425,50000.0,1.0,0.0
1,0.0,0.0,1.0,8.0,5.0,4.0,0.0,4.0,754.0,23193.0,0.0,4.0,1.0,2.0,15000.0,,0.0,18.0,2.944439,4.0_23193.0,4.0_754.0,4.0_4.0,4.0_5.0,0.000703,3.7e-05,0.026856,3.7e-05,0.348685,0.941183,3.7e-05,0.0,0.0,0.0,0.0,1.0,0.0,1.00292,60.0,13.773863,15000.0,0.0,,,,,27.315584,4.0,1.0,1000.0,0.365854,6790.099026,25000.0,11862.068966,18.617856,0.566579,6240.8241,2357.611786,4.512821,2.0,2.107438,16575.787879,7300.439409,30000.0,0.0,1.0


In [134]:
ohe.ohe.categories_

[array([0., 4.])]

In [120]:
# Reload the model bundle that was saved into the LeadsPredictor project
# (rebinds model_obj, replacing the object loaded from the local model.pkl).
# NOTE(review): pickle.load executes arbitrary code — only use on trusted files.
with (open('../LeadsPredictor/model/model.pkl', "rb")) as openfile:
    model_obj = pickle.load(openfile)

In [122]:
model_obj["fitted_models"]

[LGBMRegressor(colsample_bytree=0.4, min_child_samples=110, n_estimators=10000,
               num_leaves=16, objective='regression', subsample_freq=10),
 LGBMRegressor(colsample_bytree=0.4, min_child_samples=110, n_estimators=10000,
               num_leaves=16, objective='regression', subsample_freq=10),
 LGBMRegressor(colsample_bytree=0.4, min_child_samples=110, n_estimators=10000,
               num_leaves=16, objective='regression', subsample_freq=10)]