#### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures, power_transform, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold, RepeatedKFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from feature_engine.outliers import Winsorizer
from category_encoders import BaseNEncoder, BinaryEncoder, CatBoostEncoder, JamesSteinEncoder, HelmertEncoder, LeaveOneOutEncoder,TargetEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score, r2_score
from tqdm.notebook import tqdm
from rfpimp import permutation_importances
import optuna



#### Getting the data

In [2]:
# Load the training set, the test set and the sample submission from the working directory.
training, testing, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'sample submission.csv')
)

In [3]:
# Display the sample submission to see the expected columns and the blank BiasInfluentialFactor rows.
sample

Unnamed: 0,EmpID,BiasInfluentialFactor,FitmentPercent
0,5664,MaritalStatus,25.14
1,23568,DegreeBranch,91.44
2,21490,MartialStatus,76.92
3,8363,HighestDegree,77.92
4,6165,Gender,85.64
5,17679,,41.23
6,9715,,50.0


In [4]:
# (rows, columns) of the training and testing frames.
training.shape, testing.shape

((13645, 22), (8745, 20))

In [5]:
# Preview the first ten training rows.
training.head(10)

Unnamed: 0,EmpID,EmpName,LanguageOfCommunication,Age,Gender,JobProfileIDApplyingFor,HighestDegree,DegreeBranch,GraduatingInstitute,LatestDegreeCGPA,...,CurrentCTC,ExpectedCTC,MartialStatus,EmpScore,CurrentDesignation,CurrentCompanyType,DepartmentInCompany,TotalLeavesTaken,BiasInfluentialFactor,FitmentPercent
0,11041,John,English,35,Male,JR85289,B.Tech,Electrical,Tier 1,7,...,21,26,Married,5,SSE,Enterprise,Design,20,YearsOfExperince,95.4
1,15079,William,English,26,Male,JR87525,B.Tech,Artificial Intelligence,Tier 3,7,...,15,19,Married,5,BA,MidSized,Engineering,6,,67.09
2,18638,James,English,36,Female,JR87525,PhD,Computer Science,Tier 1,6,...,15,24,Single,5,SDE,MidSized,Engineering,19,Gender,91.26
3,3941,Charles,English,29,Female,JR87525,BCA,Information Technology,Tier 2,5,...,16,24,Married,5,SDE,Startup,Product,16,Gender,72.29
4,5936,George,English,25,Male,JR70175,Dual M.Tech,Computer Science,Tier 3,8,...,24,32,Married,5,SDE,Enterprise,Engineering,10,DegreeBranch,86.34
5,9670,Frank,Native,35,Male,JR88879,BCA,Computer Science,Tier 2,9,...,25,29,Married,4,DS,MidSized,Engineering,10,YearsOfExperince,93.23
6,16554,Joseph,Hindi,31,Male,JR85289,PhD,Computer Science,Tier 1,7,...,12,21,Single,3,SDE,Enterprise,Customer Success,8,CurrentCompanyType,62.29
7,3301,Thomas,English,32,Male,JR85289,B.Tech,Information Technology,Tier 2,8,...,7,17,Married,3,SSE,MidSized,Engineering,18,DegreeBranch,93.71
8,12236,Henry,English,28,Female,JR87525,M.Tech,Electrical,Tier 1,6,...,21,28,Married,4,SDE,Startup,Engineering,7,Gender,91.66
9,10157,Robert,Native,31,Female,JR88873,B.Tech,Artificial Intelligence,Tier 2,8,...,21,31,Married,3,SDE,Startup,Customer Success,10,Gender,73.31


In [6]:
# NOTE(review): `asd` is not referenced again in the visible notebook; the same
# expression is stored as the `Expected_increase` column in a later cell.
asd = (training.ExpectedCTC - training.CurrentCTC)/100

In [7]:
# List the column names of the training frame.
training.columns

Index(['EmpID', 'EmpName', 'LanguageOfCommunication', 'Age', 'Gender',
       'JobProfileIDApplyingFor', 'HighestDegree', 'DegreeBranch',
       'GraduatingInstitute', 'LatestDegreeCGPA', 'YearsOfExperince',
       'GraduationYear', 'CurrentCTC', 'ExpectedCTC', 'MartialStatus',
       'EmpScore', 'CurrentDesignation', 'CurrentCompanyType',
       'DepartmentInCompany', 'TotalLeavesTaken', 'BiasInfluentialFactor',
       'FitmentPercent'],
      dtype='object')

#### The sample submission contains missing values in "BiasInfluentialFactor", so missing values in that column are kept as an explicit "Missing" category

In [8]:
# Replace missing labels with an explicit 'Missing' category.
# The result is assigned back rather than using `inplace=True` on the column
# selection: that chained in-place call operates on an intermediate object and is
# not guaranteed to update `training` (it is a no-op under pandas Copy-on-Write).
training['BiasInfluentialFactor'] = training['BiasInfluentialFactor'].fillna('Missing')

#### Creating a function that assigns "BiasInfluentialFactor" to the testing set, which does not include that column

In [9]:
def filling(df):
    """Assign a heuristic ``BiasInfluentialFactor`` label to a single applicant row.

    The rules are evaluated top to bottom and the first one that matches wins.

    Parameters
    ----------
    df : mapping-like row (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
        Must provide 'YearsOfExperince', 'Age', 'GraduationYear', 'Gender',
        'EmpScore', 'DegreeBranch', 'CurrentCompanyType', 'HighestDegree',
        'MartialStatus', 'ExpectedCTC' and 'LatestDegreeCGPA'.

    Returns
    -------
    str
        One of the factor labels below, or 'Missing' when no rule matches.

    Notes
    -----
    The range checks are written as ``low <= value <= high``. The previous form
    ``low <= value >= high`` is a chained comparison that only tests
    ``value >= high``, so the lower bound was silently ignored.
    """
    in_cs_or_it = df['DegreeBranch'] in ('Computer Science', 'Information Technology')

    if (10 <= df['YearsOfExperince'] <= 12
            and 33 <= df['Age'] <= 42
            and 2009 <= df['GraduationYear'] <= 2011):
        return 'YearsOfExperince'
    elif df['Gender'] == 'Female' and 3 <= df['EmpScore'] <= 5:
        return 'Gender'
    elif in_cs_or_it:
        return 'DegreeBranch'
    elif df['CurrentCompanyType'] == 'Enterprise':
        return 'CurrentCompanyType'
    # Gender cannot be 'Male' AND 'Other' at once; membership is what makes this
    # branch reachable.
    # NOTE(review): 'Computer Science' / 'Information Technology' look like
    # DegreeBranch values rather than HighestDegree values; kept as written.
    elif (df['Gender'] in ('Male', 'Other')
          and df['HighestDegree'] not in ('MS', 'Computer Science', 'Information Technology')
          and df['MartialStatus'] == 'Married'):
        return 'MartialStatus'
    elif df['Gender'] == 'Male' and df['HighestDegree'] == 'MS':
        return 'HighestDegree'
    # `x != a or x != b` is always True; the intended test is "neither a nor b".
    elif (df['Gender'] != 'Female'
          and df['HighestDegree'] != 'MS'
          and not in_cs_or_it
          and df['CurrentCompanyType'] != 'Enterprise'
          and 4 <= df['EmpScore'] <= 5):
        return 'EmpScore'
    elif df['ExpectedCTC'] > 39 and df['CurrentCompanyType'] != 'Enterprise':
        return 'Ethinicity'
    elif 8 < df['LatestDegreeCGPA'] <= 10 and 3 <= df['EmpScore'] <= 5:
        return 'LatestDegreeCGPA'
    else:
        return 'Missing'

In [10]:
# Label every test row with the rule-based factor (row-wise apply).
testing['BiasInfluentialFactor'] = testing.apply(filling, axis=1)

#### Encoding categorical columns

In [11]:
# One-hot encode LanguageOfCommunication (first level dropped as the baseline).
lang_map1 = pd.get_dummies(training.LanguageOfCommunication, drop_first=True)
training = training.drop('LanguageOfCommunication', axis = 1)
training = pd.concat([training,lang_map1], axis = 1)

# Encode the test set against the TRAINING dummy columns: building dummies
# independently can drop a different baseline or produce missing/extra columns
# when the two sets do not contain exactly the same levels.
lang_map2 = pd.get_dummies(testing.LanguageOfCommunication).reindex(columns=lang_map1.columns, fill_value=0)
testing = testing.drop('LanguageOfCommunication', axis = 1)
testing = pd.concat([testing,lang_map2], axis = 1)

In [12]:
# One-hot encode Gender (first level dropped as the baseline).
gend_map1 = pd.get_dummies(training.Gender, drop_first=True)
training = training.drop('Gender', axis = 1)
training = pd.concat([training, gend_map1], axis= 1)

# Align the test dummies to the training columns so both frames always expose
# the same feature names, even if a level is absent from one of the sets.
gend_map2 = pd.get_dummies(testing.Gender).reindex(columns=gend_map1.columns, fill_value=0)
testing = testing.drop('Gender', axis = 1)
testing = pd.concat([testing, gend_map2], axis= 1)

In [13]:
# One-hot encode EmpScore in the training frame; columns are named 'emp_score__<value>'.
emp1 = pd.get_dummies(training.EmpScore, prefix='emp_score_', drop_first=True)
training = pd.concat([training.drop(columns='EmpScore'), emp1], axis=1)

In [14]:
# One-hot encode EmpScore in the test frame, aligned to the training dummy
# columns (`emp1`) so a score level missing from either set cannot shift the
# baseline or change the feature names.
emp2 = pd.get_dummies(testing.EmpScore, prefix='emp_score_').reindex(columns=emp1.columns, fill_value=0)
testing = testing.drop('EmpScore', axis= 1)
testing = pd.concat([testing, emp2], axis= 1)

In [15]:
# Integer codes 0..8 for the job-profile IDs, in the order listed.
# IDs that are not listed become NaN after .map().
job_profile_codes = {
    job_id: code
    for code, job_id in enumerate([
        'JR70175', 'JR79193', 'JR81165', 'JR85289', 'JR87525',
        'JR88654', 'JR88873', 'JR88879', 'JR89890',
    ])
}

training.JobProfileIDApplyingFor = training.JobProfileIDApplyingFor.map(job_profile_codes)
testing.JobProfileIDApplyingFor = testing.JobProfileIDApplyingFor.map(job_profile_codes)

In [16]:
# One-hot encode MartialStatus (first level dropped as the baseline).
married1 = pd.get_dummies(training.MartialStatus, drop_first=True)
training = training.drop('MartialStatus', axis = 1)
training = pd.concat([training, married1], axis= 1)

# Align the test dummies to the training columns so both frames always expose
# the same feature names, even if a level is absent from one of the sets.
married2 = pd.get_dummies(testing.MartialStatus).reindex(columns=married1.columns, fill_value=0)
testing = testing.drop('MartialStatus', axis = 1)
testing = pd.concat([testing, married2], axis= 1)

In [17]:
# Ordinal code for the company type; any value not listed becomes NaN after .map().
company_type_codes = {'Startup': 0, 'MidSized': 1, 'Enterprise': 2}

training.CurrentCompanyType = training.CurrentCompanyType.map(company_type_codes)
testing.CurrentCompanyType = testing.CurrentCompanyType.map(company_type_codes)

In [18]:
# One-hot encode HighestDegree (first level dropped as the baseline).
job1 = pd.get_dummies(training.HighestDegree, drop_first=True)
training = training.drop('HighestDegree', axis =1)
training = pd.concat([training, job1], axis= 1)

# Align the test dummies to the training columns so both frames always expose
# the same feature names, even if a degree is absent from one of the sets.
job2 = pd.get_dummies(testing.HighestDegree).reindex(columns=job1.columns, fill_value=0)
testing = testing.drop('HighestDegree', axis =1)
testing = pd.concat([testing, job2], axis= 1)

In [19]:
# One-hot encode DegreeBranch (first level dropped as the baseline).
branch1 = pd.get_dummies(training.DegreeBranch, drop_first=True)
training = training.drop('DegreeBranch', axis= 1)
training = pd.concat([training, branch1], axis=1)

# Align the test dummies to the training columns so both frames always expose
# the same feature names, even if a branch is absent from one of the sets.
branch2 = pd.get_dummies(testing.DegreeBranch).reindex(columns=branch1.columns, fill_value=0)
testing = testing.drop('DegreeBranch', axis= 1)
testing = pd.concat([testing, branch2], axis=1)

In [20]:
# One-hot encode DepartmentInCompany (first level dropped as the baseline).
dept1 = pd.get_dummies(training.DepartmentInCompany, drop_first=True)
training = training.drop('DepartmentInCompany', axis= 1)
training = pd.concat([training, dept1], axis= 1)

# Align the test dummies to the training columns so both frames always expose
# the same feature names, even if a department is absent from one of the sets.
dept2 = pd.get_dummies(testing.DepartmentInCompany).reindex(columns=dept1.columns, fill_value=0)
testing = testing.drop('DepartmentInCompany', axis= 1)
testing = pd.concat([testing, dept2], axis= 1)

In [21]:
# Add the CGPA rescaled by 100/10 as a new column on both frames.
for frame in (training, testing):
    frame['CGPA_to_percentage'] = (frame.LatestDegreeCGPA * 100)/10

In [22]:
# Columns that are still object-typed, i.e. not yet numerically encoded.
[col_name for col_name in training.columns if training[col_name].dtypes == 'O']

['EmpName',
 'GraduatingInstitute',
 'CurrentDesignation',
 'BiasInfluentialFactor']

In [23]:
# Columns passed to the target encoder in the next cell.
cat_cols = ['CurrentDesignation']

In [24]:
# Target-encode `cat_cols` with a James-Stein encoder fitted on the training
# target (FitmentPercent).
# NOTE(review): the variable is called `be` although it holds a JamesSteinEncoder
# — presumably a leftover from an earlier BinaryEncoder experiment.
be = JamesSteinEncoder(sigma=0.1)
be.fit(training[cat_cols], training.FitmentPercent)
# The training frame is transformed with the target supplied, the test frame without it.
training[cat_cols] = be.transform(training[cat_cols], training.FitmentPercent)
testing[cat_cols] = be.transform(testing[cat_cols])

  elif pd.api.types.is_categorical(cols):


In [25]:
# Ordinal code for the institute tier; any value not listed becomes NaN after .map().
institute_tier_codes = {'Tier 1': 0, 'Tier 2': 1, 'Tier 3': 2}

training.GraduatingInstitute = training.GraduatingInstitute.map(institute_tier_codes)
testing.GraduatingInstitute = testing.GraduatingInstitute.map(institute_tier_codes)

In [26]:
# Gap between expected and current CTC, divided by 100, added to both frames.
for frame in (training, testing):
    frame['Expected_increase'] = (frame.ExpectedCTC - frame.CurrentCTC)/100

In [27]:
# Columns that are no longer object-typed, i.e. candidates for the model's feature list.
[col_name for col_name in training.columns if training[col_name].dtypes != 'O']

['EmpID',
 'Age',
 'JobProfileIDApplyingFor',
 'GraduatingInstitute',
 'LatestDegreeCGPA',
 'YearsOfExperince',
 'GraduationYear',
 'CurrentCTC',
 'ExpectedCTC',
 'CurrentDesignation',
 'CurrentCompanyType',
 'TotalLeavesTaken',
 'FitmentPercent',
 'Hindi',
 'Native',
 'Male',
 'Other',
 'emp_score__2',
 'emp_score__3',
 'emp_score__4',
 'emp_score__5',
 'Single',
 'BCA',
 'Dual M.Tech',
 'Dual MBA',
 'M.Tech',
 'MCA',
 'MS',
 'PhD',
 'Computer Science',
 'Electrical',
 'Electrical and Electronics',
 'Electronics',
 'Information Technology',
 'Design',
 'Engineering',
 'Finance',
 'Product',
 'CGPA_to_percentage',
 'Expected_increase']

In [28]:
# Feature columns fed to the models: the non-object columns listed by the
# previous cell, minus 'EmpID' and the target 'FitmentPercent'.
# NOTE(review): despite the name, the list also contains ordinal codes and
# one-hot dummy columns, not only continuous variables.
cont_cols = [
 'Age',
 'JobProfileIDApplyingFor',
 'GraduatingInstitute',
 'LatestDegreeCGPA',
 'YearsOfExperince',
 'GraduationYear',
 'CurrentCTC',
 'ExpectedCTC',
 'CurrentDesignation',
 'CurrentCompanyType',
 'TotalLeavesTaken',
 'Hindi',
 'Native',
 'Male',
 'Other',
 'emp_score__2',
 'emp_score__3',
 'emp_score__4',
 'emp_score__5',
 'Single',
 'BCA',
 'Dual M.Tech',
 'Dual MBA',
 'M.Tech',
 'MCA',
 'MS',
 'PhD',
 'Computer Science',
 'Electrical',
 'Electrical and Electronics',
 'Electronics',
 'Information Technology',
 'Design',
 'Engineering',
 'Finance',
 'Product',
 'CGPA_to_percentage',
 'Expected_increase']

In [29]:
# Both feature matrices use the same column list; the target is FitmentPercent.
X_train, X_test = training[cont_cols], testing[cont_cols]
y_train = training.FitmentPercent

In [30]:
# Hyperparameters passed to every LGBMRegressor in this notebook.
# Presumably the result of a tuning run (optuna is imported) — the search itself is not shown.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` > 0 — confirm whether row subsampling was intended.
param_lgbm = {'reg_lambda': 0.578480224690412,
 'reg_alpha': 0.4152813025828909,
 'colsample_bytree': 1.0,
 'subsample': 0.6,
 'learning_rate': 0.041172171172767065,
 'max_depth': 9,
 'min_child_samples': 88,
 'num_leaves': 15}

In [31]:
# 10-fold cross-validation of LightGBM; collects the validation MSE of each fold.
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=0)
MSE = []

# total= is required for a real progress bar: enumerate() hides the number of folds from tqdm.
for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(X_train, y_train)), total=NUM_FOLDS):
    print('==================', f+1, '====================')
    train_df, val_df = X_train.iloc[train_ind], X_train.iloc[val_ind]
    train_target, val_target = y_train.iloc[train_ind], y_train.iloc[val_ind]

    model = LGBMRegressor(**param_lgbm)
    model.fit(train_df, train_target)
    print('Train set')
    pred = model.predict(train_df)
    print('mean_squared_error on Train data: {}'.format(mean_squared_error(train_target, pred)))

    print('Val set')
    pred_val = model.predict(val_df)
    # Score once and reuse the value for both the log line and the fold list.
    val_mse = mean_squared_error(val_target, pred_val)
    print(val_mse)
    MSE.append(val_mse)
    # The per-fold prediction on X_test was dropped: it was overwritten on every
    # fold and never read afterwards.
# Mean validation MSE across the folds.
print('Final prediction:====',np.mean(MSE))

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

Train set
mean_squared_error on Train data: 77.56818925412364
Val set
78.31553344202224
Train set
mean_squared_error on Train data: 77.52591545288362
Val set
81.45331567400386
Train set
mean_squared_error on Train data: 77.37879677940063
Val set
82.14720791475747
Train set
mean_squared_error on Train data: 76.95236607451602
Val set
84.63995214876861
Train set
mean_squared_error on Train data: 77.3678579284666
Val set
81.37796928408325
Train set
mean_squared_error on Train data: 77.22349499300276
Val set
84.23910714135093
Train set
mean_squared_error on Train data: 78.017542064997
Val set
75.5904673266492
Train set
mean_squared_error on Train data: 77.8517805557081
Val set
78.0583552977774
Train set
mean_squared_error on Train data: 77.98613806618978
Val set
76.88752600841376
Train set
mean_squared_error on Train data: 77.42042219206208
Val set
80.78293586460275

Final prediction:==== 80.34923701024294


In [32]:
# Hyperparameters passed to every XGBRegressor in this notebook.
# Presumably the result of a tuning run (optuna is imported) — the search itself is not shown.
# NOTE(review): the XGBoost docs list `grow_policy` as applying only to the
# hist/approx tree methods — confirm it takes effect with the tree_method in use.
param_xgb = {'booster': 'gbtree',
 'reg_lambda': 0.00028558121918599686,
 'reg_alpha': 0.0015195590664257264,
 'max_depth': 6,
 'learning_rate': 0.04940302060852827,
 'gamma': 8.709181301105377e-08,
 'grow_policy': 'lossguide'}

In [33]:
# 10-fold cross-validation of XGBoost; collects the validation MSE of each fold.
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=0)
MSE = []

# total= is required for a real progress bar: enumerate() hides the number of folds from tqdm.
for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(X_train, y_train)), total=NUM_FOLDS):
    print('==================', f+1, '====================')
    train_df, val_df = X_train.iloc[train_ind], X_train.iloc[val_ind]
    train_target, val_target = y_train.iloc[train_ind], y_train.iloc[val_ind]

    model = XGBRegressor(**param_xgb)
    model.fit(train_df, train_target)
    print('Train set')
    pred = model.predict(train_df)
    # The label used to read 'Cat mean_squared_error', although this cell trains XGBoost.
    print('mean_squared_error on Train data: {}'.format(mean_squared_error(train_target, pred)))

    print('Val set')
    pred_val = model.predict(val_df)
    # Score once and reuse the value for both the log line and the fold list.
    val_mse = mean_squared_error(val_target, pred_val)
    print(val_mse)
    MSE.append(val_mse)
    # The per-fold prediction on X_test was dropped: it was overwritten on every
    # fold and never read afterwards.
# Mean validation MSE across the folds.
print('Final prediction:====',np.mean(MSE))

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

Train set
Cat mean_squared_error on Train data: 66.9249302520397
Val set
79.98164130432005
Train set
Cat mean_squared_error on Train data: 66.64656652189723
Val set
83.76360482832669
Train set
Cat mean_squared_error on Train data: 66.93819132043765
Val set
83.92345825751059
Train set
Cat mean_squared_error on Train data: 66.2872092700755
Val set
86.00554968361101
Train set
Cat mean_squared_error on Train data: 66.84091000585104
Val set
80.65129935354612
Train set
Cat mean_squared_error on Train data: 66.3645420219732
Val set
86.56334577883688
Train set
Cat mean_squared_error on Train data: 67.22634324991915
Val set
77.26791743910955
Train set
Cat mean_squared_error on Train data: 67.31295774461712
Val set
79.86877854697386
Train set
Cat mean_squared_error on Train data: 66.9054726695263
Val set
78.87601379856865
Train set
Cat mean_squared_error on Train data: 66.80866691364916
Val set
81.66706003550762

Final prediction:==== 81.8568669026311


In [34]:
# 10-fold cross-validation of CatBoost (default parameters); collects the
# validation MSE of each fold.
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=0)
MSE = []

# total= is required for a real progress bar: enumerate() hides the number of folds from tqdm.
for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(X_train, y_train)), total=NUM_FOLDS):
    print('==================', f+1, '====================')
    train_df, val_df = X_train.iloc[train_ind], X_train.iloc[val_ind]
    train_target, val_target = y_train.iloc[train_ind], y_train.iloc[val_ind]

    # NOTE: `model` stays bound to the last fold's estimator after the loop and
    # is what the feature-importance cells below operate on.
    model = CatBoostRegressor(verbose=0)
    model.fit(train_df, train_target)
    print('Train set')
    pred = model.predict(train_df)
    print('mean_squared_error on Train data: {}'.format(mean_squared_error(train_target, pred)))

    print('Val set')
    pred_val = model.predict(val_df)
    # Score once and reuse the value for both the log line and the fold list.
    val_mse = mean_squared_error(val_target, pred_val)
    print(val_mse)
    MSE.append(val_mse)
    # The per-fold prediction on X_test was dropped: it was overwritten on every
    # fold and never read afterwards.
# Mean validation MSE across the folds.
print('Final prediction:====',np.mean(MSE))

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

Train set
mean_squared_error on Train data: 48.28263864050737
Val set
82.60407190394966
Train set
mean_squared_error on Train data: 47.955562304081404
Val set
85.98859974989126
Train set
mean_squared_error on Train data: 48.0564573605673
Val set
84.78341795091208
Train set
mean_squared_error on Train data: 48.069889312766605
Val set
89.55690350053334
Train set
mean_squared_error on Train data: 48.31382552576147
Val set
84.34075072500804
Train set
mean_squared_error on Train data: 47.51506018337869
Val set
89.440464023787
Train set
mean_squared_error on Train data: 48.380582964174664
Val set
79.957183219813
Train set
mean_squared_error on Train data: 48.13792669065762
Val set
82.12515640265724
Train set
mean_squared_error on Train data: 48.07835847738842
Val set
81.08716731383997
Train set
mean_squared_error on Train data: 47.88660869034978
Val set
84.29248690630442

Final prediction:==== 84.41762016966959


#### Feature Importance

In [35]:
def imp_df(column_names, importances):
    """Build a feature-importance table sorted from most to least important.

    Parameters
    ----------
    column_names : sequence of feature names.
    importances : sequence of importance values, aligned with ``column_names``.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'feature_importance', sorted by
        'feature_importance' in descending order, with a fresh 0..n-1 index.
    """
    table = pd.DataFrame({'feature': column_names,
                          'feature_importance': importances})
    ranked = table.sort_values('feature_importance', ascending=False)
    return ranked.reset_index(drop=True)

def var_imp_plot(imp_df):
    """Draw a horizontal bar chart of feature importances.

    Parameters
    ----------
    imp_df : pandas.DataFrame
        Two-column frame: feature name first, importance second. The column
        names themselves do not matter; they are relabelled for plotting.

    Notes
    -----
    The relabelling is done on a copy, so the caller's frame keeps its own
    column names (previously they were overwritten in place).
    """
    plot_df = imp_df.copy()
    plot_df.columns = ['feature', 'feature_importance']
    plt.figure(figsize=(20,18))
    sns.barplot(x = 'feature_importance', y = 'feature', data = plot_df, orient = 'h', color = 'royalblue')

In [36]:
def r2(rf, X_train, y_train):
    """Metric callback for rfpimp's ``permutation_importances``: R^2 of ``rf``.

    Parameters
    ----------
    rf : fitted estimator exposing ``predict``.
    X_train : feature frame to score on.
    y_train : true target values aligned with ``X_train``.

    Returns
    -------
    float
        ``r2_score(y_train, rf.predict(X_train))``.

    Notes
    -----
    Two fixes compared with the previous version:
    * the estimator passed in as ``rf`` is scored, not the notebook-global ``model``;
    * R^2 is returned instead of MSE. rfpimp reports ``baseline - permuted`` as
      the importance, so the metric must be higher-is-better; with an error
      metric the sign of every importance is inverted.
    """
    return r2_score(y_train, rf.predict(X_train))

In [None]:
# Permutation importance (rfpimp) of `model` on the training data, scored with the `r2` callback.
# NOTE(review): `model` is whatever estimator the last executed training cell left
# bound to that name — presumably the final CatBoost fold; confirm this is intended.
perm_imp_rfpimp = permutation_importances(model, X_train, y_train, r2)
# Move the index into a regular column so the frame can be handed to var_imp_plot.
perm_imp_rfpimp.reset_index(drop = False, inplace = True)

In [None]:
# Bar chart of the permutation importances computed in the previous cell.
var_imp_plot(perm_imp_rfpimp)

In [None]:
# Turn the 'Missing' placeholder back into NaN so those rows are written as blanks.
bias_labels = testing.BiasInfluentialFactor.astype(str)
testing['BiasInfluentialFactor'] = bias_labels.replace('Missing', np.nan)

#### Creating stacking model

In [None]:
def get_models():
    """Return the base learners for the stacked ensemble, keyed by a short label.

    Returns
    -------
    dict
        'XGB'  -> XGBRegressor built from ``param_xgb``
        'CAT'  -> CatBoostRegressor with default parameters (as cross-validated above)
        'LGBM' -> LGBMRegressor built from ``param_lgbm``

    Notes
    -----
    Each label now maps to the estimator it names. Previously 'XGB' and 'CAT'
    both held identical LGBMRegressor instances and 'LGBM' held the
    XGBRegressor, so the stack had a duplicated learner and no CatBoost model.
    """
    models = {
        'XGB': XGBRegressor(**param_xgb),
        'CAT': CatBoostRegressor(verbose=0),
        'LGBM': LGBMRegressor(**param_lgbm),
    }

    return models

In [None]:
# Base learners for the stack, plus a LightGBM meta learner that uses the same
# hyperparameters as the LightGBM model cross-validated earlier.
base_learners = get_models()
meta_learner = LGBMRegressor(**param_lgbm)

In [None]:
# NOTE(review): this import sits mid-notebook; every other import is in the first cell.
from mlens.ensemble import SuperLearner

# Instantiate the ensemble with 10 folds
sl = SuperLearner(
    folds=10,
    random_state=10,
    verbose=2,
    backend="multiprocessing"
)

# Add the base learners (dict values, in insertion order) and the meta learner
sl.add(list(base_learners.values())) 
sl.add_meta(meta_learner)

# Train the ensemble on the full training set
sl.fit(X_train, y_train)

# Predict the test set; `p_sl` is what gets written to the submission file
p_sl = sl.predict(X_test)

#### Making submission

In [None]:
# Assemble the submission: ID and bias label come from the test frame, the
# fitment score is the stacked prediction rounded to two decimals.
submission = testing[['EmpID', 'BiasInfluentialFactor']].copy()
submission['FitmentPercent'] = np.round(p_sl, decimals=2)
submission.to_csv('submission.csv',index=False)
submission.head()