In [75]:
#libraries
import pandas as pd
import numpy as np

In [76]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [78]:
import joblib

In [79]:
# read the data files
train = pd.read_csv("D:\Manikandan\Documents\Datascience_ML_DL_AI\Programming\Github\ds23_future_datascience_legend_work\Machine_Hack_Practice_23_11\data\hack_train.csv")
test = pd.read_csv("D:\Manikandan\Documents\Datascience_ML_DL_AI\Programming\Github\ds23_future_datascience_legend_work\Machine_Hack_Practice_23_11\data\hack_test.csv")
sub = pd.read_csv("D:\Manikandan\Documents\Datascience_ML_DL_AI\Programming\Github\ds23_future_datascience_legend_work\Machine_Hack_Practice_23_11\data\hack_submission.csv")

In [80]:
# to see the no. of features and observations
train.shape,test.shape,sub.shape

((54808, 14), (23490, 13), (23490, 2))

In [81]:
# to check the missing values
train.isna().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [82]:
# see few records
train.head(2)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0


In [83]:
# to see the column names and its datatype
train.dtypes

employee_id               int64
department               object
region                   object
education                object
gender                   object
recruitment_channel      object
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
KPIs_met >80%             int64
awards_won?               int64
avg_training_score        int64
is_promoted               int64
dtype: object

In [84]:
# identify cols
tgt_cols = ['is_promoted']
ign_cols = ['employee_id']
cat_cols = train.select_dtypes(include='object').columns
num_cols = train.select_dtypes(exclude='object').columns

In [85]:
print(tgt_cols, ign_cols, cat_cols, num_cols, sep='\n')

['is_promoted']
['employee_id']
Index(['department', 'region', 'education', 'gender', 'recruitment_channel'], dtype='object')
Index(['employee_id', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score', 'is_promoted'],
      dtype='object')


In [86]:
train[cat_cols].head(2)

Unnamed: 0,department,region,education,gender,recruitment_channel
0,Sales & Marketing,region_7,Master's & above,f,sourcing
1,Operations,region_22,Bachelor's,m,other


In [87]:
train[num_cols].head(2)

Unnamed: 0,employee_id,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,1,35,5.0,8,1,0,49,0
1,65141,1,30,5.0,4,0,0,60,0


In [88]:
# to see numerical columns after removing the target columns and not required columns 
num_cols = train.select_dtypes(exclude='object').drop(columns=ign_cols+tgt_cols).columns
train[num_cols].head(2)

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,1,35,5.0,8,1,0,49
1,1,30,5.0,4,0,0,60


In [89]:
# create pipeline to do preprocessing of category columns
cat_pipe_encode = Pipeline(
    steps=[
        ('impute_cat', SimpleImputer(strategy='most_frequent')), # missing values
        ('ohe', OneHotEncoder(handle_unknown='ignore')) # category encoding
    ]
)

In [90]:
# create pipeline to do preprocessing of numerical columns
num_pipe_encode = Pipeline(
    steps=[
        ('impute_num', SimpleImputer(strategy='median')), # impute missing values
        ('scale', StandardScaler()) # standard scaler
    ]
)

In [91]:
# create map b/w the pipeline and the columns
preprocess = ColumnTransformer(
    transformers = [
        ('cat_encode', cat_pipe_encode, cat_cols), # categorical columns
        ('num_encode', num_pipe_encode, num_cols) # numerical columns
    ]
)

In [92]:
# create object for the LogisticRegression algo
mymodel = LogisticRegression()

In [93]:
# merging the preprocessing and modeling in a pipeline
model_pipeline = Pipeline(
    steps=[
        ('preprocess', preprocess), # preprocessing
        ('model', mymodel) #modeling
    ]
)

In [94]:
# split the train dataset to train and validation
X = train.drop(columns = ign_cols + tgt_cols)
X.head(2)

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60


In [95]:
y = train[tgt_cols]
y.head(2)

Unnamed: 0,is_promoted
0,0
1,0


In [96]:
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.1, random_state=42)
train_X.shape, val_X.shape, train_y.shape, val_y.shape

((49327, 12), (5481, 12), (49327, 1), (5481, 1))

In [97]:
train.shape, int(train.shape[0]*.9), int(train.shape[0]*.1)

((54808, 14), 49327, 5480)

In [98]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import f1_score

In [99]:
# fit the model
model_pipeline.fit(train_X, train_y)

In [100]:
model_pipeline.predict(train_X)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [101]:
def model_train_val_eval(train_X, val_X, train_y, val_y, model_pipeline):
    pred_train = model_pipeline.predict(train_X)
    pred_val = model_pipeline.predict(val_X)

    print("Train F1 score : ",f1_score(train_y, pred_train))
    print("Val F1 score : ",f1_score(val_y, pred_val))

In [102]:
model_train_val_eval(train_X, val_X, train_y, val_y, model_pipeline)

Train F1 score :  0.39556905485081295
Val F1 score :  0.4020979020979021


In [103]:
sub.head(2)

Unnamed: 0,employee_id,is_promoted
0,8724,0
1,74430,0


In [104]:
sub['is_promoted'] = model_pipeline.predict(test)
sub.to_csv('submisssion.csv',index=False)

In [107]:
train.dtypes

employee_id               int64
department               object
region                   object
education                object
gender                   object
recruitment_channel      object
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
KPIs_met >80%             int64
awards_won?               int64
avg_training_score        int64
is_promoted               int64
dtype: object

In [106]:
# create the pickle file
joblib.dump(model_pipeline, 'promote_pipeline_model.pkl')

['promote_pipeline_model.pkl']

In [109]:
train.columns.to_frame()

Unnamed: 0,0
employee_id,employee_id
department,department
region,region
education,education
gender,gender
recruitment_channel,recruitment_channel
no_of_trainings,no_of_trainings
age,age
previous_year_rating,previous_year_rating
length_of_service,length_of_service


In [111]:
train.columns.to_frame

<bound method Index.to_frame of Index(['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score', 'is_promoted'],
      dtype='object')>