In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for the model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

import scipy.stats as stats
import seaborn as sns
# for one hot encoding with feature-engine
from feature_engine.encoding import OneHotEncoder

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer
)

from feature_engine.encoding import OneHotEncoder

In [52]:
#!pip install feature_Engine

Collecting feature_Engine
  Using cached feature_engine-1.1.2-py2.py3-none-any.whl (180 kB)
Installing collected packages: feature-Engine
Successfully installed feature-Engine-1.1.2


In [7]:
file_path_train = "./archive/aug_train.csv"
file_path_test = "./archive/aug_test.csv"
file_path_ss = "./archive/sample_submission.csv"
df_train = pd.read_csv(file_path_train) 
df_test = pd.read_csv(file_path_test) 
df_ss = pd.read_csv(file_path_ss) 


df_train.shape, df_test.shape, df_ss.shape

((19158, 14), (2129, 13), (2129, 2))

In [10]:
df_train.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


# TIPO DE DATOS

In [12]:
# let's inspect the type of variables in pandas
df_train.dtypes

enrollee_id                 int64
city                       object
city_development_index    float64
gender                     object
relevent_experience        object
enrolled_university        object
education_level            object
major_discipline           object
experience                 object
company_size               object
company_type               object
last_new_job               object
training_hours              int64
target                    float64
dtype: object

# INSPECCIONANDO VARIABLES

In [14]:
# let's inspect the variable values

for var in df_train.columns:
    print(var, df_train[var].unique()[0:20], '\n')

enrollee_id [ 8949 29725 11561 33241   666 21651 28806   402 27107   699 29452 23853
 25619  5826  8722  6588  4167  5764  2156 11399] 

city ['city_103' 'city_40' 'city_21' 'city_115' 'city_162' 'city_176'
 'city_160' 'city_46' 'city_61' 'city_114' 'city_13' 'city_159' 'city_102'
 'city_67' 'city_100' 'city_16' 'city_71' 'city_104' 'city_64' 'city_101'] 

city_development_index [0.92  0.776 0.624 0.789 0.767 0.764 0.762 0.913 0.926 0.827 0.843 0.804
 0.855 0.887 0.91  0.884 0.924 0.666 0.558 0.923] 

gender ['Male' nan 'Female' 'Other'] 

relevent_experience ['Has relevent experience' 'No relevent experience'] 

enrolled_university ['no_enrollment' 'Full time course' nan 'Part time course'] 

education_level ['Graduate' 'Masters' 'High School' nan 'Phd' 'Primary School'] 

major_discipline ['STEM' 'Business Degree' nan 'Arts' 'Humanities' 'No Major' 'Other'] 

experience ['>20' '15' '5' '<1' '11' '13' '7' '17' '2' '16' '1' '4' '10' '14' '18'
 '19' '12' '3' '6' '9'] 

company_size [nan

# VARIABLES DISCRETAS, CONTINUAS, CATEGORICAS Y MIXTAS

In [65]:
# make list of variables  types
data = df_train.copy()
# numerical: discrete vs continuous
discrete = [var for var in data.columns if data[var].dtype!='O' and var not in ['enrollee_id','target'] and data[var].nunique()<10]
continuous = [var for var in data.columns if data[var].dtype!='O' and var not in ['enrollee_id','target']  and var not in discrete]

# mixed
mixed = ['city']

# categorical
categorical = [var for var in data.columns if data[var].dtype=='O' and var not in mixed]

print('There are {} discrete variables'.format(len(discrete)))
print(discrete)
print('There are {} continuous variables'.format(len(continuous)))
print(continuous)
print('There are {} categorical variables'.format(len(categorical)))
print(categorical)
print('There are {} mixed variables'.format(len(mixed)))
print(mixed)

There are 0 discrete variables
[]
There are 2 continuous variables
['city_development_index', 'training_hours']
There are 9 categorical variables
['gender', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job']
There are 1 mixed variables
['city']


# SELECCIONANDO TRAIN Y TEST

In [66]:
# let's separate into training and testing set

# first drop the target from the feature list
cols_to_use = discrete + continuous + categorical + mixed

X_train, X_test, y_train, y_test = train_test_split(data[cols_to_use],
                                                    data['target'],
                                                    test_size=0.3,
                                                    random_state=0)
X_train.shape, X_test.shape

((13410, 12), (5748, 12))

# COMPLETANDO LOS VALORES NULOS O VACIOS

## VALORES NULOS

In [78]:
pipe = Pipeline([
    ('missing_ind', AddMissingIndicator()),

    ('imputer_mode', CategoricalImputer(
        imputation_method='frequent', variables=categorical)),

    ('imputer_median', MeanMedianImputer(imputation_method='median',
                                         variables=continuous)),
])

In [79]:
# fit the pipe
pipe.fit(X_train)

Pipeline(steps=[('missing_ind', AddMissingIndicator()),
                ('imputer_mode',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['gender', 'relevent_experience',
                                               'enrolled_university',
                                               'education_level',
                                               'major_discipline', 'experience',
                                               'company_size', 'company_type',
                                               'last_new_job'])),
                ('imputer_median',
                 MeanMedianImputer(variables=['city_development_index',
                                              'training_hours']))])

In [80]:
# inspect the separate steps
pipe.named_steps['missing_ind'].variables_

['gender',
 'enrolled_university',
 'education_level',
 'major_discipline',
 'experience',
 'company_size',
 'company_type',
 'last_new_job']

In [81]:
pipe.named_steps['imputer_mode'].imputer_dict_

{'gender': 'Male',
 'relevent_experience': 'Has relevent experience',
 'enrolled_university': 'no_enrollment',
 'education_level': 'Graduate',
 'major_discipline': 'STEM',
 'experience': '>20',
 'company_size': '50-99',
 'company_type': 'Pvt Ltd',
 'last_new_job': '1'}

In [82]:
pipe.named_steps['imputer_median'].imputer_dict_

{'city_development_index': 0.903, 'training_hours': 47.0}

In [83]:
# let's transform the data with the pipeline

# this pipeline will:
#- add the missing indicators
#- fill na in the original variables
# leaving the dataset ready to use for ML

tmp = pipe.transform(X_train)

# let's check null values are gone
tmp.isnull().mean()

city_development_index    0.0
training_hours            0.0
gender                    0.0
relevent_experience       0.0
enrolled_university       0.0
education_level           0.0
major_discipline          0.0
experience                0.0
company_size              0.0
company_type              0.0
last_new_job              0.0
city                      0.0
gender_na                 0.0
enrolled_university_na    0.0
education_level_na        0.0
major_discipline_na       0.0
experience_na             0.0
company_size_na           0.0
company_type_na           0.0
last_new_job_na           0.0
dtype: float64

In [84]:
tmp.shape

(13410, 20)

## VALORES CONTINUOS OUTLIERS

In [88]:
data = tmp.copy()

#Dropping the outlier rows with standard deviation
factor = 3
upper_lim = data['training_hours'].mean () + data['training_hours'].std () * factor
lower_lim = data['training_hours'].mean () - data['training_hours'].std () * factor

data_aux = data[(data['training_hours'] < upper_lim) & (data['training_hours'] > lower_lim)]

#Dropping the outlier rows with standard deviation
factor = 3
upper_lim = data_aux['city_development_index'].mean () + data_aux['city_development_index'].std () * factor
lower_lim = data_aux['city_development_index'].mean () - data_aux['city_development_index'].std () * factor

data_aux = data_aux[(data['city_development_index'] < upper_lim) & (data_aux['city_development_index'] > lower_lim)]

data = data_aux.copy()

  data_aux = data_aux[(data['city_development_index'] < upper_lim) & (data_aux['city_development_index'] > lower_lim)]


In [89]:
data.head()

Unnamed: 0,city_development_index,training_hours,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,city,gender_na,enrolled_university_na,education_level_na,major_discipline_na,experience_na,company_size_na,company_type_na,last_new_job_na
7559,0.92,24,Female,Has relevent experience,no_enrollment,Phd,STEM,>20,1000-4999,Public Sector,2,city_103,0,0,0,0,0,0,0,0
6889,0.802,145,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Pvt Ltd,1,city_65,0,0,0,0,0,1,1,0
2617,0.92,6,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,100-500,Pvt Ltd,>4,city_103,0,0,0,0,0,0,0,0
9715,0.92,28,Male,Has relevent experience,no_enrollment,Graduate,STEM,9,1000-4999,Pvt Ltd,never,city_103,0,0,0,0,0,0,0,0
12748,0.743,72,Male,Has relevent experience,no_enrollment,Masters,STEM,17,50-99,Pvt Ltd,>4,city_116,0,0,0,0,0,1,1,0


In [90]:
data.shape

(13096, 20)

# CREACION DE FEATURE

## VARIABLE MIXTA: Extreyendo numero del feature ejm . CITY_1 = 1

In [91]:
# City
data['city_num'] = data['city'].str.extract('(\d+)') # captures numerical part
data['city_num'] = data['city_num'].astype('float')
data.head()

Unnamed: 0,city_development_index,training_hours,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,...,city,gender_na,enrolled_university_na,education_level_na,major_discipline_na,experience_na,company_size_na,company_type_na,last_new_job_na,city_num
7559,0.92,24,Female,Has relevent experience,no_enrollment,Phd,STEM,>20,1000-4999,Public Sector,...,city_103,0,0,0,0,0,0,0,0,103.0
6889,0.802,145,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Pvt Ltd,...,city_65,0,0,0,0,0,1,1,0,65.0
2617,0.92,6,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,100-500,Pvt Ltd,...,city_103,0,0,0,0,0,0,0,0,103.0
9715,0.92,28,Male,Has relevent experience,no_enrollment,Graduate,STEM,9,1000-4999,Pvt Ltd,...,city_103,0,0,0,0,0,0,0,0,103.0
12748,0.743,72,Male,Has relevent experience,no_enrollment,Masters,STEM,17,50-99,Pvt Ltd,...,city_116,0,0,0,0,0,1,1,0,116.0


In [92]:
data.drop(['city'], axis=1, inplace=True)

In [93]:
data.head()

Unnamed: 0,city_development_index,training_hours,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,gender_na,enrolled_university_na,education_level_na,major_discipline_na,experience_na,company_size_na,company_type_na,last_new_job_na,city_num
7559,0.92,24,Female,Has relevent experience,no_enrollment,Phd,STEM,>20,1000-4999,Public Sector,2,0,0,0,0,0,0,0,0,103.0
6889,0.802,145,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Pvt Ltd,1,0,0,0,0,0,1,1,0,65.0
2617,0.92,6,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,100-500,Pvt Ltd,>4,0,0,0,0,0,0,0,0,103.0
9715,0.92,28,Male,Has relevent experience,no_enrollment,Graduate,STEM,9,1000-4999,Pvt Ltd,never,0,0,0,0,0,0,0,0,103.0
12748,0.743,72,Male,Has relevent experience,no_enrollment,Masters,STEM,17,50-99,Pvt Ltd,>4,0,0,0,0,0,1,1,0,116.0


In [101]:
ohe_enc = OneHotEncoder(
    top_categories=10,  # you can change this value to select more or less variables
    # we can select which variables to encode
    variables=['experience'],
    drop_last=False)

ohe_enc.fit(data)

OneHotEncoder(top_categories=10, variables=['experience'])

In [102]:
ohe_enc.encoder_dict_

{'experience': ['>20', '4', '5', '3', '6', '2', '7', '10', '9', '8']}

In [103]:
# this is the list of variables that the encoder will transform
ohe_enc.variables_

['experience']

In [105]:
X_train = ohe_enc.transform(data)
#X_test = ohe_enc.transform(X_test)
# let's explore the result
X_train.head()

Unnamed: 0,city_development_index,training_hours,gender,relevent_experience,enrolled_university,education_level,major_discipline,company_size,company_type,last_new_job,...,experience_>20,experience_4,experience_5,experience_3,experience_6,experience_2,experience_7,experience_10,experience_9,experience_8
7559,0.92,24,Female,Has relevent experience,no_enrollment,Phd,STEM,1000-4999,Public Sector,2,...,1,0,0,0,0,0,0,0,0,0
6889,0.802,145,Male,Has relevent experience,no_enrollment,Masters,STEM,50-99,Pvt Ltd,1,...,1,0,0,0,0,0,0,0,0,0
2617,0.92,6,Male,Has relevent experience,no_enrollment,Graduate,STEM,100-500,Pvt Ltd,>4,...,1,0,0,0,0,0,0,0,0,0
9715,0.92,28,Male,Has relevent experience,no_enrollment,Graduate,STEM,1000-4999,Pvt Ltd,never,...,0,0,0,0,0,0,0,0,1,0
12748,0.743,72,Male,Has relevent experience,no_enrollment,Masters,STEM,50-99,Pvt Ltd,>4,...,0,0,0,0,0,0,0,0,0,0


## NOMRALIZAR VARIABLES CONTINUAS: transformarla 

In [43]:
data['city_development_index_boxcox'], param = stats.boxcox(data['city_development_index']) ## mirar eda2
print('Optimal λ: ', param)
### Logarithmic transformation
data['training_hours_log'] = np.log(data['training_hours']) 
data.head()

Optimal λ:  5.921288453928488


Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,city_num,city_development_index_boxcox,training_hours_log
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0,103.0,-0.065805,3.583519
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0,40.0,-0.131262,3.850148
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0,21.0,-0.158535,4.418841
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0,115.0,-0.127373,3.951244
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0,162.0,-0.133773,2.079442


## ONE HOT ENCODER

In [55]:
#categorical
# let's have a look at how many labels each variable has

for col in data[categorical].columns:
    print(col, ': ', len(data[col].unique()), ' labels')

# let's first examine how OHE expands the feature space
print(pd.get_dummies(data[categorical], drop_first=True).shape)

gender :  4  labels
relevent_experience :  2  labels
enrolled_university :  4  labels
education_level :  6  labels
major_discipline :  7  labels
experience :  23  labels
company_size :  9  labels
company_type :  7  labels
last_new_job :  7  labels
(18691, 52)


In [56]:
ohe_enc = OneHotEncoder(
    top_categories=10,  # you can change this value to select more or less variables
    # we can select which variables to encode
    variables=["experience"],
    drop_last=False)

ohe_enc.fit(data)

ValueError: Some of the variables to transform contain NaN. Check and remove those before using this transformer.

## BORRANDO variables

In [28]:
# drop original mixed

data.drop(['city'], axis=1, inplace=True)
data.drop(['enrollee_id'], axis=1, inplace=True)

In [29]:
data.head()

Unnamed: 0,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,city_num
0,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0,103.0
1,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0,40.0
2,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0,21.0
3,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0,115.0
4,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0,162.0
