In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from transformations_pipeline import DropColumns
from transformations_pipeline import Encoder
from transformations_pipeline import Imputer
%matplotlib inline

In [None]:
#Importing the dataset

In [None]:
# Load the training set from the project's Data folder.
df = pd.read_csv('./Data/train_set.csv')

In [None]:
# List the column names available in the training set.
df.columns

In [None]:
#database with 8995 datapoints

In [None]:
# (rows, columns) of the loaded training set.
df.shape

In [None]:
#Each variable has, on average, about 1300 null values.
#The average age of the Digital House students is 34 years.
#The average performance at Digital House is 3.5.
#The average number of minutes dedicated to the course is 4419.
#The average experience of the Digital House students is 16.11 years.
#The average number of days to find a job is 90.

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
#We will now find outliers and imbalances in the dataset

In [None]:
#Not many outliers in the DIAS_EMP target variable, but all of them in the upper side of the distribution.
#The minimum days spent to find a job is 79

In [None]:
# Box plot of DIAS_EMP to inspect its outliers.
box_days = df['DIAS_EMP'].plot.box()

In [None]:
# Write the DIAS_EMP box plot to the graphs folder.
fig = box_days.get_figure()
fig.savefig('./graphs/box_days.png')

In [None]:
#In age, there are outliers on both the upper and lower sides of the distribution.
#There is a 15-year-old who took 84 days to find a job. That number of days is not an outlier,
#so we are going to keep that data point for training.

In [None]:
# Rows whose EDAD value is exactly 15.
df.loc[df['EDAD'] == 15]

In [None]:
# Box plot of EDAD to inspect its outliers.
box_age = df['EDAD'].plot.box()

In [None]:
# Write the EDAD box plot to the graphs folder.
fig = box_age.get_figure()
fig.savefig('./graphs/box_age.png')

In [None]:
#AVG_DH also has outliers on both sides of the distribution

In [None]:
# Box plot of AVG_DH to inspect its outliers.
box_avg = df['AVG_DH'].plot.box()

In [None]:
# Write the AVG_DH box plot to the graphs folder.
fig = box_avg.get_figure()
fig.savefig('./graphs/box_avg.png')

In [None]:
#The MINUTES_DH variable has a lot of variance: a great concentration around the mean but huge variability in both tails

In [None]:
# Box plot of MINUTES_DH to inspect its outliers.
box_minutes = df['MINUTES_DH'].plot.box()

In [None]:
# Write the MINUTES_DH box plot to the graphs folder.
fig = box_minutes.get_figure()
fig.savefig('./graphs/box_minutes.png')

In [None]:
# Kernel density estimate of MINUTES_DH.
dist_minutes = df['MINUTES_DH'].plot.density()

In [None]:
# Write the MINUTES_DH density plot to the graphs folder.
fig = dist_minutes.get_figure()
fig.savefig('./graphs/dist_minutes.png')

In [None]:
#There are no outliers in EXPERIENCIA

In [None]:
# Box plot of EXPERIENCIA to inspect its outliers.
box_exp = df['EXPERIENCIA'].plot.box()

In [None]:
# Write the EXPERIENCIA box plot to the graphs folder.
fig = box_exp.get_figure()
fig.savefig('./graphs/box_exp.png')

In [None]:
#Genero

In [None]:
# Bar chart of how many rows fall in each GENERO category, saved to disk.
gender_counts = df['GENERO'].value_counts()
bar_gender = gender_counts.plot.bar()
plt.xticks(rotation=360)  # a full turn, i.e. horizontal tick labels
fig = bar_gender.get_figure()
fig.savefig('./graphs/bar_gender.png')

In [None]:
#Residencia

In [None]:
# Bar chart of how many rows fall in each RESIDENCIA category, saved to disk.
residence_counts = df['RESIDENCIA'].value_counts()
bar_residence = residence_counts.plot.bar()
plt.xticks(rotation=360)  # a full turn, i.e. horizontal tick labels
fig = bar_residence.get_figure()
fig.savefig('./graphs/bar_residence.png')

In [None]:
#NV ESTUDIO

In [None]:
# Bar chart of how many rows fall in each NV_ESTUDIO category, saved to disk.
studies_counts = df['NV_ESTUDIO'].value_counts()
bar_studies = studies_counts.plot.bar()
plt.xticks(rotation=360)  # a full turn, i.e. horizontal tick labels
fig = bar_studies.get_figure()
fig.savefig('./graphs/bar_studies.png')

In [None]:
#Estudio Prev

In [None]:
# Bar chart of how many rows fall in each ESTUDIO_PREV category, saved to disk.
prev_counts = df['ESTUDIO_PREV'].value_counts()
bar_prev = prev_counts.plot.bar()
plt.xticks(rotation=10)  # slight tilt for the tick labels
fig = bar_prev.get_figure()
fig.savefig('./graphs/bar_prev.png')

In [None]:
#Course

In [None]:
# Bar chart of how many rows fall in each TRACK_DH category, saved to disk.
course_counts = df['TRACK_DH'].value_counts()
bar_course = course_counts.plot.bar()
plt.xticks(rotation=360)  # a full turn, i.e. horizontal tick labels
fig = bar_course.get_figure()
fig.savefig('./graphs/bar_course.png')

In [None]:
# Double check for outliers, counting them with the IQR

In [None]:
# Quartiles and inter-quartile range used for the 1.5*IQR outlier fences.
# Quantiles are only defined for numeric data, and recent pandas versions
# raise when DataFrame.quantile meets string columns (GENERO, RESIDENCIA, ...),
# so the frame is restricted to its numeric columns first.
numeric_cols = df.select_dtypes(include='number')
q1 = numeric_cols.quantile(0.25)
q3 = numeric_cols.quantile(0.75)
iqr = q3 - q1

In [None]:
# Count, per column, the values outside either 1.5*IQR fence.
# Only the columns the fences were computed for are compared: the string
# columns have no fence, and comparing them against the aligned NaN is
# meaningless (and can raise a TypeError, depending on the pandas version).
fenced = df[q1.index]
((fenced < (q1 - 1.5 * iqr)) | (fenced > (q3 + 1.5 * iqr))).sum()

In [None]:
# Count, per column, the values below the lower fence (q1 - 1.5*IQR).
# Restricted to the columns that actually have a fence; string columns
# cannot be compared meaningfully against a numeric threshold.
(df[q1.index] < (q1 - 1.5 * iqr)).sum()

In [None]:
# Count, per column, the values above the upper fence (q3 + 1.5*IQR).
# Restricted to the columns that actually have a fence; string columns
# cannot be compared meaningfully against a numeric threshold.
(df[q3.index] > (q3 + 1.5 * iqr)).sum()

In [None]:
#Correlations
#AVG_DH and MINUTES_DH are the variables that exhibit the least correlation with DIAS_EMP

In [None]:
# Pairwise correlation matrix of the numeric variables.
# The selection also lists string-valued columns (GENERO, RESIDENCIA, ...);
# recent pandas versions raise on those inside .corr(), so they are filtered
# out explicitly before the correlation is computed.
corr_columns = [
    'EDAD', 'GENERO', 'RESIDENCIA', 'NV_ESTUDIO',
    'ESTUDIO_PREV', 'TRACK_DH', 'AVG_DH', 'MINUTES_DH', 'EXPERIENCIA',
    'DIAS_EMP',
]
correlations = df[corr_columns].select_dtypes(include='number').corr()

In [None]:
# Display the correlation matrix computed in the previous cell.
correlations

In [None]:
# Heat map of the correlation matrix, saved to the graphs folder.
correlations_heatmap = sns.heatmap(correlations)
plt.tight_layout()
fig = correlations_heatmap.get_figure()
fig.savefig('./graphs/correlations_hmap.png')

In [None]:
# Pairwise scatter/distribution grid over the selected columns, saved to disk.
pairplot_columns = [
    'EDAD', 'GENERO', 'RESIDENCIA', 'NV_ESTUDIO',
    'ESTUDIO_PREV', 'TRACK_DH', 'AVG_DH', 'MINUTES_DH', 'EXPERIENCIA',
    'DIAS_EMP',
]
pairplot = sns.pairplot(df[pairplot_columns])
plt.tight_layout()
pairplot.savefig('./graphs/pairplot.png')

In [None]:
# Box plots of DIAS_EMP split by GENERO, saved to the graphs folder.
men_women = sns.catplot(data=df, x='GENERO', y='DIAS_EMP', kind='box')
plt.tight_layout()
men_women.savefig('./graphs/men_women.png')

In [None]:
# Box plots of DIAS_EMP split by RESIDENCIA.
sns.catplot(data=df, x='RESIDENCIA', y='DIAS_EMP', kind='box')

In [None]:
# Box plots of DIAS_EMP split by NV_ESTUDIO.
sns.catplot(data=df, x='NV_ESTUDIO', y='DIAS_EMP', kind='box')

In [None]:
# Box plots of DIAS_EMP split by ESTUDIO_PREV.
sns.catplot(data=df, x='ESTUDIO_PREV', y='DIAS_EMP', kind='box')

In [None]:
# Box plots of DIAS_EMP split by TRACK_DH.
sns.catplot(data=df, x='TRACK_DH', y='DIAS_EMP', kind='box')

## Baseline R² (linear regression)

In [72]:
# Split the frame into predictors (every column except the target) and target.
target = ['DIAS_EMP']
features = df.columns.tolist()
features.remove(target[0])
X, y = df[features], df[target]

In [73]:
# Preprocessing pipeline for the baseline model (no scaling step).
# The transformers are built here so this cell runs top-to-bottom on a fresh
# kernel instead of relying on objects that are only created in the XGBoost
# section further down.
drop_cols = DropColumns(['Unnamed: 0'])  # presumably the index column saved in the CSV
encode = Encoder(['GENERO', 'RESIDENCIA', 'NV_ESTUDIO', 'ESTUDIO_PREV', 'TRACK_DH'])
impute = Imputer(['EDAD', 'AVG_DH', 'MINUTES_DH', 'EXPERIENCIA'])

df_pipeline = Pipeline(
    steps=[
        ('remove_columns', drop_cols),
        ('encode', encode),
        ('impute', impute),
    ]
)

In [74]:
# Fit the preprocessing steps and transform the raw predictors.
# Reading from df[features] rather than from X keeps the cell idempotent:
# re-running it never feeds already-transformed data back into the pipeline.
X = df_pipeline.fit_transform(df[features])

In [75]:
# Display the transformed feature matrix.
X

Unnamed: 0,EDAD,AVG_DH,MINUTES_DH,EXPERIENCIA,GENERO_FEMENINO,GENERO_MASCULINO,RESIDENCIA_ARGENTINA,RESIDENCIA_BRAZIL,RESIDENCIA_MEXICO,NV_ESTUDIO_POST_GRADUATE,...,NV_ESTUDIO_UNIVERSITARY,ESTUDIO_PREV_BUSINESS,ESTUDIO_PREV_COMMERCIAL,ESTUDIO_PREV_DEVELOPMENT,ESTUDIO_PREV_ENGINEERING,ESTUDIO_PREV_MARKETING,TRACK_DH_DATA,TRACK_DH_EJECUTIVO,TRACK_DH_MARKETING,TRACK_DH_PROGRAMACION
0,37.0,3.100000,4131.5,21.300000,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
1,40.0,3.100000,4160.4,25.200000,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
2,35.0,3.100000,4087.6,18.000000,0,0,1,0,0,0,...,1,0,0,1,0,0,1,0,0,0
3,33.0,3.100000,4043.2,13.600000,1,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
4,29.0,3.600000,4688.0,9.137713,0,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8990,29.0,4.000000,4701.6,9.100000,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
8991,34.0,3.400000,4646.2,16.800000,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,1
8992,28.0,2.722034,3315.1,5.600000,1,0,1,0,0,1,...,0,0,0,0,1,0,0,1,0,0
8993,23.0,3.300000,4437.8,0.900000,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1


In [76]:
# Baseline: ordinary least squares scored with 15-fold cross-validated R^2.
# LinearRegression and cross_validate come from the notebook's top import cell.
# NOTE(review): X was preprocessed on the full dataset before the folds are
# split, so the imputation presumably sees validation rows; confirm whether
# the scores are slightly optimistic because of that.
linreg = LinearRegression()
cv = cross_validate(linreg, X, y, cv=15, scoring='r2')
mean_r2 = cv['test_score'].mean()
print([mean_r2])

[0.8018768630486648]


## XGBoost

In [77]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.model_selection import KFold, cross_validate
from sklearn.linear_model import LinearRegression
from transformations_pipeline import DropColumns
from transformations_pipeline import Imputer
from transformations_pipeline import Encoder
from transformations_pipeline import Scaler
from sklearn.model_selection import GridSearchCV
%matplotlib inline
# Reload the raw training set for the XGBoost section.
df = pd.read_csv('./Data/train_set.csv')

In [59]:
# Split the frame into predictors (every column except the target) and target.
target = ['DIAS_EMP']
features = df.columns.tolist()
features.remove(target[0])
X, y = df[features], df[target]

In [60]:
# Transformer configured with the 'Unnamed: 0' column.
drop_cols = DropColumns(['Unnamed: 0'])

In [61]:
# Encoder configured with the categorical columns.
encode = Encoder([
    'GENERO',
    'RESIDENCIA',
    'NV_ESTUDIO',
    'ESTUDIO_PREV',
    'TRACK_DH',
])

In [62]:
# Imputer configured with the four numeric columns.
impute = Imputer([
    'EDAD',
    'AVG_DH',
    'MINUTES_DH',
    'EXPERIENCIA',
])

In [63]:
# Scaler configured with the same four numeric columns as the imputer.
scale = Scaler([
    'EDAD',
    'AVG_DH',
    'MINUTES_DH',
    'EXPERIENCIA',
])

In [64]:
# Preprocessing pipeline for the XGBoost model: same steps as the baseline
# plus a final scaling step.
pipeline_steps = [
    ('remove_columns', drop_cols),
    ('encode', encode),
    ('impute', impute),
    ('scale', scale),
]
df_pipeline = Pipeline(steps=pipeline_steps)

In [65]:
# Fit the preprocessing steps and transform the raw predictors.
# Reading from df[features] rather than from X keeps the cell idempotent:
# re-running it never feeds already-transformed data back into the pipeline.
X = df_pipeline.fit_transform(df[features])

In [71]:
# Gradient-boosted regressor scored with 15-fold cross-validated R^2.
xgb_params = {
    'learning_rate': 0.14,
    'n_estimators': 90,
    'max_depth': 3,
    'alpha': 0.9,
    'n_jobs': -1,
}
gbreg = xgb.XGBRegressor(**xgb_params)
cv = cross_validate(gbreg, X, y, cv=15, scoring='r2')
mean_r2 = cv['test_score'].mean()
print([mean_r2])

[0.8095108570327242]
