**1. Load Dataset**

In [None]:
# pandas supplies the DataFrame type used throughout this notebook
import pandas as pd

# Published Google Sheets document, exported as CSV
DATASET_URL = "https://docs.google.com/spreadsheets/d/e/2PACX-1vSfE_9K1uPYtXkEA_oncBZEGne6D6uhJSBCDqASxid4jIlQiyu_Z3629s0yyhNQxxkb6Q4wnUwDtlNr/pub?gid=0&single=true&output=csv"

# Download the sheet and parse it into a DataFrame
rrhh_ds = pd.read_csv(DATASET_URL)

# Preview the first rows
rrhh_ds.head()


**2. Filter Dataset**

In [None]:
# Keep only the attributes used in the rest of the notebook.
selected_columns = [
    'NivelSatisfaccion',
    'UltimaEvaluacion',
    'ProyectosRealizados',
    'HorasMensuales',
    'Antiguedad',
    'Ascendido_Disc',
    'AreaTrabajo',
    'NivelSalarial',
    'Renuncia_Disc',
]

# .copy() makes the subset an independent DataFrame. Later preprocessing cells
# assign into its columns; without the copy those assignments can trigger
# pandas' SettingWithCopyWarning because the subset is still tied to the
# originally loaded frame.
rrhh_ds = rrhh_ds[selected_columns].copy()

# Preview the filtered dataset
rrhh_ds.head()

**3.1. Preprocess Dataset - Part 1: Standard Scaler**

In [None]:
from sklearn import preprocessing

# Single column used for this standardization demonstration
projects = rrhh_ds[['ProyectosRealizados']]

# Fit on the raw column: mean_ / scale_ describe the data before scaling
std_scaler = preprocessing.StandardScaler()
std_scaler.fit(projects)

print("\n=====Before Standard Scaler:=====\n")
print("Mean:", std_scaler.mean_)
print("Std:", std_scaler.scale_)

# Standardize into a temporary variable; rrhh_ds itself is not modified
rrhh_ds_pr_temp = std_scaler.transform(projects)

# Refit a fresh scaler on the standardized values to inspect their statistics
# (note: std_scaler now refers to this second scaler)
std_scaler = preprocessing.StandardScaler()
std_scaler.fit(rrhh_ds_pr_temp)

print("\n=====After Standard Scaler:=====\n")
print("Mean:", std_scaler.mean_)
print("Std:", std_scaler.scale_)

**3.2. Preprocess Dataset - Part 2: MinMax Scaler**

In [None]:
# Columns rescaled in this step
scaled_columns = ['ProyectosRealizados', 'HorasMensuales', 'Antiguedad']

# Transform the data to the [0, 1] scale (MinMaxScaler's default range),
# overwriting the original columns in rrhh_ds
min_max_scaler = preprocessing.MinMaxScaler()
rrhh_ds[scaled_columns] = min_max_scaler.fit_transform(rrhh_ds[scaled_columns])

# Preview the rescaled dataset
rrhh_ds.head()

**3.3. Preprocess Dataset - Part 3: One Hot Encoder**

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Categorical attributes expanded into one indicator column per category
categorical_columns = ['AreaTrabajo', 'NivelSalarial']

# Create the encoder; categories unseen during fit are ignored at transform time
enc = OneHotEncoder(handle_unknown='ignore')

# fit_transform returns a sparse matrix by default, hence .toarray()
encoded_values = enc.fit_transform(rrhh_ds[categorical_columns]).toarray()

# Give every indicator column a string name "<attribute>_<category>".
# Leaving the default integer labels (0, 1, 2, ...) would mix int and str
# column names in rrhh_ds, which recent scikit-learn versions reject when an
# estimator is fitted on the DataFrame, and would make the features unreadable.
encoded_names = [
    f"{column}_{category}"
    for column, categories in zip(categorical_columns, enc.categories_)
    for category in categories
]

# Reuse rrhh_ds' index so the join below aligns row by row
enc_df = pd.DataFrame(encoded_values, columns=encoded_names, index=rrhh_ds.index)

# Merge the indicator columns into the main dataset
rrhh_ds = rrhh_ds.join(enc_df)

# Drop the original categorical columns now that they are encoded
rrhh_ds = rrhh_ds.drop(categorical_columns, axis=1)

# Preview the encoded dataset
rrhh_ds.head()


**3.4. Preprocess Dataset - Part 4: Label Encoder**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the text labels of both columns with integer codes
# (presumably YES/NO -> 0/1, per the original author's note).
# One encoder instance is refit per column, so after the loop `lb` holds the
# classes of the last column processed ('Renuncia_Disc').
lb = LabelEncoder()
for column in ('Ascendido_Disc', 'Renuncia_Disc'):
    rrhh_ds[column] = lb.fit_transform(rrhh_ds[column])

rrhh_ds

**4A. Feature Selection: Wrapper methods**

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Split X (features) and Y (target)
Y = rrhh_ds['Renuncia_Disc']
X = rrhh_ds.drop(['Renuncia_Disc'], axis=1)

print("RRHH Dataset size: ", X.shape)

# Extra-trees builds randomized trees; fixing random_state keeps the feature
# importances (and therefore the selected features) identical across runs.
clf = ExtraTreesClassifier(n_estimators=50, random_state=0)
clf = clf.fit(X, Y)

# One importance value per column of X, in column order
print(clf.feature_importances_)

# prefit=True reuses the already-trained classifier; with no explicit
# threshold, features whose importance is at least the mean importance are kept.
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)

print("X_new Dataset size: ", X_new.shape)



**4B. Feature Selection: Embedded methods**

In [None]:
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
import numpy as np

# Split X and Y inside this cell, like the other feature-selection cells do,
# so it does not depend on variables left over from a previously run cell.
Y = rrhh_ds['Renuncia_Disc']
X = rrhh_ds.drop(['Renuncia_Disc'], axis=1)

# The L1 penalty drives the coefficients of uninformative features to zero.
# random_state fixes the data shuffling done by the liblinear solver so the
# selection is reproducible.
sel_ = SelectFromModel(
    LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=0)
)
sel_.fit(X, Y)

# Names of the features kept by the selector
selected_feat = X.columns[sel_.get_support()]

print('\nTotal features: {}'.format((X.shape[1])))
print('\nSelected features: {}'.format(len(selected_feat)))
print('\nFeatures with coefficients shrank to zero: {}'.format(np.sum(sel_.estimator_.coef_ == 0)))

# coef_ has shape (1, n_features) for a binary target; flatten it into a
# per-feature boolean mask
zero_coef_mask = (sel_.estimator_.coef_ == 0).ravel()
removed_feats = X.columns[zero_coef_mask]

print("\nRemove features:\n")
print(removed_feats)

# Remove features and build a new filtered dataset.
# Transform exactly the data the selector was fitted on: the fit above would
# already have failed if X contained missing values, so filling NaNs only at
# transform time was inconsistent with the fit and had no effect.
X_new = sel_.transform(X)



**4C. Feature Selection: Filter methods**

In [None]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Number of features to keep
NUMBER_FEATURES = 10

# Split X and Y
Y = rrhh_ds['Renuncia_Disc']
X = rrhh_ds.drop(['Renuncia_Disc'], axis=1)

# mutual_info_classif adds small random noise to continuous features, so its
# scores vary between runs; binding random_state makes the selection reproducible.
score_func = partial(mutual_info_classif, random_state=0)

selector = SelectKBest(score_func, k=NUMBER_FEATURES)
selector.fit(X, Y)

# Keep the selected columns as a DataFrame so their names are preserved
cols = selector.get_support(indices=True)
X_new = X.iloc[:, cols]

# Turn the target into a single-column DataFrame named 'Renuncia'
Y = pd.DataFrame(Y)
Y.columns = ['Renuncia']

# Show the names of the selected attributes. (Slicing X_new[:NUMBER_FEATURES]
# would return the first NUMBER_FEATURES rows, not the selected features.)
print("\nSelected Attributes:\n")
print(list(X_new.columns))


**5. Split Dataset**

In [None]:
from sklearn.model_selection import train_test_split

# Rebuild a single frame: selected features plus the 'Renuncia' target column
rrhh_ds = X_new.join(Y)

# Split dataset (train and test).
# random_state makes the split, and every metric computed from it, reproducible;
# stratify keeps the proportion of each 'Renuncia' class the same in both parts.
train, test = train_test_split(
    rrhh_ds,
    test_size=0.15,
    random_state=0,
    stratify=rrhh_ds['Renuncia'],
)

# Split X and y
X_train = train.drop('Renuncia', axis=1)
X_test = test.drop('Renuncia', axis=1)
y_train = train['Renuncia']
y_test = test['Renuncia']

**6. Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Train an L2-regularized logistic regression on the training split
clf = LogisticRegression(random_state=0,penalty='l2').fit(X_train, y_train)

#Evaluate Model
from sklearn.metrics import classification_report,confusion_matrix

# Hard class predictions, used for the confusion matrix and the report
predictions = clf.predict(X_test)

print("\n=====Confusion Matrix (Logistic Regression):=====\n")
print(confusion_matrix(y_test,predictions))

print("\n=====Classification Report (Logistic Regression):=====\n")
print(classification_report(y_test,predictions))

#Plot ROC Curve
from sklearn import metrics
import matplotlib.pyplot as plt

# A ROC curve needs continuous scores: built from hard 0/1 predictions it
# collapses to a single operating point and the AUC is misleading.
# Column 1 of predict_proba is the probability of clf.classes_[1]
# (assumes the binary target is encoded 0/1, so that is the positive class).
positive_scores = clf.predict_proba(X_test)[:, 1]

fpr, tpr, _ = metrics.roc_curve(y_test, positive_scores)
auc = metrics.roc_auc_score(y_test, positive_scores)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)

print("\n=====ROC Curve:=====\n")

plt.show()

**7. Decision Trees**

In [None]:
from sklearn import tree

# random_state fixes the feature permutation used when searching for splits,
# so the fitted tree and its metrics are reproducible.
clf_dt = tree.DecisionTreeClassifier(random_state=0)
clf_dt = clf_dt.fit(X_train, y_train)

#Evaluate Model
from sklearn.metrics import classification_report,confusion_matrix

# Hard class predictions, used for the confusion matrix and the report
predictions = clf_dt.predict(X_test)

print("\n=====Confusion Matrix (Decision Tree):=====\n")
print(confusion_matrix(y_test,predictions))

print("\n=====Classification Report (Decision Tree):=====\n")
print(classification_report(y_test,predictions))

#Plot ROC Curve
from sklearn import metrics
import matplotlib.pyplot as plt

# A ROC curve needs continuous scores rather than hard 0/1 predictions.
# Column 1 of predict_proba is the probability of clf_dt.classes_[1]
# (assumes the binary target is encoded 0/1, so that is the positive class).
positive_scores = clf_dt.predict_proba(X_test)[:, 1]

fpr, tpr, _ = metrics.roc_curve(y_test, positive_scores)
auc = metrics.roc_auc_score(y_test, positive_scores)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)

print("\n=====ROC Curve:=====\n")

plt.show()


**8. Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest (trees limited to depth 5) with a fixed seed
clf_rf = RandomForestClassifier(max_depth=5, random_state=0)
clf_rf = clf_rf.fit(X_train, y_train)

#Evaluate Model
from sklearn.metrics import classification_report,confusion_matrix

# Hard class predictions, used for the confusion matrix and the report
predictions = clf_rf.predict(X_test)

print("\n=====Confusion Matrix (Random Forrest):=====\n")
print(confusion_matrix(y_test,predictions))

print("\n=====Classification Report (Random Forrest):=====\n")
print(classification_report(y_test,predictions))

#Plot ROC Curve
from sklearn import metrics
import matplotlib.pyplot as plt

# A ROC curve needs continuous scores: built from hard 0/1 predictions it
# collapses to a single operating point and the AUC is misleading.
# Column 1 of predict_proba is the probability of clf_rf.classes_[1]
# (assumes the binary target is encoded 0/1, so that is the positive class).
positive_scores = clf_rf.predict_proba(X_test)[:, 1]

fpr, tpr, _ = metrics.roc_curve(y_test, positive_scores)
auc = metrics.roc_auc_score(y_test, positive_scores)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc=4)

print("\n=====ROC Curve:=====\n")

plt.show()
