In [1]:
import pandas as pd
import numpy as np

In [2]:
# importar pacotes usados na seleção do modelo e na medição da precisão
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
#from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

In [3]:
BASE = 'afastamentos'
#BASE = 'cadastro'
df = pd.read_csv(BASE + '-nomes-sexo.csv', index_col=0)

In [122]:
df.rename(columns={'sexo': 'SEXO'}, inplace=True)

In [123]:
df.head(10)

Unnamed: 0_level_0,SEXO
nome,Unnamed: 1_level_1
ABDEMES,M
ABILIS,M
ABNALDO,M
ADAILZA,M
ADALBERSON,M
ADALTINA,F
ADAMUS,M
ADEMILDA,F
ADEMISSON,M
ADERCE,F


In [124]:
df['SEXO'] = df['SEXO'].map({'F': 1, 'M': 2, 'X': 0}).astype('uint8')

In [125]:
df['REV'] = df.index
df['REV'] = df['REV'].apply(lambda x: x[::-1])

In [126]:
df.head()

Unnamed: 0_level_0,SEXO,REV
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
ABDEMES,2,SEMEDBA
ABILIS,2,SILIBA
ABNALDO,2,ODLANBA
ADAILZA,2,AZLIADA
ADALBERSON,2,NOSREBLADA


In [127]:
MAIOR_QTDE_LETRAS = 16 # fixado para o maior arquivo

for i in range(MAIOR_QTDE_LETRAS):
    df['L' + str(i)] = df['REV'].apply(
        lambda x: ord(x[i]) - 64 if len(x) > i else 0
    ).astype('uint8')

In [128]:
#df.dtypes

In [129]:
df.head(10)

Unnamed: 0_level_0,SEXO,REV,L0,L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,L11,L12,L13,L14,L15
nome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ABDEMES,2,SEMEDBA,19,5,13,5,4,2,1,0,0,0,0,0,0,0,0,0
ABILIS,2,SILIBA,19,9,12,9,2,1,0,0,0,0,0,0,0,0,0,0
ABNALDO,2,ODLANBA,15,4,12,1,14,2,1,0,0,0,0,0,0,0,0,0
ADAILZA,2,AZLIADA,1,26,12,9,1,4,1,0,0,0,0,0,0,0,0,0
ADALBERSON,2,NOSREBLADA,14,15,19,18,5,2,12,1,4,1,0,0,0,0,0,0
ADALTINA,1,ANITLADA,1,14,9,20,12,1,4,1,0,0,0,0,0,0,0,0
ADAMUS,2,SUMADA,19,21,13,1,4,1,0,0,0,0,0,0,0,0,0,0
ADEMILDA,1,ADLIMEDA,1,4,12,9,13,5,4,1,0,0,0,0,0,0,0,0
ADEMISSON,2,NOSSIMEDA,14,15,19,19,9,13,5,4,1,0,0,0,0,0,0,0
ADERCE,1,ECREDA,5,3,18,5,4,1,0,0,0,0,0,0,0,0,0,0


In [130]:
#df.describe()

In [131]:
from sklearn.decomposition import PCA

X_reduced = PCA(n_components=4).fit_transform(df.drop(['SEXO', 'REV'], axis=1))
X_reduced[:5]

array([[-13.52078631,  -6.66969255,   0.47062714,   1.46881889],
       [-14.66849263,  -2.49468877,   2.10916993,   3.92817915],
       [-10.21309109,  -8.48051482,  -7.49893459,  -3.10800101],
       [ -5.47898324,  16.33064614,  -3.66964771,  -3.15743948],
       [ -3.58103396,   0.388545  ,   0.29250288,   9.99972082]])

In [132]:
X = pd.DataFrame(X_reduced, index=df.index)
y = df['SEXO'] # apenas a coluna alvo

print('Forma dos dados originais:', X.shape, y.shape)

Forma dos dados originais: (384846, 4) (384846,)


In [133]:
X.head()

Unnamed: 0_level_0,0,1,2,3
nome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ABDEMES,-13.520786,-6.669693,0.470627,1.468819
ABILIS,-14.668493,-2.494689,2.10917,3.928179
ABNALDO,-10.213091,-8.480515,-7.498935,-3.108001
ADAILZA,-5.478983,16.330646,-3.669648,-3.157439
ADALBERSON,-3.581034,0.388545,0.292503,9.999721


In [134]:
y.head()

nome
ABDEMES       2
ABILIS        2
ABNALDO       2
ADAILZA       2
ADALBERSON    2
Name: SEXO, dtype: uint8

In [96]:
NUMBER_KFOLD_SPLITS = 5 # number of splits in cross-validation
NUMBER_GRID_ITERATIONS = 10 # number of grid iterations to parameters testing
SCORING_METRIC = 'accuracy' # the scoring metric to be used

In [97]:
from datetime import datetime

models = {}

def evaluate_model(name, model, X=X, y=y):

  start = datetime.now()
  kfold = KFold(n_splits=NUMBER_KFOLD_SPLITS, shuffle=True, random_state=42)
  results = cross_val_score(model, X, y, cv=kfold,
                            scoring=SCORING_METRIC, verbose=1, n_jobs=-1)
  end = datetime.now()

  elapsed = int((end - start).total_seconds() * 1000)
  score = results.mean() * 100
  stddev = results.std() * 100

  models[name] = (model, score, stddev, elapsed)
  print(model, '\nCross-Validation Score: %.2f (+/- %.2f) [%5s ms]' % \
        (score, stddev, elapsed))
  return score, stddev, elapsed

In [98]:
# faz o ajuste fino do modelo, calculando os melhores hiperparâmetros
def fine_tune_model(model, params, X=X, y=y):

  print('\nFine Tuning Model:')
  print(model, "\nparams:", params)
  
  kfold = KFold(n_splits=NUMBER_KFOLD_SPLITS, shuffle=True, random_state=42)

  search = RandomizedSearchCV(model, param_distributions=params, 
                              n_iter=NUMBER_GRID_ITERATIONS, 
                              scoring=SCORING_METRIC, cv=kfold, 
                              verbose=1, n_jobs=-1)
  #search = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=kfold, verbose=1)

  search.fit(X, y)
  print('\nBest Score: %.2f %%' % (search.best_score_ * 100))
  print('Best Params:', search.best_params_)
  return search

In [99]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    random_state=42, multi_class='auto', 
    max_iter=1000, solver='liblinear', C=1.0) # 85.18
evaluate_model('LR', model)

params = dict(
    solver=['liblinear','lbfgs'], 
    C=np.logspace(-3,3,7)
)
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


LogisticRegression(max_iter=1000, random_state=42, solver='liblinear') 
Cross-Validation Score: 79.22 (+/- 0.30) [ 2834 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.7s finished


In [100]:
# Linear SVM
from sklearn.svm import LinearSVC

model = LinearSVC(random_state=42, max_iter=1000, C=0.001) # 85.27
evaluate_model('LSVM', model)

#from scipy.stats import reciprocal, uniform
#params = dict(C=uniform(1, 10))
params = dict(
    C=np.logspace(-4,3,8)
)
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


LinearSVC(C=0.001, random_state=42) 
Cross-Validation Score: 79.42 (+/- 0.32) [ 1816 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.7s finished


In [101]:
# K-Nearest Neighbours (KNN)
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=11) # 87.45
evaluate_model('KNN', model)

params = dict(
    n_neighbors=[1,3,5,7,9,11]
)
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNeighborsClassifier(n_neighbors=11) 
Cross-Validation Score: 81.31 (+/- 0.31) [ 5846 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.7s finished


In [102]:
# Linear Discriminant Analysis (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis(solver='svd') # 84.45
evaluate_model('LDA', model)

params = dict(
    solver=['svd','lsqr','eigen']
)
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


LinearDiscriminantAnalysis() 
Cross-Validation Score: 79.40 (+/- 0.33) [  872 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished


In [103]:
# Quadratic Discriminant Analysis (QDA)
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model = QuadraticDiscriminantAnalysis() # 44.99
evaluate_model('QDA', model)

params = dict()
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


QuadraticDiscriminantAnalysis() 
Cross-Validation Score: 80.49 (+/- 0.25) [  623 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


In [104]:
# Naïve Bayes - Gaussian
from sklearn.naive_bayes import GaussianNB

model = GaussianNB(priors=None, var_smoothing=1.0) # 67.62
evaluate_model('GNB', model)

params = dict(
    var_smoothing=np.logspace(0,-9,num=10)
)
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


GaussianNB(var_smoothing=1.0) 
Cross-Validation Score: 77.87 (+/- 0.20) [  604 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


In [105]:
# Naïve Bayes - Bernoulli
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB() # 43.37
evaluate_model('BNB', model)

#params = dict(var_smoothing=np.logspace(0,-9,num=10))
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


BernoulliNB() 
Cross-Validation Score: 70.91 (+/- 0.16) [  667 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


(70.9055644729802, 0.15607149138866988, 667)

In [106]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(
    random_state=42, criterion='entropy', max_depth=7) # 96.37
evaluate_model('DT', model)

params = dict(
    criterion=['gini','entropy'],
    max_depth=[3,5,7,11,13,17,19]
)
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DecisionTreeClassifier(criterion='entropy', max_depth=7, random_state=42) 
Cross-Validation Score: 78.32 (+/- 0.27) [ 2378 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.3s finished


In [107]:
# Ensemble - Random Forest
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    random_state=42, max_features='sqrt', n_estimators=500) # 93.86
evaluate_model('RF', model)

params = dict(
    n_estimators=[10,50,100,500], 
    max_features=['auto','sqrt','log2']
)
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


RandomForestClassifier(max_features='sqrt', n_estimators=500, random_state=42) 
Cross-Validation Score: 76.43 (+/- 0.33) [504775 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  8.4min finished


In [108]:
# Ensemble - Ada Boost
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(
    DecisionTreeClassifier(random_state=42, criterion='entropy', max_depth=17), 
    n_estimators=11) # 96.41
evaluate_model('ABDT', model)

params = dict(
    n_estimators=[1,3,5,7,9,11,13]
)
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                         max_depth=17,
                                                         random_state=42),
                   n_estimators=11) 
Cross-Validation Score: 72.45 (+/- 0.56) [51655 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   51.5s finished


In [109]:
# Stochastic Gradient Descent (SGD)
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(random_state=42, max_iter=350, tol=0.01) # 77.02
evaluate_model('SGD', model)

params = dict(
    max_iter=[100, 200, 350, 500, 1000], 
    tol=[0.01, 0.1, 1.0]
)
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


SGDClassifier(max_iter=350, random_state=42, tol=0.01) 
Cross-Validation Score: 74.38 (+/- 1.82) [ 2049 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.9s finished


In [110]:
# Perceptron
from sklearn.linear_model import Perceptron

model = Perceptron(random_state=42, max_iter=500, tol=0.1) # 75.58
evaluate_model('PCT', model)

params = dict(
    max_iter=[100, 200, 350, 500, 750, 1000], 
    tol=[0.1, 0.01, 0.001]
)
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Perceptron(max_iter=500, random_state=42, tol=0.1) 
Cross-Validation Score: 73.00 (+/- 3.95) [ 1007 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished


In [111]:
# Extreme Gradient Boosting Machine (XGBM)
from xgboost import XGBClassifier

model = XGBClassifier(
    max_depth=3, min_child_weight=3, gamma=0.4, 
    subsample=0.7, colsample_bytree=0.8, reg_alpha=1e-05) # 97.67
evaluate_model('XGB', model)

params = dict(max_depth=range(3,10,2), min_child_weight=range(1,6,2)) # 97.60
#params = dict(gamma=[i/10.0 for i in range(0,5)]) # 97.68
#params = dict(subsample=[i/10.0 for i in range(6,10)], colsample_bytree=[i/10.0 for i in range(6,10)]) # 97.72
#params = dict(reg_alpha=[1e-5, 1e-2, 0.1, 1, 100]) # 97.74
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8, gamma=0.4,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=3,
              min_child_weight=3, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=1e-05, reg_lambda=None,
              scale_pos_weight=None, subsample=0.7, tree_method=None,
              validate_parameters=None, verbosity=None) 
Cross-Validation Score: 80.00 (+/- 0.32) [26539 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   26.4s finished


In [112]:
# LightGBM
from lightgbm import LGBMClassifier
#!pip install lightgbm

model = LGBMClassifier(
    num_leaves=20, max_depth=3, 
    learning_rate=0.1, feature_fraction=1.0) # 97.51
evaluate_model('LGBM', model)

params = dict(num_leaves=range(10,100), max_depth=range(1,30)) # 97.18
#params = dict(learning_rate=[0.01,0.05,0.1,0.5]) # 97.51
#params = dict(feature_fraction=[0.1,0.25,0.5,0.75,1.0]) # 97.62
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


LGBMClassifier(feature_fraction=1.0, max_depth=3, num_leaves=20) 
Cross-Validation Score: 79.50 (+/- 0.36) [ 4598 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.5s finished


In [113]:
# CatBoost
from catboost import CatBoostClassifier
#!pip install catboost

model = CatBoostClassifier(
    silent=True, iterations=250,
    learning_rate=0.05, depth=7) # 97.04
#learning_rate=0.1, iterations=1000, depth=5
evaluate_model('CB', model)

params = dict(
    iterations=[10,50,100,250,500,1000],
    learning_rate=[0.01,0.05,0.1,0.5],
    depth=range(1,11,2)
) 
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


<catboost.core.CatBoostClassifier object at 0x7ff617e71970> 
Cross-Validation Score: 81.46 (+/- 0.36) [43896 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   43.8s finished


In [114]:
names = []
estimators = []
scores = []
stdevs = []
times = []

for key, value in models.items():
  (model, score, stdev, elapsed) = value
  names.append(key)
  estimators.append(model)
  scores.append(score)
  stdevs.append(stdev)
  times.append(elapsed)

In [115]:
results_df = pd.DataFrame({
    'Model': names,
    'Score': scores,
    'Std Dev': stdevs,
    'Time (ms)': times,
    'Estimator': estimators})

results_df.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score,Std Dev,Time (ms),Estimator
14,CB,81.457999,0.361765,43896,<catboost.core.CatBoostClassifier object at 0x...
2,KNN,81.310861,0.309148,5846,KNeighborsClassifier(n_neighbors=11)
4,QDA,80.488229,0.246629,623,QuadraticDiscriminantAnalysis()
12,XGB,79.997325,0.316124,26539,"XGBClassifier(base_score=None, booster=None, c..."
13,LGBM,79.497057,0.361814,4598,"LGBMClassifier(feature_fraction=1.0, max_depth..."
1,LSVM,79.4168,0.323653,1816,"LinearSVC(C=0.001, random_state=42)"
3,LDA,79.398074,0.325609,872,LinearDiscriminantAnalysis()
0,LR,79.218834,0.299709,2834,"LogisticRegression(max_iter=1000, random_state..."
7,DT,78.31862,0.270925,2378,"DecisionTreeClassifier(criterion='entropy', ma..."
5,GNB,77.874532,0.204371,604,GaussianNB(var_smoothing=1.0)


In [118]:
model = models['CB'][0]
model

<catboost.core.CatBoostClassifier at 0x7ff617e71970>

In [119]:
model.fit(X, y)

<catboost.core.CatBoostClassifier at 0x7ff617e71970>

In [135]:
y_pred = model.predict(X)
y_pred

array([2, 2, 2, ..., 2, 1, 1])

In [136]:
dados = pd.DataFrame({'REAL': y, 'PREV': y_pred}, index=X.index)
for col in dados.columns:
    dados[col] = dados[col].map({0: 'X', 1: 'F', 2: 'M'})
dados.head()

Unnamed: 0_level_0,REAL,PREV
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
ABDEMES,M,M
ABILIS,M,M
ABNALDO,M,M
ADAILZA,M,F
ADALBERSON,M,M


In [137]:
from sklearn.metrics import accuracy_score

accuracy_score(y, y_pred)

0.7844670335666735

In [138]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y, y_pred)

array([[153126,  47749],
       [ 35198, 148773]])

In [139]:
dados[dados['REAL'] != dados['PREV']].head(20)

Unnamed: 0_level_0,REAL,PREV
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
ADAILZA,M,F
ADERCE,F,M
ADEVIR,F,M
AGAONE,M,F
AGMAIR,F,M
ALDEIDI,F,M
ALDOMAR,F,M
ALDRIAN,F,M
ALDRIM,F,M
ALHAGIE,M,F
