In [1]:
import pandas as pd
import numpy as np

In [2]:
# importar pacotes usados na seleção do modelo e na medição da precisão
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
#from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

In [3]:
BASE = 'afastamentos'
#BASE = 'cadastro'
df = pd.read_csv(BASE + '-nomes-sexo.csv', index_col=0)

In [4]:
df.rename(columns={'sexo': 'SEXO'}, inplace=True)

In [5]:
df.head(10)

Unnamed: 0_level_0,SEXO
PNOME,Unnamed: 1_level_1
MARIA,F
JOSE,M
ANTONIO,M
FRANCISCO,M
JOAO,M
CARLOS,M
ANA,F
PAULO,M
LUIZ,M
MARCOS,M


In [6]:
df['SEXO'] = df['SEXO'].map({'F': 1, 'M': 2, 'X': 0}).astype('uint8')

In [7]:
df['REV'] = df.index
df['REV'] = df['REV'].apply(lambda x: x[::-1])

In [8]:
df.head()

Unnamed: 0_level_0,SEXO,REV
PNOME,Unnamed: 1_level_1,Unnamed: 2_level_1
MARIA,1,AIRAM
JOSE,2,ESOJ
ANTONIO,2,OINOTNA
FRANCISCO,2,OCSICNARF
JOAO,2,OAOJ


In [9]:
MAIOR_QTDE_LETRAS = 16 # fixado para o maior arquivo

for i in range(MAIOR_QTDE_LETRAS):
    df['L' + str(i)] = df['REV'].apply(lambda x: ord(x[i]) - 64 if len(x) > i else 0).astype('uint8')

In [10]:
#df.dtypes

In [11]:
df.head(10)

Unnamed: 0_level_0,SEXO,REV,L0,L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,L11,L12,L13,L14,L15
PNOME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
MARIA,1,AIRAM,1,9,18,1,13,0,0,0,0,0,0,0,0,0,0,0
JOSE,2,ESOJ,5,19,15,10,0,0,0,0,0,0,0,0,0,0,0,0
ANTONIO,2,OINOTNA,15,9,14,15,20,14,1,0,0,0,0,0,0,0,0,0
FRANCISCO,2,OCSICNARF,15,3,19,9,3,14,1,18,6,0,0,0,0,0,0,0
JOAO,2,OAOJ,15,1,15,10,0,0,0,0,0,0,0,0,0,0,0,0
CARLOS,2,SOLRAC,19,15,12,18,1,3,0,0,0,0,0,0,0,0,0,0
ANA,1,ANA,1,14,1,0,0,0,0,0,0,0,0,0,0,0,0,0
PAULO,2,OLUAP,15,12,21,1,16,0,0,0,0,0,0,0,0,0,0,0
LUIZ,2,ZIUL,26,9,21,12,0,0,0,0,0,0,0,0,0,0,0,0
MARCOS,2,SOCRAM,19,15,3,18,1,13,0,0,0,0,0,0,0,0,0,0


In [12]:
#df.describe()

In [13]:
from sklearn.decomposition import PCA

X_reduced = PCA(n_components=4).fit_transform(df.drop(['SEXO', 'REV'], axis=1))
X_reduced[:5]

array([[ -7.57864392,   0.67028276, -13.67598892,  -8.97649646],
       [-10.85593962,  10.1500293 ,  -1.43663739,  -3.4644932 ],
       [ -1.95536386,  -6.21631208,   2.13695385,  -2.22564886],
       [  2.76442291,  -9.74515048,  10.45092921,  -7.75604594],
       [-14.17508656,  -4.90518479,   3.05330494,   2.08141312]])

In [14]:
X = pd.DataFrame(X_reduced, index=df.index)
y = df['SEXO'] # apenas a coluna alvo

print('Forma dos dados originais:', X.shape, y.shape)

Forma dos dados originais: (9726, 4) (9726,)


In [15]:
X.head()

Unnamed: 0_level_0,0,1,2,3
PNOME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MARIA,-7.578644,0.670283,-13.675989,-8.976496
JOSE,-10.85594,10.150029,-1.436637,-3.464493
ANTONIO,-1.955364,-6.216312,2.136954,-2.225649
FRANCISCO,2.764423,-9.74515,10.450929,-7.756046
JOAO,-14.175087,-4.905185,3.053305,2.081413


In [16]:
y.head()

PNOME
MARIA        1
JOSE         2
ANTONIO      2
FRANCISCO    2
JOAO         2
Name: SEXO, dtype: uint8

In [17]:
NUMBER_KFOLD_SPLITS = 5 # number of splits in cross-validation
NUMBER_GRID_ITERATIONS = 10 # number of grid iterations to parameters testing
SCORING_METRIC = 'accuracy' # the scoring metric to be used

In [18]:
from datetime import datetime

models = {}

def evaluate_model(name, model, X=X, y=y):

  start = datetime.now()
  kfold = KFold(n_splits=NUMBER_KFOLD_SPLITS, shuffle=True, random_state=42)
  results = cross_val_score(model, X, y, cv=kfold,
                            scoring=SCORING_METRIC, verbose=1, n_jobs=-1)
  end = datetime.now()

  elapsed = int((end - start).total_seconds() * 1000)
  score = results.mean() * 100
  stddev = results.std() * 100

  models[name] = (model, score, stddev, elapsed)
  print(model, '\nCross-Validation Score: %.2f (+/- %.2f) [%5s ms]' % \
        (score, stddev, elapsed))
  return score, stddev, elapsed

In [19]:
# faz o ajuste fino do modelo, calculando os melhores hiperparâmetros
def fine_tune_model(model, params, X=X, y=y):

  print('\nFine Tuning Model:')
  print(model, "\nparams:", params)
  
  kfold = KFold(n_splits=NUMBER_KFOLD_SPLITS, shuffle=True, random_state=42)

  search = RandomizedSearchCV(model, param_distributions=params, 
                              n_iter=NUMBER_GRID_ITERATIONS, 
                              scoring=SCORING_METRIC, cv=kfold, 
                              verbose=1, n_jobs=-1)
  #search = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=kfold, verbose=1)

  search.fit(X, y)
  print('\nBest Score: %.2f %%' % (search.best_score_ * 100))
  print('Best Params:', search.best_params_)
  return search

In [20]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=42, multi_class='auto', max_iter=1000, solver='liblinear', C=0.01) # 85.18
evaluate_model('LR', model)

params = dict(solver=['liblinear','lbfgs'], C=np.logspace(-3,3,7))
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False) 
Cross-Validation Score: 84.68 (+/- 0.70) [ 9442 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.4s finished


In [21]:
# Linear SVM
from sklearn.svm import LinearSVC

model = LinearSVC(random_state=42, max_iter=1000, C=0.001) # 85.27
evaluate_model('LSVM', model)

#from scipy.stats import reciprocal, uniform
#params = dict(C=uniform(1, 10))
params = dict(C=np.logspace(-4,3,8))
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
          verbose=0) 
Cross-Validation Score: 84.71 (+/- 0.67) [ 1911 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.9s finished


In [22]:
# K-Nearest Neighbours (KNN)
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=1) # 87.45
evaluate_model('KNN', model)

params = dict(n_neighbors=[1,3,5,7,9,11])
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform') 
Cross-Validation Score: 79.84 (+/- 0.34) [ 2618 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.6s finished


In [23]:
# Linear Discriminant Analysis (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis(solver='svd') # 84.45
evaluate_model('LDA', model)

params = dict(solver=['svd','lsqr','eigen'])
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001) 
Cross-Validation Score: 84.73 (+/- 0.70) [  739 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.7s finished


In [24]:
# Quadratic Discriminant Analysis (QDA)
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model = QuadraticDiscriminantAnalysis() # 44.99
evaluate_model('QDA', model)

params = dict()
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
                              store_covariance=False, tol=0.0001) 
Cross-Validation Score: 85.19 (+/- 0.85) [  616 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


In [25]:
# Naïve Bayes - Gaussian
from sklearn.naive_bayes import GaussianNB

model = GaussianNB(priors=None, var_smoothing=0.1) # 67.62
evaluate_model('GNB', model)

params = dict(var_smoothing=np.logspace(0,-9,num=10))
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


GaussianNB(priors=None, var_smoothing=0.1) 
Cross-Validation Score: 83.24 (+/- 0.78) [  605 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


In [26]:
# Naïve Bayes - Bernoulli
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB() # 43.37
evaluate_model('BNB', model)

params = dict(var_smoothing=np.logspace(0,-9,num=10))
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) 
Cross-Validation Score: 71.76 (+/- 0.78) [  613 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


In [27]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=42, criterion='entropy', max_depth=17) # 96.37
evaluate_model('DT', model)

params = dict(criterion=['gini','entropy'], max_depth=[3,5,7,11,13,17,19])
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=17,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best') 
Cross-Validation Score: 77.47 (+/- 0.68) [ 3034 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.0s finished


In [28]:
# Ensemble - Random Forest
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42, max_features='auto', n_estimators=500) # 93.86
evaluate_model('RF', model)

params = dict(n_estimators=[10,50,100,500], max_features=['auto','sqrt','log2'])
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False) 
Cross-Validation Score: 84.91 (+/- 0.64) [200435 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.3min finished


In [29]:
# Ensemble - Ada Boost
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(DecisionTreeClassifier(random_state=42, criterion='entropy', max_depth=17), n_estimators=13) # 96.41
evaluate_model('ABDT', model)

params = dict(n_estimators=[1,3,5,7,9,11,13])
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='entropy',
                                                         max_depth=17,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=42,
                           

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   30.6s finished


In [30]:
# Stochastic Gradient Descent (SGD)
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(random_state=42, max_iter=500, tol=0.1) # 77.02
evaluate_model('SGD', model)

params = dict(max_iter=[100, 200, 350, 500, 1000], tol=[0.01, 0.1, 1.0])
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=500, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.1,
              validation_fraction=0.1, verbose=0, warm_start=False) 
Cross-Validation Score: 75.57 (+/- 4.70) [ 1714 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.7s finished


In [31]:
# Perceptron
from sklearn.linear_model import Perceptron

model = Perceptron(random_state=42, max_iter=500, tol=0.001) # 75.58
evaluate_model('PCT', model)

params = dict(max_iter=[100, 200, 350, 500, 750, 1000], tol=[0.1, 0.01, 0.001])
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=500, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=42, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False) 
Cross-Validation Score: 77.80 (+/- 3.43) [ 1189 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.2s finished


In [32]:
# Extreme Gradient Boosting Machine (XGBM)
from xgboost import XGBClassifier

model = XGBClassifier(max_depth=9, min_child_weight=1, gamma=0.1, subsample=0.9, colsample_bytree=0.7, reg_alpha=1e-05) # 97.67
evaluate_model('XGB', model)

params = dict(max_depth=range(3,10,2), min_child_weight=range(1,6,2)) # 97.60
#params = dict(gamma=[i/10.0 for i in range(0,5)]) # 97.68
#params = dict(subsample=[i/10.0 for i in range(6,10)], colsample_bytree=[i/10.0 for i in range(6,10)]) # 97.72
#params = dict(reg_alpha=[1e-5, 1e-2, 0.1, 1, 100]) # 97.74
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.1,
              learning_rate=0.1, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=1e-05, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.9, verbosity=1) 
Cross-Validation Score: 83.68 (+/- 0.54) [48416 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   48.4s finished


In [33]:
# LightGBM
from lightgbm import LGBMClassifier
#!pip install lightgbm

model = LGBMClassifier(num_leaves=73, max_depth=24, learning_rate=0.5, feature_fraction=0.75) # 97.51
evaluate_model('LGBM', model)

#params = dict(num_leaves=range(10,100), max_depth=range(1,30)) # 97.18
#params = dict(learning_rate=[0.01,0.05,0.1,0.5]) # 97.51
#params = dict(feature_fraction=[0.1,0.25,0.5,0.75,1.0]) # 97.62
#fine_tune_model(model, params)

ModuleNotFoundError: No module named 'lightgbm'

In [34]:
# CatBoost
from catboost import CatBoostClassifier
#!pip install catboost

model = CatBoostClassifier(silent=True) # 97.04
#learning_rate=0.1, iterations=1000, depth=5
evaluate_model('CB', model)

params = dict(iterations=[10,50,100,250,500,1000],
              learning_rate=[0.01,0.05,0.1,0.5],
              depth=range(1,11,2)) 
#fine_tune_model(model, params)

ModuleNotFoundError: No module named 'catboost'

In [35]:
names = []
estimators = []
scores = []
stdevs = []
times = []

for key, value in models.items():
  (model, score, stdev, elapsed) = value
  names.append(key)
  estimators.append(model)
  scores.append(score)
  stdevs.append(stdev)
  times.append(elapsed)

In [36]:
results_df = pd.DataFrame({
    'Model': names,
    'Score': scores,
    'Std Dev': stdevs,
    'Time (ms)': times,
    'Estimator': estimators})

results_df.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score,Std Dev,Time (ms),Estimator
4,QDA,85.194282,0.851771,616,"QuadraticDiscriminantAnalysis(priors=None, reg..."
8,RF,84.906475,0.637586,200435,"RandomForestClassifier(bootstrap=True, class_w..."
3,LDA,84.731594,0.696261,739,"LinearDiscriminantAnalysis(n_components=None, ..."
1,LSVM,84.711039,0.672604,1911,"LinearSVC(C=0.001, class_weight=None, dual=Tru..."
0,LR,84.68019,0.702206,9442,"LogisticRegression(C=0.01, class_weight=None, ..."
12,XGB,83.682893,0.541346,48416,"XGBClassifier(base_score=0.5, booster='gbtree'..."
9,ABDT,83.41562,0.392271,30595,"AdaBoostClassifier(algorithm='SAMME.R',\n ..."
5,GNB,83.24076,0.77848,605,"GaussianNB(priors=None, var_smoothing=0.1)"
2,KNN,79.837489,0.337313,2618,"KNeighborsClassifier(algorithm='auto', leaf_si..."
11,PCT,77.801129,3.427223,1189,"Perceptron(alpha=0.0001, class_weight=None, ea..."


In [37]:
model = models['QDA'][0]
model

QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
                              store_covariance=False, tol=0.0001)

In [38]:
model.fit(X, y)

QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
                              store_covariance=False, tol=0.0001)

In [39]:
y_pred = model.predict(X)
y_pred

array([1, 1, 2, ..., 2, 2, 2], dtype=uint8)

In [40]:
dados = pd.DataFrame({'REAL': y, 'PREV': y_pred}, index=X.index)
for col in dados.columns:
    dados[col] = dados[col].map({0: 'X', 1: 'F', 2: 'M'})
dados.head()

Unnamed: 0_level_0,REAL,PREV
PNOME,Unnamed: 1_level_1,Unnamed: 2_level_1
MARIA,F,F
JOSE,M,F
ANTONIO,M,M
FRANCISCO,M,M
JOAO,M,M


In [41]:
from sklearn.metrics import accuracy_score

accuracy_score(y, y_pred)

0.8524573308657207

In [42]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y, y_pred)

array([[  83,  350,  582],
       [  10, 4104,  262],
       [  47,  184, 4104]], dtype=int64)

In [43]:
dados[dados['REAL'] != dados['PREV']].head(20)

Unnamed: 0_level_0,REAL,PREV
PNOME,Unnamed: 1_level_1,Unnamed: 2_level_1
JOSE,M,F
LUIZ,M,X
JORGE,M,F
PEDRO,M,F
ANDRE,M,F
ALEXANDRE,M,F
FELIPE,M,F
GUSTAVO,M,F
RAQUEL,F,M
GUILHERME,M,F
