In [1]:
import pandas as pd
import numpy as np

In [2]:
# importar pacotes usados na seleção do modelo e na medição da precisão
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
#from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

In [3]:
# Read the name/sex table; the first CSV column becomes the index.
df = pd.read_csv('extra/nome_sexo_pf50.csv', index_col=0)
# Keep only the rows whose index label is not missing.
has_name = df.index.notna()
df = df.loc[has_name]
# Sanity check: selecting rows with a missing index label should now display an empty frame.
df[df.index.isna()]

Unnamed: 0_level_0,sexo
nome,Unnamed: 1_level_1


In [4]:
# Upper-case the target column name; rebinding `df` instead of mutating it in place.
df = df.rename(columns={'sexo': 'SEXO'})

In [5]:
# Preview the first ten rows.
df.iloc[:10]

Unnamed: 0_level_0,SEXO
nome,Unnamed: 1_level_1
ABENILDA,F
ADINAILZA,F
ADIRACI,F
ADISON,M
ADLER,M
ADSTON,M
ALDIMAR,F
ALDRINA,F
ALISANDRO,M
ALOIS,M


In [6]:
# Encode the target column as small integers: F -> 1, M -> 2, X -> 0.
sexo_codes = df['SEXO'].map({'F': 1, 'M': 2, 'X': 0})

# `map` yields NaN for every label that is missing from the mapping (this also
# happens if the cell is re-run on the already-encoded column). Fail with an
# explicit message instead of handing NaN to the unsigned-integer cast.
if sexo_codes.isna().any():
    unexpected = sorted(df.loc[sexo_codes.isna(), 'SEXO'].astype(str).unique())
    raise ValueError('Unexpected values in SEXO column: %s' % unexpected)

df['SEXO'] = sexo_codes.astype('uint8')

In [7]:
# Helper column: every name (the index label) spelled backwards, so that
# position 0 of REV is the last letter of the original name.
df['REV'] = [name[::-1] for name in df.index]

In [8]:
# Preview the first five rows, now including the REV column.
df.iloc[:5]

Unnamed: 0_level_0,SEXO,REV
nome,Unnamed: 1_level_1,Unnamed: 2_level_1
ABENILDA,1,ADLINEBA
ADINAILZA,1,AZLIANIDA
ADIRACI,1,ICARIDA
ADISON,2,NOSIDA
ADLER,2,RELDA


In [9]:
# Number of letter positions encoded per name; fixed to fit the largest file.
MAIOR_QTDE_LETRAS = 16

# One uint8 column per position of the reversed name (L0, L1, ...).
# Each letter is stored as ord(letter) - 64, i.e. 'A' -> 1 ... 'Z' -> 26, and
# 0 pads names that are shorter than the position being encoded.
# Positions at or beyond MAIOR_QTDE_LETRAS are not encoded.
for i in range(MAIOR_QTDE_LETRAS):
    coluna = 'L%d' % i
    codigos = df['REV'].apply(lambda rev: 0 if len(rev) <= i else ord(rev[i]) - 64)
    df[coluna] = codigos.astype('uint8')

In [10]:
#df.dtypes

In [11]:
# Preview the first ten rows with the per-position letter columns.
df.iloc[:10]

Unnamed: 0_level_0,SEXO,REV,L0,L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,L11,L12,L13,L14,L15
nome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ABENILDA,1,ADLINEBA,1,4,12,9,14,5,2,1,0,0,0,0,0,0,0,0
ADINAILZA,1,AZLIANIDA,1,26,12,9,1,14,9,4,1,0,0,0,0,0,0,0
ADIRACI,1,ICARIDA,9,3,1,18,9,4,1,0,0,0,0,0,0,0,0,0
ADISON,2,NOSIDA,14,15,19,9,4,1,0,0,0,0,0,0,0,0,0,0
ADLER,2,RELDA,18,5,12,4,1,0,0,0,0,0,0,0,0,0,0,0
ADSTON,2,NOTSDA,14,15,20,19,4,1,0,0,0,0,0,0,0,0,0,0
ALDIMAR,1,RAMIDLA,18,1,13,9,4,12,1,0,0,0,0,0,0,0,0,0
ALDRINA,1,ANIRDLA,1,14,9,18,4,12,1,0,0,0,0,0,0,0,0,0
ALISANDRO,2,ORDNASILA,15,18,4,14,1,19,9,12,1,0,0,0,0,0,0,0
ALOIS,2,SIOLA,19,9,15,12,1,0,0,0,0,0,0,0,0,0,0,0


In [12]:
#df.describe()

In [13]:
# Define the model inputs.

# Features: every column except the target (SEXO) and the reversed-name helper (REV).
X = df.drop(columns=['SEXO', 'REV'])
# Target: the encoded SEXO column only.
y = df['SEXO']

print('Forma dos dados originais:', X.shape, y.shape)

Forma dos dados originais: (74760, 16) (74760,)


In [14]:
# Preview the first five feature rows.
X.iloc[:5]

Unnamed: 0_level_0,L0,L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,L11,L12,L13,L14,L15
nome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ABENILDA,1,4,12,9,14,5,2,1,0,0,0,0,0,0,0,0
ADINAILZA,1,26,12,9,1,14,9,4,1,0,0,0,0,0,0,0
ADIRACI,9,3,1,18,9,4,1,0,0,0,0,0,0,0,0,0
ADISON,14,15,19,9,4,1,0,0,0,0,0,0,0,0,0,0
ADLER,18,5,12,4,1,0,0,0,0,0,0,0,0,0,0,0


In [15]:
# Preview the first five target values.
y.iloc[:5]

nome
ABENILDA     1
ADINAILZA    1
ADIRACI      1
ADISON       2
ADLER        2
Name: SEXO, dtype: uint8

In [16]:
# Number of folds used by the KFold splitter in cross-validation.
NUMBER_KFOLD_SPLITS = 5
# Number of parameter settings sampled by the randomized hyperparameter search.
NUMBER_GRID_ITERATIONS = 10
# Scoring metric handed to the cross-validation and search utilities.
SCORING_METRIC = 'accuracy'

In [17]:
from datetime import datetime

# Registry of evaluated models, keyed by short name. Each value is the tuple
# (estimator, mean score in %, score std dev in %, elapsed milliseconds).
models = {}

def evaluate_model(name, model, X=X, y=y):
    """Cross-validate `model` and record the outcome under `models[name]`.

    Runs a shuffled K-fold cross-validation (NUMBER_KFOLD_SPLITS folds, fixed
    seed 42) scored with SCORING_METRIC, prints a one-line summary and stores
    the result in the module-level `models` registry.

    Args:
        name: key under which the result is stored in `models`.
        model: estimator passed to `cross_val_score`.
        X: feature matrix; defaults to the `X` bound when this cell was run.
        y: target vector; defaults to the `y` bound when this cell was run.

    Returns:
        Tuple `(score, stddev, elapsed)`: mean fold score and its standard
        deviation (both multiplied by 100) and the elapsed wall-clock time in
        whole milliseconds.
    """
    started_at = datetime.now()
    splitter = KFold(n_splits=NUMBER_KFOLD_SPLITS, shuffle=True, random_state=42)
    fold_scores = cross_val_score(
        model, X, y,
        cv=splitter,
        scoring=SCORING_METRIC,
        verbose=1,
        n_jobs=-1,
    )
    finished_at = datetime.now()

    # Wall-clock duration, truncated to whole milliseconds.
    elapsed = int((finished_at - started_at).total_seconds() * 1000)
    # Mean and spread across folds, expressed as percentages.
    score = fold_scores.mean() * 100
    stddev = fold_scores.std() * 100

    models[name] = (model, score, stddev, elapsed)
    summary = '\nCross-Validation Score: %.2f (+/- %.2f) [%5s ms]' % (score, stddev, elapsed)
    print(model, summary)
    return score, stddev, elapsed

In [18]:
# Fine-tune a model by searching for its best hyperparameters.
def fine_tune_model(model, params, X=X, y=y, random_state=42):
    """Run a randomized hyperparameter search for `model` and report the best result.

    Samples NUMBER_GRID_ITERATIONS settings from `params`, evaluates each one
    with a shuffled K-fold cross-validation (NUMBER_KFOLD_SPLITS folds, fixed
    seed 42) scored with SCORING_METRIC, then prints the best score and
    parameters.

    Args:
        model: estimator to tune.
        params: dict mapping parameter names to candidate values or
            distributions, passed as `param_distributions`.
        X: feature matrix; defaults to the `X` bound when this cell was run.
        y: target vector; defaults to the `y` bound when this cell was run.
        random_state: seed for the sampling of parameter settings, so that
            repeated searches try the same candidates.

    Returns:
        The fitted `RandomizedSearchCV` object.
    """
    print('\nFine Tuning Model:')
    print(model, "\nparams:", params)

    kfold = KFold(n_splits=NUMBER_KFOLD_SPLITS, shuffle=True, random_state=42)

    # Seed the search itself: without `random_state` the sampled candidates
    # (and therefore the reported best score) change from run to run.
    search = RandomizedSearchCV(model, param_distributions=params,
                                n_iter=NUMBER_GRID_ITERATIONS,
                                scoring=SCORING_METRIC, cv=kfold,
                                verbose=1, n_jobs=-1,
                                random_state=random_state)

    search.fit(X, y)
    print('\nBest Score: %.2f %%' % (search.best_score_ * 100))
    print('Best Params:', search.best_params_)
    return search

In [19]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# `multi_class` is intentionally not passed: 'auto' is the default, and the
# argument is deprecated in recent scikit-learn releases.
model = LogisticRegression(random_state=42, max_iter=1000, solver='liblinear', C=0.01) # 85.18
evaluate_model('LR', model)

# Candidate solvers and regularization strengths (1e-3 .. 1e3) for the optional search.
params = dict(solver=['liblinear','lbfgs'], C=np.logspace(-3,3,7))
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


LogisticRegression(C=0.01, max_iter=1000, random_state=42, solver='liblinear') 
Cross-Validation Score: 78.98 (+/- 0.30) [ 4650 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.5s finished


In [20]:
# Linear Support Vector Machine
from sklearn.svm import LinearSVC

model = LinearSVC(
    C=0.001,
    max_iter=1000,
    random_state=42,
)  # 85.27
evaluate_model('LSVM', model)

# Alternative search space kept for reference: C sampled from scipy.stats.uniform(1, 10).
# from scipy.stats import reciprocal, uniform
# params = {'C': uniform(1, 10)}
# Candidate regularization strengths (1e-4 .. 1e3) for the optional search.
params = {'C': np.logspace(-4, 3, 8)}
# fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


LinearSVC(C=0.001, random_state=42) 
Cross-Validation Score: 79.31 (+/- 0.32) [ 3924 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.8s finished


In [21]:
# K-Nearest Neighbours (KNN)
from sklearn.neighbors import KNeighborsClassifier

n_neighbors = 1
model = KNeighborsClassifier(n_neighbors=n_neighbors)  # 87.45
evaluate_model('KNN', model)

# Candidate neighbourhood sizes for the optional search.
params = {'n_neighbors': [1, 3, 5, 7, 9, 11]}
# fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


KNeighborsClassifier(n_neighbors=1) 
Cross-Validation Score: 76.49 (+/- 0.34) [15568 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   15.5s finished


In [22]:
# Linear Discriminant Analysis (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda_solver = 'svd'
model = LinearDiscriminantAnalysis(solver=lda_solver)  # 84.45
evaluate_model('LDA', model)

# Candidate solvers for the optional search.
params = {'solver': ['svd', 'lsqr', 'eigen']}
# fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


LinearDiscriminantAnalysis() 
Cross-Validation Score: 78.97 (+/- 0.29) [ 1556 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.4s finished


In [23]:
# Quadratic Discriminant Analysis (QDA), evaluated with its default settings
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model = QuadraticDiscriminantAnalysis()  # 44.99
evaluate_model('QDA', model)

# No hyperparameters are searched for this model.
params = {}
# fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


QuadraticDiscriminantAnalysis() 
Cross-Validation Score: 54.03 (+/- 0.16) [  746 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


In [24]:
# Naive Bayes - Gaussian
from sklearn.naive_bayes import GaussianNB

model = GaussianNB(
    priors=None,
    var_smoothing=0.1,
)  # 67.62
evaluate_model('GNB', model)

# Candidate smoothing values (1e0 down to 1e-9) for the optional search.
params = {'var_smoothing': np.logspace(0, -9, num=10)}
# fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


GaussianNB(var_smoothing=0.1) 
Cross-Validation Score: 78.99 (+/- 0.50) [  651 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.5s finished


In [25]:
# Naive Bayes - Bernoulli
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB() # 43.37
evaluate_model('BNB', model)

# BernoulliNB has no `var_smoothing` parameter (that one belongs to GaussianNB);
# its smoothing parameter is `alpha`, so the search grid must use that name.
params = dict(alpha=np.logspace(0,-9,num=10))
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


BernoulliNB() 
Cross-Validation Score: 54.45 (+/- 0.40) [  698 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


In [26]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=17,
    random_state=42,
)  # 96.37
evaluate_model('DT', model)

# Candidate split criteria and tree depths for the optional search.
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, 11, 13, 17, 19],
}
# fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


DecisionTreeClassifier(criterion='entropy', max_depth=17, random_state=42) 
Cross-Validation Score: 84.22 (+/- 0.31) [ 1790 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.6s finished


In [27]:
# Ensemble - Random Forest
from sklearn.ensemble import RandomForestClassifier

# 'sqrt' replaces the former 'auto' value: for classifiers the two select the
# same number of features, and 'auto' is no longer accepted by recent
# scikit-learn releases.
model = RandomForestClassifier(random_state=42, max_features='sqrt', n_estimators=500) # 93.86
evaluate_model('RF', model)

# Candidate forest sizes and feature-subset rules for the optional search
# ('auto' omitted for the same reason as above).
params = dict(n_estimators=[10,50,100,500], max_features=['sqrt','log2'])
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


RandomForestClassifier(n_estimators=500, random_state=42) 
Cross-Validation Score: 82.63 (+/- 0.31) [164547 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.7min finished


In [28]:
# Ensemble - AdaBoost over decision trees
from sklearn.ensemble import AdaBoostClassifier

# The base tree is passed positionally so the call does not depend on the
# keyword name of the first parameter. The ensemble itself is seeded as well:
# AdaBoost assigns seeds to the base estimators it builds from its own
# `random_state`, so seeding only the tree does not make the run reproducible.
model = AdaBoostClassifier(DecisionTreeClassifier(random_state=42, criterion='entropy', max_depth=17), n_estimators=13, random_state=42) # 96.41
evaluate_model('ABDT', model)

# Candidate ensemble sizes for the optional search.
params = dict(n_estimators=[1,3,5,7,9,11,13])
#fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                         max_depth=17,
                                                         random_state=42),
                   n_estimators=13) 
Cross-Validation Score: 80.85 (+/- 0.34) [17655 ms]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   17.5s finished


In [None]:
# Ensemble - Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(
    n_estimators=250,
    learning_rate=0.1,
    max_depth=35,
    random_state=42,
)  # 96.75
evaluate_model('GB', model)

# Candidate depths, ensemble sizes and learning rates for the optional search.
params = {
    'max_depth': [1, 3, 5, 7, 9],
    'n_estimators': [5, 50, 250, 500],
    'learning_rate': [0.01, 0.1, 1, 10, 100],
}
# fine_tune_model(model, params)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:
# Stochastic Gradient Descent (SGD)
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(
    max_iter=500,
    tol=0.1,
    random_state=42,
)  # 77.02
evaluate_model('SGD', model)

# Candidate iteration limits and stopping tolerances for the optional search.
params = {
    'max_iter': [100, 200, 350, 500, 1000],
    'tol': [0.01, 0.1, 1.0],
}
# fine_tune_model(model, params)

In [None]:
# Perceptron
from sklearn.linear_model import Perceptron

model = Perceptron(
    max_iter=500,
    tol=0.001,
    random_state=42,
)  # 75.58
evaluate_model('PCT', model)

# Candidate iteration limits and stopping tolerances for the optional search.
params = {
    'max_iter': [100, 200, 350, 500, 750, 1000],
    'tol': [0.1, 0.01, 0.001],
}
# fine_tune_model(model, params)

In [None]:
# Multi-Layer Perceptron (MLP)
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(
    hidden_layer_sizes=(50, 100, 50),
    solver='lbfgs',
    alpha=1e-07,
    random_state=42,
)  # 88.67
evaluate_model('MLP', model)

# Candidate regularization strengths, solvers and layer layouts for the optional search.
params = {
    'alpha': np.logspace(-9, -1, 5),
    'solver': ['lbfgs', 'sgd', 'adam'],
    'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
}
# fine_tune_model(model, params)

In [None]:
# Extreme Gradient Boosting Machine (XGBM)
from xgboost import XGBClassifier

# NOTE(review): y is encoded upstream as 0/1/2 (X/F/M); recent XGBoost releases
# may require class labels to be exactly 0..n_classes-1 - confirm that the
# labels actually present in y satisfy that.
model = XGBClassifier(
    max_depth=9,
    min_child_weight=1,
    gamma=0.1,
    subsample=0.9,
    colsample_bytree=0.7,
    reg_alpha=1e-05,
)  # 97.67
evaluate_model('XGB', model)

# Active search space: tree depth and minimum child weight.
params = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}  # 97.60
# Alternative search spaces, kept for reference:
# params = dict(gamma=[i/10.0 for i in range(0,5)]) # 97.68
# params = dict(subsample=[i/10.0 for i in range(6,10)], colsample_bytree=[i/10.0 for i in range(6,10)]) # 97.72
# params = dict(reg_alpha=[1e-5, 1e-2, 0.1, 1, 100]) # 97.74
# fine_tune_model(model, params)

In [None]:
# LightGBM
from lightgbm import LGBMClassifier
# Install hint: !pip install lightgbm

model = LGBMClassifier(
    num_leaves=73,
    max_depth=24,
    learning_rate=0.5,
    feature_fraction=0.75,
)  # 97.51
evaluate_model('LGBM', model)

# Every candidate search space below is commented out, so `params` is not
# reassigned in this cell and still holds whatever an earlier cell set.
# params = dict(num_leaves=range(10,100), max_depth=range(1,30)) # 97.18
# params = dict(learning_rate=[0.01,0.05,0.1,0.5]) # 97.51
# params = dict(feature_fraction=[0.1,0.25,0.5,0.75,1.0]) # 97.62
# fine_tune_model(model, params)

In [None]:
# CatBoost
from catboost import CatBoostClassifier
# Install hint: !pip install catboost

# Library defaults with logging silenced.
# Settings noted for reference: learning_rate=0.1, iterations=1000, depth=5
model = CatBoostClassifier(silent=True)  # 97.04
evaluate_model('CB', model)

# Candidate iteration counts, learning rates and tree depths for the optional search.
params = {
    'iterations': [10, 50, 100, 250, 500, 1000],
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'depth': range(1, 11, 2),
}
# fine_tune_model(model, params)

In [None]:
# Split the `models` registry into parallel lists, one per result field,
# in the registry's insertion order.
names, estimators, scores, stdevs, times = [], [], [], [], []

for key, (model, score, stdev, elapsed) in models.items():
    names.append(key)
    estimators.append(model)
    scores.append(score)
    stdevs.append(stdev)
    times.append(elapsed)

In [None]:
# Summary table of all evaluated models, one row per model.
results_columns = {
    'Model': names,
    'Score': scores,
    'Std Dev': stdevs,
    'Time (ms)': times,
    'Estimator': estimators,
}
results_df = pd.DataFrame(results_columns)

# Display the table ordered from the highest to the lowest score.
results_df.sort_values('Score', ascending=False)

In [None]:
# Take the CatBoost entry from the registry; the estimator is the first element of the tuple.
cb_entry = models['CB']
model = cb_entry[0]
model

In [None]:
# Fit the selected estimator on the complete feature matrix X and target y.
model.fit(X, y)

In [None]:
# Predict labels for the same X that was passed to fit in the previous cell.
y_pred = model.predict(X)
y_pred

In [None]:
# Side-by-side table of actual (REAL) and predicted (PREV) labels, indexed like X.
# np.ravel flattens the predictions first: some estimators (CatBoost on a
# multi-class target, for instance) return an (n, 1) column rather than a flat
# array, and a 2-D array cannot be used as a single DataFrame column.
dados = pd.DataFrame({'REAL': y, 'PREV': np.ravel(y_pred)}, index=X.index)
# Decode the integer class codes back into the original letters.
for col in dados.columns:
    dados[col] = dados[col].map({0: 'X', 1: 'F', 2: 'M'})
dados.head()

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of y_pred against y. These predictions were made on the data the
# model was fitted on, so this is not a held-out estimate.
accuracy_score(y_true=y, y_pred=y_pred)

In [None]:
# confusion_matrix is also imported in the import cell near the top of the notebook.
from sklearn.metrics import confusion_matrix

# Confusion matrix of the actual labels against the predicted labels.
confusion_matrix(y_true=y, y_pred=y_pred)

In [None]:
# Rows where the predicted label differs from the actual one.
dados.loc[dados['REAL'].ne(dados['PREV'])]