# Carregar Dados

In [1]:
# Mount Google Drive inside the Colab runtime so that files stored there
# can be read through the /content/gdrive path.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
import pandas as pd

In [3]:
# Read the dataset CSV from the mounted Google Drive into a DataFrame.
df = pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/gender_classification_v7.csv")

In [4]:
# Preview the first 10 rows of the loaded data.
df.head(10)

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female
5,1,13.0,6.8,1,1,1,1,Male
6,1,15.3,6.2,1,1,1,0,Male
7,0,13.0,5.2,0,0,0,0,Female
8,1,11.9,5.4,1,0,1,1,Female
9,1,12.1,5.4,0,0,0,0,Female


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(5001, 8)

# Mapeamento

In [6]:
# Encode the target labels as integers: Male -> 0, Female -> 1.
gender_map = {'Male': 0, 'Female': 1}

# Apply the encoding. Values that are not keys of the map (e.g. the 0/1 codes
# left by a previous run of this cell) are passed through unchanged, so the
# cell can be re-run safely. A plain `.map(gender_map)` would turn the
# already-encoded values into NaN on a second execution.
df['gender'] = df['gender'].map(lambda rotulo: gender_map.get(rotulo, rotulo))

# Fail fast if anything other than the known codes is left in the target.
assert df['gender'].isin(list(gender_map.values())).all(), "unexpected label in 'gender'"

In [7]:
# Check the result of the mapping.
df.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,0
1,0,14.0,5.4,0,0,1,0,1
2,0,11.8,6.3,1,1,1,1,0
3,0,14.4,6.1,0,1,1,1,0
4,1,13.5,5.9,0,0,0,0,1


In [8]:
# Check how the target classes are distributed.
# `gender` holds the integer codes 0/1, so `== True` matches the rows coded 1
# and `== False` the rows coded 0.
num_true = int((df['gender'] == True).sum())
num_false = int((df['gender'] == False).sum())
total_casos = num_true + num_false

print("Número de casos verdadeiros: {0} ({1:2.2f}%) ".format(num_true, (num_true / total_casos) * 100))
print("Número de casos falsos: {0} ({1:2.2f}%) ".format(num_false, (num_false / total_casos) * 100))

Número de casos verdadeiros: 2501 (50.01%) 
Número de casos falsos: 2500 (49.99%) 


## Splitting dos Dados

In [9]:
#scikit_learn
from sklearn.model_selection import train_test_split

In [10]:
# Names of the predictor (input) columns used to build the feature matrix.
atributos = [
    'long_hair',
    'forehead_width_cm',
    'forehead_height_cm',
    'nose_wide',
    'nose_long',
    'lips_thin',
    'distance_nose_to_lip_long',
]

In [11]:
# Name of the column to be predicted (output Y), kept in a list.
atrib_prev = ['gender']

In [12]:
# Build the NumPy arrays used for modelling: X holds the predictor columns,
# Y the target. Because `atrib_prev` is a list, Y is a 2-D column, not a flat vector.
X = df[atributos].to_numpy()
Y = df[atrib_prev].to_numpy()

In [13]:
# Inspect the first row of the feature matrix.
X[0]

array([ 1. , 11.8,  6.1,  1. ,  0. ,  1. ,  1. ])

In [14]:
# Inspect the target array.
Y

array([[0],
       [1],
       [0],
       ...,
       [1],
       [1],
       [0]])

In [15]:
# Fraction of the rows reserved for the test set.
split_test_size = 0.30

In [16]:
# Split features and target into training and test partitions.
# The fixed random_state makes the shuffle reproducible between runs.
X_treino, X_teste, Y_treino, Y_teste = train_test_split(
    X,
    Y,
    test_size=split_test_size,
    random_state=42,
)

In [17]:
# Sanity check: percentage of all rows that landed in each partition.
total_linhas = len(df)
print("{0:0.2f}% nos dados de treino".format(len(X_treino) / total_linhas * 100))
print("{0:0.2f}% nos dados de teste".format(len(X_teste) / total_linhas * 100))

69.99% nos dados de treino
30.01% nos dados de teste


# Limpar Dados

In [18]:
# True if any cell in the whole frame is missing (NaN/None).
df.isna().to_numpy().any()

False

In [19]:
# Count the missing values in each predictor column (one count per line).
# NOTE: an element-wise comparison such as `df[col] == None` is always False
# in pandas, so it reports 0 even when NaNs are present; `isna()` is the
# correct way to detect missing values.
for coluna in atributos:
    print(df[coluna].isna().sum())

0
0
0
0
0
0
0


In [20]:
from sklearn.impute import SimpleImputer

In [21]:
# Create the imputer: missing values are replaced by the column median
# (despite its name, `preenche_0` fills with the median, not with zero).
preenche_0 = SimpleImputer(strategy='median')

# Learn the medians from the training data only, then reuse those same
# medians on the test data. Calling fit_transform on the test set would
# re-fit the imputer on it, leaking test statistics into preprocessing and
# filling train and test with different values.
X_treino = preenche_0.fit_transform(X_treino)
X_teste = preenche_0.transform(X_teste)

In [22]:
# Inspect the training features after imputation.
X_treino

array([[ 1. , 15.1,  6.7, ...,  1. ,  1. ,  1. ],
       [ 1. , 12.6,  5.1, ...,  0. ,  0. ,  0. ],
       [ 1. , 11.9,  5.9, ...,  1. ,  1. ,  1. ],
       ...,
       [ 1. , 12. ,  5.9, ...,  0. ,  0. ,  0. ],
       [ 1. , 12.8,  5.4, ...,  0. ,  0. ,  0. ],
       [ 1. , 11.4,  5.5, ...,  0. ,  0. ,  0. ]])

In [23]:
# Inspect the test features after imputation.
X_teste

array([[ 1. , 13.2,  5.7, ...,  1. ,  1. ,  1. ],
       [ 1. , 13.7,  6. , ...,  0. ,  0. ,  0. ],
       [ 0. , 12.9,  5.3, ...,  0. ,  0. ,  1. ],
       ...,
       [ 0. , 12.3,  6.6, ...,  1. ,  0. ,  1. ],
       [ 1. , 12.1,  5.7, ...,  1. ,  1. ,  1. ],
       [ 1. , 14.7,  6.7, ...,  1. ,  0. ,  0. ]])

# Utilizando o Random Forest

In [29]:
# importando o metrics
from sklearn import metrics

In [24]:
# importando o Forest
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Build the Random Forest classifier with a fixed seed and train it.
# ravel() flattens the single-column target into a 1-D vector for fit().
# fit() returns the estimator itself, so construction and training can be chained.
modelo_forest = RandomForestClassifier(random_state=42).fit(X_treino, Y_treino.ravel())
modelo_forest

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [27]:
# Predictions of the Random Forest on the training data.
forest_predict_train = modelo_forest.predict(X_treino)
forest_predict_train

array([0, 1, 0, ..., 1, 1, 1])

In [30]:
# Accuracy of the Random Forest on the training data.
print(
    'Exatidão (Acurácia) : {0:.4f}'.format(
        metrics.accuracy_score(Y_treino, forest_predict_train)
    )
)

Exatidão (Acurácia) : 0.9991


In [31]:
# Predictions of the Random Forest on the test data.
forest_predict_test = modelo_forest.predict(X_teste)
forest_predict_test

array([0, 1, 1, ..., 0, 0, 0])

In [32]:
# Accuracy of the Random Forest on the test data.
print(
    'Exatidão (Acurácia) : {0:.4f}'.format(
        metrics.accuracy_score(Y_teste, forest_predict_test)
    )
)

Exatidão (Acurácia) : 0.9627


## Utilizando o GaussianNB

In [33]:
# importando o gaussian
from sklearn.naive_bayes import  GaussianNB

In [34]:
# Build the Gaussian Naive Bayes classifier and train it.
# ravel() flattens the single-column target into a 1-D vector for fit().
# fit() returns the estimator itself, so construction and training can be chained.
modelo_gaussian = GaussianNB().fit(X_treino, Y_treino.ravel())
modelo_gaussian

GaussianNB(priors=None, var_smoothing=1e-09)

In [35]:
# Predictions of the GaussianNB model on the training data.
gaussian_predict_train = modelo_gaussian.predict(X_treino)
gaussian_predict_train

array([0, 1, 0, ..., 1, 1, 1])

In [36]:
# Accuracy of the GaussianNB model on the training data.
print(
    'Exatidão (Acurácia) : {0:.4f}'.format(
        metrics.accuracy_score(Y_treino, gaussian_predict_train)
    )
)

Exatidão (Acurácia) : 0.9706


In [38]:
# Predictions of the GaussianNB model on the test data.
gaussian_predict_test = modelo_gaussian.predict(X_teste)
gaussian_predict_test

array([0, 1, 1, ..., 0, 0, 0])

In [39]:
# Accuracy of the GaussianNB model on the test data.
print(
    'Exatidão (Acurácia) : {0:.4f}'.format(
        metrics.accuracy_score(Y_teste, gaussian_predict_test)
    )
)

Exatidão (Acurácia) : 0.9654


## Utilizando Support Vector Machines (SVM)

In [40]:
# importando o svm
from sklearn import svm

In [41]:
# Build the SVM classifier with its default settings and train it.
modelo_svm = svm.SVC()
# ravel() flattens the single-column target into the 1-D vector that fit()
# expects, matching how the other models in this notebook are trained.
# Passing the column vector directly makes scikit-learn emit a
# DataConversionWarning.
modelo_svm.fit(X_treino, Y_treino.ravel())

  y = column_or_1d(y, warn=True)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [42]:
# Predictions of the SVM on the training data.
svm_treino_predict = modelo_svm.predict(X_treino)
svm_treino_predict

array([0, 1, 0, ..., 1, 1, 1])

In [43]:
# Accuracy of the SVM on the training data.
print(
    'Exatidão (Acurácia) : {0:.4f}'.format(
        metrics.accuracy_score(Y_treino, svm_treino_predict)
    )
)

Exatidão (Acurácia) : 0.9709


In [44]:
# Predictions of the SVM on the test data.
svm_test_predict = modelo_svm.predict(X_teste)
svm_test_predict

array([0, 1, 1, ..., 0, 0, 0])

In [45]:
# Accuracy of the SVM on the test data.
print(
    'Exatidão (Acurácia) : {0:.4f}'.format(
        metrics.accuracy_score(Y_teste, svm_test_predict)
    )
)

Exatidão (Acurácia) : 0.9680


# Comparação

## Random Forest


*   Treino: 0.9991
*   Teste: 0.9627


## GaussianNB


*   Treino: 0.9706
*   Teste: 0.9654

## Support Vector Machines 


*   Treino: 0.9709
*   Teste: 0.9680





# Conclusão

## O mais acurado nos treinos: Random Forest
## O mais acurado nos testes: Support Vector Machines

# Salvando o Modelo

In [46]:
import pickle

In [47]:
# Persist the trained SVM model to disk.
file_name = 'modelo_treinado_svm.sav'
# The context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open() inside the call leaves the handle open.
with open(file_name, 'wb') as arquivo_modelo:
    pickle.dump(modelo_svm, arquivo_modelo)

In [51]:
# Load the saved model back from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files from a
# trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(file_name, 'rb') as arquivo_modelo:
    carrega_modelo = pickle.load(arquivo_modelo)

In [52]:
# Inspect the test features that will be fed to the reloaded model.
X_teste

array([[ 1. , 13.2,  5.7, ...,  1. ,  1. ,  1. ],
       [ 1. , 13.7,  6. , ...,  0. ,  0. ,  0. ],
       [ 0. , 12.9,  5.3, ...,  0. ,  0. ,  1. ],
       ...,
       [ 0. , 12.3,  6.6, ...,  1. ,  0. ,  1. ],
       [ 1. , 12.1,  5.7, ...,  1. ,  1. ,  1. ],
       [ 1. , 14.7,  6.7, ...,  1. ,  0. ,  0. ]])

In [55]:
# Smoke-test the reloaded model on a single test row.
# The slice 2:3 selects row 2 while keeping the array 2-D (one row, all
# feature columns), which is the input shape predict() requires.
nova_pessoa = carrega_modelo.predict(X_teste[2:3])
nova_pessoa

array([1])