In [1]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
# graficos incrustados
%matplotlib inline

# Load the training and test CSV files
def _cargar_csv(ruta, etiqueta):
    """Read ``ruta`` (first row as header) and print its shape after ``etiqueta``."""
    datos = pd.read_csv(ruta, header=0)
    print(etiqueta, datos.shape)
    return datos

BBVA_data = _cargar_csv('train_clientes.csv', "Data Train: ")
BBVA_test = _cargar_csv('test_clientes.csv', "Data Test:  ")

def completar_data(BBVA):
    """Fill the missing values of ``BBVA`` in place.

    Columns whose dtype is ``object`` are treated as categorical and their
    gaps are filled with the column mode; every other column is filled with
    its mean. When the frame has no missing values nothing is changed and
    nothing is printed.

    Parameters
    ----------
    BBVA : pandas.DataFrame
        Frame to complete. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``BBVA`` object. Callers that ignore the return value keep
        working, because the filling happens in place.
    """
    if not BBVA.isnull().any().any():
        return BBVA

    # Split the columns by dtype without assuming that an object column
    # exists: looking the object dtype up in the grouped dtypes raises
    # KeyError on an all-numeric frame.
    ctext = [c for c, tipo in BBVA.dtypes.items() if tipo == np.dtype('object')]
    categoricas = set(ctext)
    cnum = [c for c in BBVA.columns if c not in categoricas]

    # Non-object columns: fill the gaps with the mean
    for c in cnum:
        BBVA[c] = BBVA[c].fillna(BBVA[c].mean())

    # Object (categorical) columns: fill the gaps with the mode
    for c in ctext:
        modas = BBVA[c].mode()
        if modas.empty:
            # The column is entirely null, so there is no mode to fill with;
            # leave it as is instead of failing on modas[0].
            continue
        BBVA[c] = BBVA[c].fillna(modas.iloc[0])

    print("Clean Data Test: ", BBVA.shape, BBVA.isnull().any().any())
    return BBVA

# Fill the gaps of both frames in place, training frame first.
for marco in (BBVA_data, BBVA_test):
    completar_data(marco)

def factorizacion_data(datus):
    """Replace every ``object`` column of ``datus`` with integer codes, in place.

    Each object column is passed through ``pd.factorize`` and overwritten
    with the resulting codes. The type and shape of the frame are printed
    before returning.

    Parameters
    ----------
    datus : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``datus`` object.

    Notes
    -----
    ``pd.factorize`` numbers the labels in order of first appearance within
    the column it receives, so codes produced by separate calls on different
    frames are not comparable with each other.
    """
    # Disabled experiment, kept for reference: bucket EDAD into 10-year
    # ranges and then factorize it.
    # EDAD2 = pd.cut(datus['EDAD'], range(0, 150, 10))
    # datus['EDAD'] = EDAD2
    # datus['EDAD'], _ = pd.factorize(datus['EDAD'])

    # Collect the object columns without assuming that one exists: looking
    # the object dtype up in the grouped dtypes raises KeyError on an
    # all-numeric frame.
    ctext = [c for c, tipo in datus.dtypes.items() if tipo == np.dtype('object')]
    for c in ctext:
        datus[c], _ = pd.factorize(datus[c])
    print(type(datus), datus.shape)
    return datus

# Encode the categorical columns of both frames with ONE mapping, learned from
# the training frame. pd.factorize numbers labels by order of first
# appearance, so factorizing train and test separately would give the same
# label different codes in each frame.
categorias_train = {
    c: pd.factorize(BBVA_data[c])[1]
    for c, tipo in BBVA_data.dtypes.items()
    if tipo == np.dtype('object') and c in BBVA_test.columns
}
BBVA_data = factorizacion_data(BBVA_data)

for c, categorias in categorias_train.items():
    # Labels that never occur in the training column are encoded as -1.
    BBVA_test[c] = categorias.get_indexer(BBVA_test[c])
for c, tipo in BBVA_test.dtypes.items():
    if tipo == np.dtype('object'):
        # Text column with no training mapping: fall back to codes of its own.
        BBVA_test[c], _ = pd.factorize(BBVA_test[c])
print(type(BBVA_test), BBVA_test.shape)

print(BBVA_data.isnull().any().any())
print(BBVA_test.isnull().any().any())

# Feature frames: drop ID_CORRELATIVO from both, plus ATTRITION (which becomes
# y_train) from the training frame.
X_train = BBVA_data.drop(['ID_CORRELATIVO', 'ATTRITION'], axis='columns')
X_predic = BBVA_test.drop(['ID_CORRELATIVO'], axis='columns')
y_train = BBVA_data['ATTRITION'].values

# Report the type and shape of each piece.
for pieza in (X_train, X_predic, y_train):
    print(type(pieza), pieza.shape)

Data Train:  (70000, 53)
Data Test:   (30000, 52)
Clean Data Test:  (70000, 53) False
Clean Data Test:  (30000, 52) False
<class 'pandas.core.frame.DataFrame'> (70000, 53)
<class 'pandas.core.frame.DataFrame'> (30000, 52)
False
False
<class 'pandas.core.frame.DataFrame'> (70000, 51)
<class 'pandas.core.frame.DataFrame'> (30000, 51)
<class 'numpy.ndarray'> (70000,)


In [2]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Cast to float once; astype returns new frames and both scalers work on a
# copy (copy=True), so the same float frames can feed both of them.
X_train_float = X_train.astype(float)
X_predic_float = X_predic.astype(float)

# Standardization (centered, unit variance), fitted on the training features
# and then applied to the prediction features.
sc_X = StandardScaler(copy=True, with_mean=True, with_std=True)
xx_train_std = sc_X.fit_transform(X_train_float)
xx_predic_std = sc_X.transform(X_predic_float)

# Min-max scaling, fitted and applied the same way.
sc_MM = MinMaxScaler()
xx_train_mm = sc_MM.fit_transform(X_train_float)
xx_predic_mm = sc_MM.transform(X_predic_float)

# Keep the ID_CORRELATIVO column of the test frame aside.
idd = BBVA_test['ID_CORRELATIVO']
idd.shape

(30000,)

In [5]:
from sklearn.model_selection import cross_val_score

# Hyper-parameter grid for the random forest
estimators = (50, 100, 250, 500)   # n_estimators
splits = (2, 10, 25)               # min_samples_split
features = (0.1, 0.999)            # max_features (fraction of the features)

# (header, scaled training matrix, separator printed after each configuration)
escalados = (
    ("---> DATA CON STD", xx_train_std, "////////////////////////////////////////"),
    ("---> DATA CON MINMAX", xx_train_mm, "#########################################"),
)

# NOTE(review): decision-tree splits depend only on the ordering of the feature
# values, so both scalings are expected to score (nearly) identically here;
# consider running this grid on a single copy of the data.
for indice, (titulo, X_escalado, separador) in enumerate(escalados):
    if indice > 0:
        # Visual gap between the two result sections
        print("---")
        print("---")
    print(titulo)
    for est in estimators:
        for sp in splits:
            for ff in features:
                clf = RandomForestClassifier(n_estimators=est, n_jobs=2, min_samples_split=sp,
                                             max_features=ff, random_state=2)
                scores = cross_val_score(clf, X_escalado, y_train, cv=5)
                print("arboles: '{}', sample_split '{}' , featute {}".format(est, sp, ff), scores)
                # Four decimals in both sections: with two, every configuration
                # printed as "0.87 (+/- 0.00)" and could not be told apart.
                print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))
                print(separador)

---> DATA CON STD
arboles: '50', sample_split '2' , featute 0.1 [ 0.8749375   0.87286622  0.87257143  0.8735624   0.86984785]
Accuracy: 0.87 (+/- 0.00)
////////////////////////////////////////
arboles: '50', sample_split '2' , featute 0.999 [ 0.87200914  0.87358046  0.87135714  0.87384813  0.87106222]
Accuracy: 0.87 (+/- 0.00)
////////////////////////////////////////
arboles: '50', sample_split '10' , featute 0.1 [ 0.87536605  0.87608028  0.87528571  0.87656261  0.87241946]
Accuracy: 0.88 (+/- 0.00)
////////////////////////////////////////
arboles: '50', sample_split '10' , featute 0.999 [ 0.87550889  0.87850868  0.87457143  0.87784842  0.87541967]
Accuracy: 0.88 (+/- 0.00)
////////////////////////////////////////
arboles: '50', sample_split '25' , featute 0.1 [ 0.87543747  0.87515177  0.87507143  0.87741982  0.87320523]
Accuracy: 0.88 (+/- 0.00)
////////////////////////////////////////
arboles: '50', sample_split '25' , featute 0.999 [ 0.87800871  0.87965145  0.87735714  0.87956283  0

In [4]:
estimators = (50, 100, 200, 500)   # units in each of the two hidden layers
splits = ['adam', 'sgd']           # in this cell `splits` holds the solver names

# The same grid is cross-validated on each scaled copy of the training features.
for titulo, X_escalado in (("---> DATA CON STD", xx_train_std),
                           ("---> DATA CON MINMAX", xx_train_mm)):
    print(titulo)
    for est in estimators:
        for sp in splits:
            red = MLPClassifier(hidden_layer_sizes=(est, est), solver=sp,
                                alpha=0.0000000001, max_iter=1000000000,
                                random_state=2)
            scores = cross_val_score(red, X_escalado, y_train, cv=5)
            print("nodos: '{}', solver '{}' ".format(est, sp), scores)
            print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))
            print("////////////////////////////////////////")

---> DATA CON STD
nodos: '10', solver 'adam'  [ 0.85651025  0.8645811   0.86135714  0.85841846  0.86056147]
Accuracy: 0.8603 (+/- 0.0055)
////////////////////////////////////////
nodos: '10', solver 'sgd'  [ 0.85372473  0.85365331  0.85428571  0.85327523  0.85077506]
Accuracy: 0.8531 (+/- 0.0025)
////////////////////////////////////////
nodos: '20', solver 'adam'  [ 0.86443825  0.86743804  0.86728571  0.86734767  0.86541896]
Accuracy: 0.8664 (+/- 0.0025)
////////////////////////////////////////
nodos: '20', solver 'sgd'  [ 0.85272481  0.86300979  0.85742857  0.85648975  0.85227516]
Accuracy: 0.8564 (+/- 0.0078)
////////////////////////////////////////
nodos: '50', solver 'adam'  [ 0.85929576  0.86065281  0.85871429  0.86384742  0.86170441]
Accuracy: 0.8608 (+/- 0.0037)
////////////////////////////////////////
nodos: '50', solver 'sgd'  [ 0.85929576  0.86429541  0.86121429  0.86070434  0.85620401]
Accuracy: 0.8603 (+/- 0.0053)
////////////////////////////////////////
nodos: '100', solve