In [1]:
from sklearn.neural_network import MLPClassifier
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
# graficos incrustados
%matplotlib inline

# Load the train and test CSV files. header=0 takes the column names from the
# first row of each file. The shape of each frame is printed as a sanity check.
BBVA_data = pd.read_csv('train_clientes.csv', header=0)
print("Data Train: ", BBVA_data.shape)
BBVA_test = pd.read_csv('test_clientes.csv', header=0)
print("Data Test:  ", BBVA_test.shape)

def completar_data(BBVA):
    """Fill the missing values of ``BBVA`` in place.

    Text columns (object or string dtype) receive their most frequent value;
    every other column receives its mean. When the frame contains no nulls
    nothing is changed and nothing is printed.

    Parameters
    ----------
    BBVA : pandas.DataFrame
        Frame to impute. Its columns are overwritten, so the caller's object
        is modified.

    Returns
    -------
    None
    """
    if not BBVA.isnull().any().any():
        return

    # Split the columns by inspecting each dtype directly. Looking the object
    # dtype up in a groupby-by-dtype mapping raises KeyError as soon as a
    # frame has no text column at all.
    ctext = [
        c for c, d in zip(BBVA.columns, BBVA.dtypes)
        if pd.api.types.is_object_dtype(d) or pd.api.types.is_string_dtype(d)
    ]
    texto = set(ctext)
    cnum = [c for c in BBVA.columns if c not in texto]

    # Non-text columns: replace nulls with the column mean.
    for c in cnum:
        BBVA[c] = BBVA[c].fillna(BBVA[c].mean())

    # Text columns: replace nulls with the mode.
    for c in ctext:
        modas = BBVA[c].mode()
        if modas.empty:
            # Column is entirely null: there is no mode to impute with, so it
            # is left as is (the flag printed below will then read True).
            continue
        BBVA[c] = BBVA[c].fillna(modas.iloc[0])

    # NOTE(review): the label says "Test" although this also runs on the
    # training frame; kept unchanged so the notebook output stays the same.
    print("Clean Data Test: ", BBVA.shape, BBVA.isnull().any().any())

# Impute missing values in both frames. completar_data assigns the filled
# columns back onto the frame it receives, so no reassignment is needed here.
completar_data(BBVA_data)
completar_data(BBVA_test)

def factorizacion_data(datus):
    """Replace ``EDAD`` and every text column of ``datus`` with integer codes.

    The encoding does not depend on row order, so two frames encoded by
    separate calls (e.g. train and test) get the same integer for the same
    value:

    * ``EDAD`` is bucketed into 10-year bins ``(0, 10], (10, 20], ...,
      (130, 140]`` and replaced by the position of its bin. Ages outside
      every bin, and missing ages, become ``-1``.
    * Text columns (object or string dtype) are factorized with
      ``sort=True``, i.e. codes follow the sorted distinct values rather than
      the order of first appearance. Two frames therefore agree whenever they
      contain the same set of distinct values in a column; a value present in
      only one of them still shifts the codes after it.

    Parameters
    ----------
    datus : pandas.DataFrame
        Frame with an ``EDAD`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``datus`` object, after encoding.
    """
    # Bucket ages into 10-year ranges.
    edad_bins = pd.cut(datus['EDAD'], range(0, 150, 10))

    # Use the bin position as the code. Factorizing the bins would number them
    # by first appearance, which differs from one frame to the next.
    datus['EDAD'] = edad_bins.cat.codes.astype('int64')

    # Encode the remaining categorical (text) columns. Detecting them from the
    # dtypes directly avoids a KeyError on frames without any text column.
    ctext = [
        c for c, d in zip(datus.columns, datus.dtypes)
        if pd.api.types.is_object_dtype(d) or pd.api.types.is_string_dtype(d)
    ]
    for c in ctext:
        codigos, _ = pd.factorize(datus[c], sort=True)
        datus[c] = codigos

    print(type(datus), datus.shape)
    return datus

# Encode both frames, then confirm that neither one still holds a null.
BBVA_data = factorizacion_data(BBVA_data)
BBVA_test = factorizacion_data(BBVA_test)
print(BBVA_data.isna().any().any())
print(BBVA_test.isna().any().any())

# Training features: every column except the identifier and the target.
X_train = BBVA_data.drop(columns=['ID_CORRELATIVO', 'ATTRITION']).values
print(type(X_train), X_train.shape)

# Prediction features: the test frame without its identifier column.
X_predic = BBVA_test.drop(columns=['ID_CORRELATIVO']).values
print(type(X_predic), X_predic.shape)

# Training target.
y_train = BBVA_data['ATTRITION'].values
print(type(y_train), y_train.shape)

Data Train:  (70000, 53)
Data Test:   (30000, 52)
Clean Data Test:  (70000, 53) False
Clean Data Test:  (30000, 52) False
<class 'pandas.core.frame.DataFrame'> (70000, 53)
<class 'pandas.core.frame.DataFrame'> (30000, 52)
False
False
<class 'numpy.ndarray'> (70000, 51)
<class 'numpy.ndarray'> (30000, 51)
<class 'numpy.ndarray'> (70000,)


In [2]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Standardised copy of the features. The scaler is fitted on the training
# matrix only and then reused, unchanged, on the prediction matrix.
sc_X = StandardScaler()
xx_train_std = sc_X.fit_transform(X_train.astype(float))
xx_predic_std = sc_X.transform(X_predic.astype(float))

# Min-max scaled copy of the same features, fitted the same way.
sc_MM = MinMaxScaler()
xx_train_mm = sc_MM.fit_transform(X_train.astype(float))
xx_predic_mm = sc_MM.transform(X_predic.astype(float))

In [3]:
# Identifier column of the test frame; generate_csv pairs these ids with the
# predicted probabilities when it writes a results file.
idd = BBVA_test['ID_CORRELATIVO']
# Last expression of the cell: shows the number of ids as the cell output.
idd.shape

(30000,)

In [4]:
# Prefix prepended to the name of every results file written by generate_csv.
initial = 'svm_imputacion_factorizacion_allVar_'
import csv
def generate_csv(clf, predic, name, ids=None):
    """Write the predicted probabilities of ``clf`` to ``<initial><name>.csv``.

    The file has a header row ``ID_CORRELATIVO,ATTRITION`` followed by one row
    per sample: its identifier and the second column of
    ``clf.predict_proba(predic)``, rounded to 5 decimals.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict_proba``.
    predic : array-like
        Feature matrix handed to ``clf.predict_proba``.
    name : str
        Suffix of the output file name (the module-level ``initial`` prefix
        and the ``.csv`` extension are added here).
    ids : iterable, optional
        Identifiers, one per row of ``predic``. When omitted, the notebook
        global ``idd`` is used, as before.

    Raises
    ------
    ValueError
        If the number of identifiers differs from the number of predictions.
        Zipping them anyway would silently drop rows.
    """
    if ids is None:
        ids = idd  # notebook-level global defined in an earlier cell
    ids = list(ids)

    probas = clf.predict_proba(predic)
    # Keep only the second probability column, rounded to 5 decimals.
    scores = [float("{0:.5f}".format(fila[1])) for fila in probas]

    if len(ids) != len(scores):
        raise ValueError(
            "got {} ids for {} predictions".format(len(ids), len(scores))
        )

    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows.
    with open("{}.csv".format(initial + name), 'w', newline='') as resultFile:
        wr = csv.writer(resultFile, dialect='excel')
        wr.writerow(('ID_CORRELATIVO', 'ATTRITION'))
        wr.writerows(zip(ids, scores))

In [None]:
#////////////////////////////////////// SVM
from sklearn import svm


def _svc_rbf(C, gamma):
    """Return an unfitted RBF-kernel SVC; only C and gamma vary between runs."""
    return svm.SVC(
        kernel='rbf',
        C=C,
        gamma=gamma,
        probability=True,  # generate_csv calls predict_proba on the model
        tol=1e-7,
        cache_size=500,
    )


# NOTE(review): no random_state is passed, so the probability estimates may
# not be identical from one run to the next - confirm whether that matters.

# --- Standardised features ---
clf_rbf_1 = _svc_rbf(C=10, gamma=1)
clf_rbf_1.fit(xx_train_std, y_train)
generate_csv(clf_rbf_1, xx_predic_std, 'STD2_SVCrbf_c10_g1')
print("STD2///////////////////////////////////////////////SVCrbf_c10_g1")

clf_rbf_1 = _svc_rbf(C=10, gamma=10)
clf_rbf_1.fit(xx_train_std, y_train)
generate_csv(clf_rbf_1, xx_predic_std, 'STD2_SVCrbf_c10_g10')
print("STD2///////////////////////////////////////////////SVCrbf_c10_g10")

# --- Min-max scaled features ---
clf_rbf_2 = _svc_rbf(C=1, gamma=10)
clf_rbf_2.fit(xx_train_mm, y_train)
generate_csv(clf_rbf_2, xx_predic_mm, 'MINMAX2_SVCrbf_c1_g10')
print("MINMAX2///////////////////////////////////////////////SVCrbf_c1_g10")

clf_rbf_2 = _svc_rbf(C=10, gamma=1)
clf_rbf_2.fit(xx_train_mm, y_train)
generate_csv(clf_rbf_2, xx_predic_mm, 'MINMAX2_SVCrbf_c10_g1')
print("MINMAX2///////////////////////////////////////////////SVCrbf_c10_g1")

clf_rbf_2 = _svc_rbf(C=10, gamma=10)
clf_rbf_2.fit(xx_train_mm, y_train)
generate_csv(clf_rbf_2, xx_predic_mm, 'MINMAX2_SVCrbf_c10_g10')
print("MINMAX2///////////////////////////////////////////////SVCrbf_c10_g10")

# --- Standardised features, remaining configuration ---
clf_rbf_1 = _svc_rbf(C=1, gamma=10)
clf_rbf_1.fit(xx_train_std, y_train)
generate_csv(clf_rbf_1, xx_predic_std, 'STD2_SVCrbf_c1_g10')
print("STD2///////////////////////////////////////////////SVCrbf_c1_g10")

STD2///////////////////////////////////////////////SVCrbf_c10_g1
STD2///////////////////////////////////////////////SVCrbf_c10_g10
MINMAX2///////////////////////////////////////////////SVCrbf_c1_g10
MINMAX2///////////////////////////////////////////////SVCrbf_c10_g1
MINMAX2///////////////////////////////////////////////SVCrbf_c10_g10
