In [1]:
#Importaciones basicas
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
# graficos incrustados
%matplotlib inline

#Carga de los CSV
BBVA_data = pd.read_csv('train_clientes.csv', header=0)
print("Data Train: ", BBVA_data.shape)
BBVA_test = pd.read_csv('test_clientes.csv', header=0)
print("Data Test:  ", BBVA_test.shape)

Data Train:  (70000, 53)
Data Test:   (30000, 52)


In [2]:
def completar_data(BBVA):
    if BBVA.isnull().any().any():
        tipos = BBVA.columns.to_series().groupby(BBVA.dtypes).groups
        ctext = tipos[np.dtype('object')]
        columnas = BBVA.columns  # lista de todas las columnas
        cnum = list(set(columnas) - set(ctext))
        #COmpletando los valores nulos de colummnas numericas con la media
        for c in cnum:
            mean = BBVA[c].mean()
            BBVA[c] = BBVA[c].fillna(mean)
        # Completando valores nulos de columnas categóricas con la moda
        for c in ctext:
            mode = BBVA[c].mode()[0]
            BBVA[c] = BBVA[c].fillna(mode)
        print("Clean Data Test: ", BBVA.shape, BBVA.isnull().any().any())

completar_data(BBVA_data)
completar_data(BBVA_test)

Clean Data Test:  (70000, 53) False
Clean Data Test:  (30000, 52) False


In [3]:
def factorizacion_data(datus):
    #Pasando los datos de edad a rango de 10 años
    EDAD2 = pd.cut(datus['EDAD'], range(0, 150, 10))
    datus['EDAD'] = EDAD2
    
    #Factorizando la columna EDAD
    datus['EDAD'], _ = pd.factorize(datus['EDAD'])
    
    #Factorizar las demas columnas de datos categoricos
    tipos = datus.columns.to_series().groupby(datus.dtypes).groups
    ctext = tipos[np.dtype('object')]
    for c in ctext:
        datus[c], _ = pd.factorize(datus[c])
    print(type(datus), datus.shape)
    return datus

BBVA_data = factorizacion_data(BBVA_data)
BBVA_test = factorizacion_data(BBVA_test)

<class 'pandas.core.frame.DataFrame'> (70000, 53)
<class 'pandas.core.frame.DataFrame'> (30000, 52)


In [4]:
print(BBVA_data.isnull().any().any())
print(BBVA_test.isnull().any().any())

False
False


In [5]:
X_train = BBVA_data.drop(['ID_CORRELATIVO', 'ATTRITION'], axis=1)
print(type(X_train), X_train.shape)
X_predic = BBVA_test.drop(['ID_CORRELATIVO'], axis=1)
print(type(X_predic), X_predic.shape)
y_train = BBVA_data['ATTRITION'].values
print(type(y_train), y_train.shape)

<class 'pandas.core.frame.DataFrame'> (70000, 51)
<class 'pandas.core.frame.DataFrame'> (30000, 51)
<class 'numpy.ndarray'> (70000,)


In [6]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler(copy=True, with_mean=True, with_std=True)
xx_train_std = sc_X.fit_transform(X_train.astype(float))
xx_predic_std = sc_X.transform(X_predic.astype(float))

from sklearn.preprocessing import MinMaxScaler
sc_MM = MinMaxScaler()
xx_train_mm = sc_MM.fit_transform(X_train.astype(float))
xx_predic_mm = sc_MM.transform(X_predic.astype(float))

In [7]:
#Probando con la data RECIEN IMPUTADA - FACTORIZADA - STANDARDSCALER O MINMAXSCALER
from sklearn.ensemble import RandomForestClassifier
clf_forest_gini = RandomForestClassifier(n_estimators=25, n_jobs=2)
clf_forest_gini.fit(xx_train_std, y_train)
clf_forest_entropy = RandomForestClassifier(n_estimators=25, n_jobs=2)
clf_forest_entropy.fit(xx_train_mm, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=2,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [8]:
result_gini = clf_forest_gini.predict_proba(xx_predic_std)
result_entropy = clf_forest_entropy.predict_proba(xx_predic_mm)

In [9]:
result_gini = [float("{0:.4f}".format(_)) for _ in list(zip(*result_gini))[1]]
result_entropy = [float("{0:.4f}".format(_)) for _ in list(zip(*result_entropy))[1]]

In [10]:
idd = BBVA_test['ID_CORRELATIVO']
idd.shape

(30000,)

In [11]:
resultados_gini = list(zip(list(idd), result_gini))
resultados_gini.insert(0, ('ID_CORRELATIVO', 'ATTRITION'))
print(resultados_gini[:10])

resultados_entropy = list(zip(list(idd), result_entropy))
resultados_entropy.insert(0, ('ID_CORRELATIVO', 'ATTRITION'))
print(resultados_entropy[:10])

[('ID_CORRELATIVO', 'ATTRITION'), (47411, 0.12), (39861, 0.52), (38898, 0.04), (50927, 0.0), (32969, 0.0), (89661, 0.32), (12197, 0.32), (71520, 0.08), (59759, 0.08)]
[('ID_CORRELATIVO', 'ATTRITION'), (47411, 0.16), (39861, 0.44), (38898, 0.0), (50927, 0.0), (32969, 0.16), (89661, 0.32), (12197, 0.16), (71520, 0.08), (59759, 0.0)]


In [12]:
import csv
with open("8_imputacion_factorizacion_allVar_STD2_RF25gini.csv", 'w') as resultFile:
       wr = csv.writer(resultFile, dialect='excel')
       for row in resultados_gini:
           wr.writerow(row)
with open("8_imputacion_factorizacion_allVar_MINMAX2_RF25gini.csv", 'w') as resultFile:
       wr = csv.writer(resultFile, dialect='excel')
       for row in resultados_entropy:
           wr.writerow(row)

In [15]:
#////////////////////////////////////// SVM
from sklearn import svm
clf_forest_gini = svm.SVC(probability=True)
clf_forest_gini.fit(xx_train_std, y_train)
clf_forest_entropy = svm.SVC(probability=True, kernel="poly")
clf_forest_entropy.fit(xx_train_mm, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
result_gini = clf_forest_gini.predict_proba(xx_predic_std)
result_entropy = clf_forest_entropy.predict_proba(xx_predic_mm)

In [17]:
result_gini = [float("{0:.4f}".format(_)) for _ in list(zip(*result_gini))[1]]
result_entropy = [float("{0:.4f}".format(_)) for _ in list(zip(*result_entropy))[1]]
idd = BBVA_test['ID_CORRELATIVO']
print(idd.shape)

(30000,)


In [18]:
resultados_gini = list(zip(list(idd), result_gini))
resultados_gini.insert(0, ('ID_CORRELATIVO', 'ATTRITION'))
print(resultados_gini[:10])

resultados_entropy = list(zip(list(idd), result_entropy))
resultados_entropy.insert(0, ('ID_CORRELATIVO', 'ATTRITION'))
print(resultados_entropy[:10])

[('ID_CORRELATIVO', 'ATTRITION'), (47411, 0.253), (39861, 0.7479), (38898, 0.1294), (50927, 0.0837), (32969, 0.1281), (89661, 0.1407), (12197, 0.1325), (71520, 0.1573), (59759, 0.1306)]
[('ID_CORRELATIVO', 'ATTRITION'), (47411, 0.1895), (39861, 0.6265), (38898, 0.2731), (50927, 0.1622), (32969, 0.1804), (89661, 0.1557), (12197, 0.1559), (71520, 0.0595), (59759, 0.1831)]


In [22]:
import csv
with open("9_imputacion_factorizacion_allVar_STD_SVMrbf.csv", 'w') as resultFile:
       wr = csv.writer(resultFile, dialect='excel')
       for row in resultados_gini:
           wr.writerow(row)
with open("9_imputacion_factorizacion_allVar_MINMAX2_SVMpoly.csv", 'w') as resultFile:
       wr = csv.writer(resultFile, dialect='excel')
       for row in resultados_entropy:
           wr.writerow(row)

In [20]:
clf_forest_gini = svm.SVC(probability=True)
clf_forest_gini.fit(xx_train_mm, y_train)
clf_forest_entropy = svm.SVC(probability=True, kernel="poly")
clf_forest_entropy.fit(xx_train_std, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
result_gini = clf_forest_gini.predict_proba(xx_predic_mm)
result_entropy = clf_forest_entropy.predict_proba(xx_predic_std)

In [24]:
result_gini = [float("{0:.4f}".format(_)) for _ in list(zip(*result_gini))[1]]
result_entropy = [float("{0:.4f}".format(_)) for _ in list(zip(*result_entropy))[1]]
idd = BBVA_test['ID_CORRELATIVO']
print(idd.shape)

(30000,)


In [25]:
resultados_gini = list(zip(list(idd), result_gini))
resultados_gini.insert(0, ('ID_CORRELATIVO', 'ATTRITION'))
print(resultados_gini[:10])

resultados_entropy = list(zip(list(idd), result_entropy))
resultados_entropy.insert(0, ('ID_CORRELATIVO', 'ATTRITION'))
print(resultados_entropy[:10])

[('ID_CORRELATIVO', 'ATTRITION'), (47411, 0.1327), (39861, 0.9042), (38898, 0.1925), (50927, 0.141), (32969, 0.0977), (89661, 0.1738), (12197, 0.1652), (71520, 0.1635), (59759, 0.1672)]
[('ID_CORRELATIVO', 'ATTRITION'), (47411, 0.328), (39861, 0.3607), (38898, 0.138), (50927, 0.1363), (32969, 0.1539), (89661, 0.163), (12197, 0.1635), (71520, 0.0006), (59759, 0.1635)]


In [26]:
import csv
with open("9_imputacion_factorizacion_allVar_MINMAX2_SVMrbf.csv", 'w') as resultFile:
       wr = csv.writer(resultFile, dialect='excel')
       for row in resultados_gini:
           wr.writerow(row)
with open("9_imputacion_factorizacion_allVar_STD2_SVMpoly.csv", 'w') as resultFile:
       wr = csv.writer(resultFile, dialect='excel')
       for row in resultados_entropy:
           wr.writerow(row)