In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import manifold
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from time import time 

In [4]:
data = pd.read_csv("datosLimpios.csv") #leemos los datos de train después de haber imputado los NA

In [5]:
data=data.drop('Unnamed: 0',axis=1)
data.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,employment_status,census_msa,household_adults,household_children,h1n1_vaccine,seasonal_vaccine,race_White,race_Black,race_Other or Multiple,race_Hispanic
0,1,0,0,0,0,0,0,1,1,0,...,3,3,0,0,0,0,1,0,0,0
1,3,2,0,1,0,1,0,1,1,0,...,2,1,0,0,0,1,1,0,0,0
2,1,1,0,1,0,0,0,0,0,0,...,2,1,2,0,0,0,1,0,0,0
3,1,1,0,1,0,1,1,0,0,0,...,3,2,0,0,0,1,1,0,0,0
4,2,1,0,1,0,1,1,0,1,0,...,2,1,1,0,0,0,1,0,0,0


In [None]:
def plot_clustering(X_red, labels, title=None):
    x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0)
    X_red = (X_red - x_min) / (x_max - x_min)

    plt.figure(figsize=(8, 6))
    for i in range(X_red.shape[0]):
        plt.text(X_red[i, 0], X_red[i, 1], str( int (y[i])),
                 color=plt.cm.nipy_spectral(labels[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})

    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title, size=17)
    plt.axis('off')
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    
# ploteamos con los datos desbalanceados
start_time = time()
X=data
y=data["h1n1_vaccine"]
porcentaje=30 # porcentaje sobre 100 de datos del dataset que se utilizan para mostrar la proyección t-SNE
X_red = manifold.TSNE(n_components=2).fit_transform(X[:int(len(X)*porcentaje/100)][:-1])
plot_clustering(X_red,y)

elapsed_time = time() - start_time
print("Tiempo de ejecución: %.10f segundos." % elapsed_time)    

In [None]:
#Hacemos undersampling con la clase mayoritaria y oversampling con la clase minoritaria
#esto sólo vale para predecir la primera etiqueta. Para la segunda los datos se toman tal cual están

data0 = data[data["h1n1_vaccine"] ==0].sample(n=15359)
data2 = data[data["h1n1_vaccine"] ==1]
data = pd.concat([data0,data2,data2])
data=data.sample(frac=1).reset_index(drop=True)

In [None]:
start_time = time()
X=data
y=data["h1n1_vaccine"]
porcentaje=30 # porcentaje sobre 100 de datos del dataset que se utilizan para mostrar la proyección t-SNE
X_red = manifold.TSNE(n_components=2).fit_transform(X[:int(len(X)*porcentaje/100)][:-1])
plot_clustering(X_red,y)

elapsed_time = time() - start_time
print("Tiempo de ejecución: %.10f segundos." % elapsed_time)   

In [25]:
#Escalamos los datos. Esta es la forma que mejor resultado ha dado

scaler = MinMaxScaler()

data = pd.DataFrame(scaler.fit_transform(data),columns=data.columns )

data.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,employment_status,census_msa,household_adults,household_children,h1n1_vaccine,seasonal_vaccine,race_White,race_Black,race_Other or Multiple,race_Hispanic
0,0.666667,0.5,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.333333,0.5,0.333333,0.666667,1.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
2,0.666667,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.333333,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0.666667,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.333333,0.5,0.333333,0.0,1.0,1.0,0.0,0.0,1.0,0.0
4,0.666667,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.333333,0.0,0.333333,0.666667,1.0,1.0,1.0,0.0,0.0,0.0


In [26]:
#transformamos los datos de la misma manera que el train
data_test = pd.read_csv("test_clean_2.csv") #leemos los datos
data_test = pd.DataFrame(scaler.fit_transform(data_test),columns=data_test.columns )
data_test=data_test.drop('Unnamed: 0',axis=1)

In [7]:
#exportamos los datos para introducirlos en el mdelo en R
data.to_csv("dataTrainMinMaxEtiqueta1",index = False)
data_test.to_csv("dataTestMinMax",index = False)

In [20]:
#ahora continuamos haciendo preprocesamiento que más tarde aplicaremos a los datos antes de emplear el modelo kNN

In [27]:
y1 = data["h1n1_vaccine"]

In [28]:
y1.value_counts()

0.0    15359
1.0    11348
Name: h1n1_vaccine, dtype: int64

In [29]:
data=data.drop(["h1n1_vaccine","seasonal_vaccine"],axis=1)
data.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children,race_White,race_Black,race_Other or Multiple,race_Hispanic
0,0.666667,0.5,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.333333,0.5,0.333333,0.666667,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.666667,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.333333,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.666667,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.333333,0.5,0.333333,0.0,0.0,0.0,1.0,0.0
4,0.666667,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.333333,0.0,0.333333,0.666667,1.0,0.0,0.0,0.0


In [10]:
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours
#El mejor resultado en train se obtiene con n=1, sin embargo cuando predecimos con n=1 el resultado es malísimo

In [12]:
def knn_cv(n_neighbors,p, data, targets):
    
    estimator = KNeighborsClassifier(
        n_neighbors = n_neighbors,
        p = p,
        n_jobs = -1
    )
    cval = cross_val_score(estimator, data,
    targets,scoring="roc_auc", cv=10)
    return cval.mean()

def optimize_knn(data, targets):
    """Apply Bayesian Optimization to Random Forest parameters."""
    def knn_crossval(n_neighbors,p):
        """Wrapper of RandomForest cross validation.
        Notice how we ensure n_estimators and min_samples_split are casted
        to integer before we pass them along. Moreover, to avoid max_features
        taking values outside the (0, 1) range, we also ensure it is capped
        accordingly.
        """
        return knn_cv(
            n_neighbors = int(n_neighbors),
                      p = int(p),
                      
            
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=knn_crossval,
        #aquí los parámetros y sus cotas
        pbounds={
            "n_neighbors": (1, 51),
            "p":(1, 20)
            
        },
        random_state=1234,
        verbose=2
    )
    optimizer.maximize(n_iter=20) #número de iteraciones que efectuará el algoritmo

    print("Final result:", optimizer.max)
    
##Una vez que creamos el optimizador,
##para aplicarlo a los datos lanzamos el siguiente código:

data=data #datos sin el target
targets=y1
print(Colours.green("--- Optimizing Knn ---"))
optimize_knn(data, targets)

[92m--- Optimizing Knn ---[0m
|   iter    |  target   | n_neig... |     p     |
-------------------------------------------------
| [0m 1       [0m | [0m 0.7975  [0m | [0m 10.58   [0m | [0m 12.82   [0m |
| [0m 2       [0m | [0m 0.7933  [0m | [0m 22.89   [0m | [0m 15.92   [0m |
| [0m 3       [0m | [0m 0.7917  [0m | [0m 40.0    [0m | [0m 6.179   [0m |
| [0m 4       [0m | [0m 0.7942  [0m | [0m 14.82   [0m | [0m 16.24   [0m |
| [0m 5       [0m | [0m 0.7919  [0m | [0m 48.91   [0m | [0m 17.64   [0m |
| [0m 6       [0m | [0m 0.7975  [0m | [0m 7.591   [0m | [0m 13.61   [0m |
| [95m 7       [0m | [95m 0.8007  [0m | [95m 7.876   [0m | [95m 4.982   [0m |
| [95m 8       [0m | [95m 0.9038  [0m | [95m 1.0     [0m | [95m 1.0     [0m |
| [0m 9       [0m | [0m 0.8994  [0m | [0m 1.008   [0m | [0m 3.583   [0m |
| [0m 10      [0m | [0m 0.8141  [0m | [0m 25.89   [0m | [0m 1.0     [0m |
| [0m 11      [0m | [0m 0.8141  [0m 

In [13]:
kf = KFold(n_splits=10,shuffle = True) #validación cruzada 
caracteristicas=data.columns # quitamos la etiqueta
X=pd.DataFrame()
y=y1

roc_auc=0

for char in caracteristicas:
    
    roc=[]
    X[char]=data[char]#añadimos la variable
    
    #entrenamos el modelo y testeamos si funciona mejor o peor con esa variable añadida  
    for train_index, test_index in kf.split(X):
        
    
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        clf = KNeighborsClassifier(n_neighbors = 51,p=1)
        clf.fit(X_train, y_train)
    
        y_pred=clf.predict(X_test)
        roc.append(roc_auc_score(y_test, y_pred))
        
    if(100*np.mean(roc)<roc_auc):#si hemos empeorado el resultado, quitamos de nuevo la variable y si hemos mejorado, la dejamos
                             #y actualizamos la cota de eficiencia a superar
            
        X=X.drop([char],axis=1)
        print('Variable ',char,' rechazada')
    else:
        roc_auc=100*np.mean(roc)
        print('Variable ',char,' aceptada')
        print('Características actuales:', X.columns)
        print('Roc actual: ',round(100*np.mean(roc),2),'%')    
print('Con las variables ',X.columns,' conseguimos un roc del ',roc_auc,'%')

Variable  h1n1_concern  aceptada
Características actuales: Index(['h1n1_concern'], dtype='object')
Roc actual:  50.35 %
Variable  h1n1_knowledge  aceptada
Características actuales: Index(['h1n1_concern', 'h1n1_knowledge'], dtype='object')
Roc actual:  52.58 %
Variable  behavioral_antiviral_meds  rechazada
Variable  behavioral_avoidance  rechazada
Variable  behavioral_face_mask  rechazada
Variable  behavioral_wash_hands  rechazada
Variable  behavioral_large_gatherings  aceptada
Características actuales: Index(['h1n1_concern', 'h1n1_knowledge', 'behavioral_large_gatherings'], dtype='object')
Roc actual:  52.75 %
Variable  behavioral_outside_home  rechazada
Variable  behavioral_touch_face  rechazada
Variable  doctor_recc_h1n1  aceptada
Características actuales: Index(['h1n1_concern', 'h1n1_knowledge', 'behavioral_large_gatherings',
       'doctor_recc_h1n1'],
      dtype='object')
Roc actual:  59.0 %
Variable  doctor_recc_seasonal  aceptada
Características actuales: Index(['h1n1_concern',