# Data splitting

## Initial train and test split

In [1]:
from utils import wrangle_data, dummy_to_labels, SEED
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.under_sampling import CondensedNearestNeighbour, OneSidedSelection, RandomUnderSampler 
from collections import Counter
import os
import pickle

In [2]:
# Load the dataset through the project helper from utils.
# NOTE(review): dummy=True presumably requests dummy-encoded features — confirm in utils.wrangle_data.
df = wrangle_data(dummy=True)

In [3]:
# Target labels: the 'Result' column.
y_vars = df['Result']
# Predictors: every remaining column except 'Statistical_report'.
x_vars = df.drop(['Statistical_report', 'Result'], axis=1)

In [4]:
# Hold out 20% of the rows as the test set; SEED makes the split repeatable.
# NOTE(review): no `stratify=` is passed, so class proportions are not forced to match between train and test.
x_train, x_test, y_train, y_test = train_test_split(x_vars, y_vars, test_size=0.2, random_state=SEED)

In [6]:
# This is the first cell that writes into './rebalanced_data'. open() raises
# FileNotFoundError when the folder is missing (e.g. on a fresh checkout), so
# create it up front; exist_ok=True makes re-running the cell harmless.
os.makedirs('./rebalanced_data', exist_ok=True)

# Persist the held-out test set as a {'x': features, 'y': labels} dict.
with open('./rebalanced_data/test_set.pickle', mode='wb') as f:
    pickle.dump({'x': pd.DataFrame(x_test, columns=x_vars.columns),
                 'y': y_test},
                f,
                protocol=4)

In [7]:
# Class balance of the training labels: 1 is shown as 'legit', any other value as 'phising'.
y_train.apply(lambda x: 'legit' if x == 1 else 'phising').value_counts()

legit      4886
phising    3958
Name: Result, dtype: int64

Las clases no están balanceadas: a continuación se utilizarán una o varias técnicas para conseguir el equilibrio entre clases.

Referencia (comparación entre Condensed Nearest Neighbour y One Sided Selection): https://books.google.es/books?id=eZJjDQAAQBAJ&pg=PA81&lpg=PA81&dq=condensed+nearest+neighbour+vs+one+sided+selection&source=bl&ots=jhqcx6JeCN&sig=G8jyXElGdp4c3cfvO9r9tS2XCs0&hl=es&sa=X&ved=0ahUKEwiIl5SiluvVAhVGORQKHY49CR4Q6AEIbTAJ#v=onepage&q=condensed%20nearest%20neighbour%20vs%20one%20sided%20selection&f=false

### Condensed Nearest Neighbour

In [12]:
# Use every available core; os.cpu_count() returns None when the count
# cannot be determined, in which case fall back to a single job.
n_cpus = os.cpu_count() or 1

cnn = CondensedNearestNeighbour(random_state=SEED, n_jobs=n_cpus)

# imbalanced-learn renamed `fit_sample` to `fit_resample` and later removed
# the old name, so call whichever method this installation provides.
cnn_resample = cnn.fit_resample if hasattr(cnn, 'fit_resample') else cnn.fit_sample

x_train_cnn, y_train_cnn = cnn_resample(x_train, y_train)

In [8]:
def _label_name(value):
    """Return the readable class name for one encoded label (1 -> 'legit')."""
    return 'legit' if value == 1 else 'phising'


# Element-wise version of the mapping; later cells reuse it to count classes.
results_to_vals = np.vectorize(_label_name)

# Class counts after Condensed Nearest Neighbour.
Counter(results_to_vals(y_train_cnn))

Counter({'legit': 916, 'phising': 3958})

In [13]:
# Persist the CNN-resampled training set as a {'x': features, 'y': labels} dict.
with open('./rebalanced_data/train_set_cnn.pickle', mode='wb') as f:
    cnn_train_set = {
        'x': pd.DataFrame(x_train_cnn, columns=x_vars.columns),
        'y': y_train_cnn,
    }
    pickle.dump(cnn_train_set, f, protocol=4)

### One Sided Selection

In [10]:
# n_cpus is the job count computed in the Condensed Nearest Neighbour cell.
oss = OneSidedSelection(random_state=SEED, n_jobs=n_cpus)

# imbalanced-learn renamed `fit_sample` to `fit_resample` and later removed
# the old name, so call whichever method this installation provides.
oss_resample = oss.fit_resample if hasattr(oss, 'fit_resample') else oss.fit_sample

x_train_oss, y_train_oss = oss_resample(x_train, y_train)

In [10]:
# Class counts after One Sided Selection.
Counter(results_to_vals(y_train_oss))

Counter({'legit': 4861, 'phising': 3958})

In [11]:
# Persist the OSS-resampled training set as a {'x': features, 'y': labels} dict.
with open('./rebalanced_data/train_set_oss.pickle', mode='wb') as f:
    oss_train_set = {
        'x': pd.DataFrame(x_train_oss, columns=x_vars.columns),
        'y': y_train_oss,
    }
    pickle.dump(oss_train_set, f, protocol=4)

### Simple Random Undersampling

In [11]:
# 'auto' is the sampler's default target, so the argument is omitted: the
# `ratio` keyword was renamed to `sampling_strategy` in newer imbalanced-learn
# releases and passing `ratio=` explicitly fails there.
sru = RandomUnderSampler(random_state=SEED)

# imbalanced-learn renamed `fit_sample` to `fit_resample` and later removed
# the old name, so call whichever method this installation provides.
sru_resample = sru.fit_resample if hasattr(sru, 'fit_resample') else sru.fit_sample

x_train_sru, y_train_sru = sru_resample(x_train, y_train)

In [12]:
# Class counts after simple random undersampling.
Counter(results_to_vals(y_train_sru))

Counter({'legit': 3958, 'phising': 3958})

Los resultados ofrecidos por los dos primeros métodos (Condensed Nearest Neighbour y One Sided Selection) son interesantes de estudiar, dado que su objetivo es quedarse únicamente con aquellas observaciones que actúan como prototipos (véase http://www.math.le.ac.uk/people/ag153/homepage/KNN/OliverKNN_Presentation.pdf, pág. 19).

Sin embargo, empezaremos utilizando este último método (Simple Random Undersampling), que es el más común.

## 5-Folds

In [15]:
def create_folds_dict(x_train_balanced, y_train_balanced, k):
    """Split a rebalanced training set into ``k`` stratified train/validation folds.

    Parameters
    ----------
    x_train_balanced : array-like or DataFrame
        Resampled feature matrix. It is wrapped in a DataFrame that uses the
        column names of the notebook-level ``x_vars``, which must already be
        defined when this function is called.
    y_train_balanced : array-like
        Labels aligned row by row with ``x_train_balanced``.
    k : int
        Number of folds.

    Returns
    -------
    list
        ``k`` dicts shaped ``{'train': {'x', 'y'}, 'valid': {'x', 'y'}}``
        (``x`` is a DataFrame, ``y`` a NumPy array), followed by the
        ``StratifiedKFold`` splitter itself as the last element.
    """
    x_vars_aux = pd.DataFrame(x_train_balanced, columns=x_vars.columns)
    # Force a plain ndarray so the integer arrays yielded by split() always
    # select by position; indexing a pandas Series with them would be
    # label-based and could pick the wrong rows or raise KeyError.
    y_var_aux = np.asarray(y_train_balanced)

    # random_state is intentionally not passed: it has no effect while
    # shuffle=False, and newer scikit-learn releases raise ValueError for
    # that combination. The resulting folds are unchanged.
    skf = StratifiedKFold(n_splits=k, shuffle=False)

    train_folds = [
        {'train': {'x': x_vars_aux.iloc[train_index],
                   'y': y_var_aux[train_index]},
         'valid': {'x': x_vars_aux.iloc[test_index],
                   'y': y_var_aux[test_index]}}
        for train_index, test_index in skf.split(x_vars_aux, y_var_aux)
    ]

    # The splitter is stored after the folds, so the returned list has k + 1
    # entries; callers iterating over folds must skip the last one.
    train_folds.append(skf)

    return train_folds

In [22]:
# Build 5 stratified folds from the randomly undersampled (SRU) training set.
train_folds = create_folds_dict(x_train_sru, y_train_sru, k=5)

In [23]:
# Show the class balance of the training part of each of the 5 folds.
# Only the first five entries are fold dicts; the list's last element is the splitter.
for fold in train_folds[:5]:
    print(Counter(results_to_vals(fold['train']['y'])))

Counter({'legit': 3166, 'phising': 3166})
Counter({'legit': 3166, 'phising': 3166})
Counter({'legit': 3166, 'phising': 3166})
Counter({'legit': 3167, 'phising': 3167})
Counter({'legit': 3167, 'phising': 3167})


Guardamos el diccionario con los diferentes pliegues en un archivo para que pueda ser utilizado en otros notebooks.

In [24]:
# Persist the SRU folds; the list is pickled exactly as create_folds_dict
# returned it (the fold dicts followed by the splitter object).
with open('./rebalanced_data/k_folds_sru.pickle', mode='wb') as f:
    pickle.dump(train_folds, f, protocol=4)

Se guardan en diferentes archivos los otros balanceos anteriores por si interesara usarlos en el futuro.

In [16]:
# Build and persist 5-fold splits for the other two balancing methods,
# one pickle file per method (CNN first, then OSS).
other_balancings = [
    ('./rebalanced_data/k_folds_cnn.pickle', x_train_cnn, y_train_cnn),
    ('./rebalanced_data/k_folds_oss.pickle', x_train_oss, y_train_oss),
]

for folds_path, x_balanced, y_balanced in other_balancings:
    with open(folds_path, mode='wb') as f:
        pickle.dump(create_folds_dict(x_balanced, y_balanced, k=5), f, protocol=4)

Guardamos en un fichero la partición entera de train balanceada con SRU (las particiones de CNN y OSS ya se guardaron en sus respectivas secciones).

In [25]:
# Persist the SRU-resampled training set as a {'x': features, 'y': labels} dict.
with open('./rebalanced_data/train_set_sru.pickle', mode='wb') as f:
    sru_train_set = {
        'x': pd.DataFrame(x_train_sru, columns=x_vars.columns),
        'y': y_train_sru,
    }
    pickle.dump(sru_train_set, f, protocol=4)