In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%load_ext autoreload
%autoreload 2

In [2]:
# NOTE(review): load_csv_data is not defined in this notebook; presumably it is
# provided by this wildcard import - confirm against proj1_helpers.
from proj1_helpers import *
# Location of the training CSV, relative to the notebook's working directory
DATA_TRAIN_PATH = '../data/train.csv'
# Unpack the three arrays returned by load_csv_data for the training file
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

In [3]:
# Sanity check: show the shapes of the three loaded arrays
print(y.shape, tX.shape,ids.shape)

(250000,) (250000, 30) (250000,)


In [4]:
def set_nan(tX, missing_value=-999):
    """
    Label the placeholder values (-999 by default) as NaN.

    Parameters
    ----------
    tX : array_like
        Feature data. It is never modified; a new array is returned.
    missing_value : scalar, optional
        Value that marks a missing entry. Defaults to -999.

    Returns
    -------
    numpy.ndarray
        Copy of tX in which every entry equal to missing_value is NaN.
        Input that cannot hold NaN (e.g. an integer array) is converted
        to float first.
    """
    tX_copy = np.copy(tX)
    if not np.issubdtype(tX_copy.dtype, np.inexact):
        # NaN cannot be stored in an integer/bool array: the assignment below
        # would raise ValueError, so switch to a float array first.
        tX_copy = tX_copy.astype(float)
    tX_copy[tX_copy == missing_value] = np.nan
    return tX_copy

In [5]:
def remove_empty_columns(tX, threshold = 0.4):
    """
    Drop the feature columns that are mostly missing.

    A column is kept only when its proportion of NaN entries is strictly below
    threshold. The surviving columns are returned in a new array; tX itself is
    left untouched.
    """
    features = np.asarray(tX)
    n_rows = features.shape[0]
    # Number of NaN entries per column, turned into a proportion of the rows
    nan_ratio = np.count_nonzero(np.isnan(features), axis=0) / n_rows
    keep = nan_ratio < threshold
    # Boolean-mask indexing builds a fresh array, so the input is never aliased
    return features[:, keep]

In [6]:
def copy_data(y, tX, ids):
    """
    Return independent NumPy copies of the three inputs as the tuple (y, tX, ids).
    """
    return tuple(np.copy(arr) for arr in (y, tX, ids))

In [7]:
def filter_nan(y, tX, ids, remove=True, replace_val=0):
    """
    Handle the NaN entries of tX, working on copies of the inputs.

    With remove=True, every row of tX holding at least one NaN is dropped together
    with the matching entries of y and ids. Otherwise all rows are kept and each
    NaN in tX is overwritten with replace_val.
    Only NaN is detected here; other placeholder values are not looked for.
    Returns the tuple (y, tX, ids).
    """
    labels, features, row_ids = copy_data(y, tX, ids)
    is_nan = np.isnan(features)  # True where an entry is NaN

    if not remove:
        # Keep every row and substitute the missing entries
        features[is_nan] = replace_val
        return labels, features, row_ids

    # Keep only the rows that do not contain a single NaN
    complete_rows = np.logical_not(is_nan.any(axis=1))
    return labels[complete_rows], features[complete_rows], row_ids[complete_rows]

In [8]:
def remove_outliers(y, tX, ids, iqr_factor=1.5):
    """
    Remove the rows containing outliers, using the interquartile range (IQR).

    For each column the first and third quartiles Q1 and Q3 are computed. An entry
    is an inlier when it lies in [Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR],
    with IQR = Q3 - Q1. A row is kept only if all of its entries are inliers.

    Parameters
    ----------
    y, tX, ids : array_like
        Arrays sharing their first dimension; tX is two-dimensional. None of
        them is modified.
    iqr_factor : float, optional
        Width of the accepted band in multiples of the IQR. Defaults to 1.5.

    Returns
    -------
    tuple
        (y, tX, ids) restricted to the kept rows, as new arrays.

    Notes
    -----
    NaN entries are ignored when computing the quartiles; a row holding a NaN
    is dropped because NaN never satisfies the bounds. Every column is tested,
    including ones with only a few distinct values.
    """
    # Reminder emitted on every call; the text is left unchanged.
    print("""TODO: only remove outliers when max - min > threshold like 10
    Doubt with DER_deltar_tau_lep and PRI_jet_all_pt""")
    y_arr, tX_arr, ids_arr = np.asarray(y), np.asarray(tX), np.asarray(ids)

    if tX_arr.shape[0] == 0:
        # Nothing to filter, and quartiles of an empty sample are undefined
        return y_arr.copy(), tX_arr.copy(), ids_arr.copy()

    # nanpercentile: with plain percentile a single NaN makes the quartiles of
    # its column NaN, which would flag every row as an outlier.
    Q1 = np.nanpercentile(tX_arr, 25, axis=0)
    Q3 = np.nanpercentile(tX_arr, 75, axis=0)
    IQR = Q3 - Q1
    # True for entries lying inside the accepted band (inliers)
    mask = (tX_arr >= Q1 - iqr_factor * IQR) & (tX_arr <= Q3 + iqr_factor * IQR)

    row_mask = mask.all(axis=1)  # False for rows containing any outlier
    # Boolean indexing already returns new arrays, so no prior copy is needed
    return y_arr[row_mask], tX_arr[row_mask], ids_arr[row_mask]

In [9]:
def noncategorical_columns(tX, max_categories=10):
    """
    Compute the columns with more than max_categories unique values.

    Parameters
    ----------
    tX : array_like
        Data whose columns (axis 0 runs over the rows) are inspected.
    max_categories : int, optional
        A column with at most this many distinct values is considered
        categorical. Defaults to 10.

    Returns
    -------
    numpy.ndarray of bool
        True for each column holding more than max_categories distinct values.

    Notes
    -----
    NaN never compares equal to anything, so every NaN entry counts as a
    distinct value.
    """
    tX_sorted = np.sort(tX, axis=0)
    # After sorting, equal values are adjacent: the number of distinct values is
    # one plus the number of places where two neighbours differ. Comparing the
    # neighbours directly (instead of testing their difference against 0) keeps
    # repeated +/-inf from being counted as different values.
    changes = tX_sorted[1:] != tX_sorted[:-1]
    nunique_col = changes.sum(axis=0) + 1
    return nunique_col > max_categories

In [10]:
def scale(tX, method="standard"):
    """
    Scale noncategorical features using the specified method.

    Only the columns flagged by noncategorical_columns are rescaled; the other
    columns are returned unchanged. tX itself is not modified.

    Parameters
    ----------
    tX : array_like
        Two-dimensional feature data.
    method : str, optional
        "standard" subtracts the column mean and divides by the column standard
        deviation. "min-max" (or "minmax") maps each column onto [0, 1].

    Returns
    -------
    numpy.ndarray
        Scaled copy of tX. Non-floating input is converted to float so the
        scaled values are not truncated when written back.

    Raises
    ------
    ValueError
        If method is not one of the supported names.
    """
    if method not in ("standard", "min-max", "minmax"):
        # An unknown name used to fall through to min-max silently, which hid typos
        raise ValueError(
            "Unknown scaling method {!r}; expected 'standard' or 'min-max'".format(method)
        )

    tX_copy = np.copy(tX)
    if not np.issubdtype(tX_copy.dtype, np.inexact):
        # Writing floats into an integer array would silently truncate them
        tX_copy = tX_copy.astype(float)
    noncategorical_col = noncategorical_columns(tX_copy)
    tX_noncat = tX_copy[:, noncategorical_col]

    if method == "standard":
        # Standardize the data
        tX_copy[:, noncategorical_col] = (tX_noncat - tX_noncat.mean(axis=0)) / tX_noncat.std(axis=0)
    else:
        # Apply a min-max normalization to scale data between 0 and 1
        col_min = tX_noncat.min(axis=0)
        col_max = tX_noncat.max(axis=0)
        tX_copy[:, noncategorical_col] = (tX_noncat - col_min) / (col_max - col_min)
    return tX_copy

In [11]:
def remove_correlated_features(tX, threshold=0.9):
    """
    Compute the correlations between the noncategorical features and remove the
    features whose correlation with an earlier, kept feature reaches the
    specified threshold.

    Parameters
    ----------
    tX : array_like
        Two-dimensional feature data; it is not modified.
    threshold : float, optional
        A feature is dropped when its correlation coefficient with a kept
        feature of lower index is >= threshold. Defaults to 0.9.

    Returns
    -------
    numpy.ndarray
        The kept noncategorical columns followed by all categorical columns.
        The column order therefore differs from tX whenever a categorical
        column precedes a noncategorical one.

    Notes
    -----
    NOTE(review): the signed coefficient is compared, so strongly negatively
    correlated features are kept - confirm this is intended.
    """
    tX_copy = np.copy(tX)
    noncategorical_col = noncategorical_columns(tX_copy)
    cat_idx = np.where(~noncategorical_col)[0]  # positions of the categorical columns
    tX_noncat = tX_copy[:, noncategorical_col]

    nb_col = tX_noncat.shape[1]
    # True for the noncategorical columns that are kept
    columns = np.full((nb_col,), True, dtype=bool)

    # With fewer than two columns there is nothing to compare, and np.corrcoef
    # returns a scalar for a single variable, which cannot be indexed below.
    if nb_col > 1:
        corr_matrix = np.corrcoef(tX_noncat, rowvar=False)
        for i in range(nb_col):
            if not columns[i]:
                # An already dropped feature does not eliminate others
                continue
            # Drop every later column that is highly correlated with column i
            columns[i + 1:] &= ~(corr_matrix[i, i + 1:] >= threshold)

    # Remove correlated features and append the categorical features
    return np.c_[tX_noncat[:, columns], tX_copy[:, cat_idx]]

In [12]:
def remove_pri_colums(tX):
    """
    Placeholder that is not implemented yet: tX is ignored and None is returned.
    """
    # TODO: presumably meant to drop the PRI_* feature columns - confirm intent
    return None

def remove_der_columns(tX):
    """
    Placeholder that is not implemented yet: tX is ignored and None is returned.
    """
    # TODO: presumably meant to drop the DER_* feature columns - confirm intent
    return None

In [13]:
def pipeline(y, tX, ids, return_all=False):
    """
    Run the full preprocessing chain on the data.

    Steps, in order: mark placeholder values as NaN, drop the mostly-empty
    columns, drop the rows still containing NaN, drop the rows containing
    outliers, standardize the noncategorical features and remove the highly
    correlated ones.

    Parameters
    ----------
    y, tX, ids : array_like
        Arrays sharing their first dimension; tX is the feature matrix.
    return_all : bool, optional
        When False (default) only the processed feature matrix is returned.
        When True the tuple (y, tX, ids) is returned, with y and ids
        restricted to the rows that survived the filtering, so they stay
        aligned with the processed features.

    Returns
    -------
    numpy.ndarray or tuple
        The processed feature matrix, or (y, tX, ids) when return_all is True.

    Notes
    -----
    Rows are removed along the way, so the result generally has fewer rows
    than the y and ids passed in; use return_all=True to get matching ones.
    """
    tX_nan = set_nan(tX)
    tX_columns = remove_empty_columns(tX_nan)
    y_filtered, tX_filtered, ids_filtered = filter_nan(y, tX_columns, ids)
    y_outliers, tX_outliers, ids_outliers = remove_outliers(y_filtered, tX_filtered, ids_filtered)
    tX_scale = scale(tX_outliers, method="standard")
    tX_corr = remove_correlated_features(tX_scale, threshold=0.9)
    if return_all:
        # Hand back the row-aligned y and ids instead of discarding them
        return y_outliers, tX_corr, ids_outliers
    return tX_corr

In [14]:
# Run the whole preprocessing chain on the loaded data and display the returned feature matrix
pipeline(y, tX, ids)

TODO: only remove outliers when max - min > threshold like 10
    Doubt with DER_deltar_tau_lep and PRI_jet_all_pt


array([[ 1.00116781,  0.60294873,  1.27197329, ...,  1.16307234,
         0.25544962,  2.        ],
       [ 1.79100178,  1.23888656,  1.54716042, ...,  0.39352041,
         0.64881421,  1.        ],
       [-0.711809  , -0.81307483, -0.69616427, ..., -1.3005739 ,
        -0.34892147,  3.        ],
       ...,
       [-0.72007049,  1.88954019,  0.35822785, ..., -1.53764989,
         1.62671638,  1.        ],
       [ 0.82493433,  1.56486369,  0.82224994, ..., -0.66441097,
         1.39973289,  1.        ],
       [-0.15941388,  0.93260481,  0.15310963, ...,  0.97405958,
        -0.08061817,  1.        ]])