# IEEE-CIS Fraud Detection

## Missing Value and Outlier Analysis

In [308]:
def corr_thresh(data, thres):
    '''
    Return the strongly correlated column pairs of ``data`` as a matrix.

    The pairwise correlation matrix of ``data`` is computed, the diagonal
    (self-correlation) is overwritten with 0, and only the entries whose
    absolute value is strictly greater than ``thres`` are kept.  The
    surviving entries are pivoted back into a DataFrame; cells that did not
    pass the threshold are missing (NaN) in the result.
    '''
    pairwise = data.corr()

    # Self-correlation is always 1, so blank it out to keep it from passing the filter
    for pos in range(pairwise.shape[0]):
        pairwise.iat[pos, pos] = 0

    # Long format: one value per (row label, column label) pair
    long_form = pairwise.stack()

    # Keep the pairs whose magnitude exceeds the threshold
    strong = long_form.loc[long_form.abs().gt(thres)]

    # Back to wide format, with the second index level as columns
    return strong.unstack(level=1)

# Keep only the pairs among the C_cols columns whose absolute correlation exceeds 0.8
corr_new = corr_thresh(train[C_cols], 0.8)
# Flatten to one value per (row, column) pair and sort ascending by correlation for display
corr_new.stack().sort_values()

C13  C6     0.808531
C6   C13    0.808531
C14  C10    0.853009
C10  C14    0.853009
C6   C12    0.858182
              ...   
C1   C11    0.996515
C8   C10    0.996970
C10  C8     0.996970
C12  C7     0.999489
C7   C12    0.999489
Length: 92, dtype: float64

In [312]:
def corr_drop_list(data, thres):
    '''
    Returns list of columns to drop

    A column is flagged when its absolute correlation with any column that
    comes before it in the correlation matrix is strictly greater than
    ``thres``.  Only the strict upper triangle is inspected, so the first
    column is never flagged and each pair is considered once.

    Parameters
    ----------
    data : DataFrame whose pairwise correlations are taken with ``data.corr()``.
    thres : value the absolute correlation must exceed for a column to be flagged.

    Returns
    -------
    list of the flagged column labels, in correlation-matrix column order.
    '''
    # Create a correlation DataFrame
    corr = data.corr().abs()

    # Work on the raw array so the check is positional: looking columns up by
    # label breaks when labels repeat (the lookup then yields a DataFrame and
    # the builtin any() would test its column labels, not its values).
    strength = corr.to_numpy()

    # Select upper triangle of correlation matrix (diagonal excluded)
    upper = np.triu(np.ones(strength.shape, dtype=bool), k=1)

    # Find columns with correlation greater than the threshold;
    # NaN correlations compare False and are therefore never flagged.
    flagged = ((strength > thres) & upper).any(axis=0)

    return corr.columns[flagged].tolist()

# Columns among C_cols whose absolute correlation with an earlier column exceeds 0.8
drop_list = corr_drop_list(train[C_cols], thres = .8)

# Number of flagged columns (a bare expression: only shown if it ends a notebook cell)
len(drop_list)

print(drop_list)

['C2', 'C4', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14']
