# Feature importance by chi-squared test

Description: Use a chi-squared test to decide whether to include a feature in the machine learning pipeline

In [None]:
# set project and working directory
# NOTE(review): only home_dir is assigned in this cell; working_dir (used by
# later cells) is not set here -- presumably it is defined by notebook startup
# code; confirm.
# Raises KeyError if the 'project_home' environment variable is not set.
home_dir = os.environ['project_home']

In [None]:
# import other libraries
from scipy.stats import chi2_contingency

### === MAIN ===

#### ==== Parameters setting ====

In [None]:
# Significance threshold: the "Make selection" cell keeps the features whose
# chi-squared p-value is strictly below p_cut.
p_cut = 0.20 # p-value cut-off
# Natural log of the cut-off; not referenced by any cell visible here.
logp_cut = np.log(p_cut)
# NOTE(review): not referenced by any cell visible here -- presumably a minimum
# cell count for the chi-squared test to be considered valid; confirm.
n_min = 5

#### Read input

In [None]:
# Locate and load the most recent pickled DataFrame from <working_dir>/vars/.
import glob

data_source_dir = working_dir+'/vars/'

# select latest dated file (track record)
# glob replaces the original shell `ls` call: it needs no subprocess, works
# without the Python-2-only `commands` module, and an empty match is detected
# explicitly instead of the `ls` error text being used as a file path.
# NOTE(review): the lexicographically last match is taken as the newest file,
# which assumes the DF.<...>.<...>.pickle names sort chronologically -- confirm.
candidates = sorted(glob.glob(data_source_dir + 'DF.*.*.pickle'))
if not candidates:
    raise IOError('no DF.*.*.pickle file found in %s' % data_source_dir)
input_filepath = candidates[-1]
__nb_logger.info('reading file: %s' % input_filepath)

# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project. Binary mode is required for pickle data, and the with-block
# guarantees the handle is closed.
with open(input_filepath, 'rb') as infile:
    DF = pickle.load(infile)

### Run chi-square test for feature relevance

In [None]:
# Candidate feature columns of DF to be tested against the '__label' column.
colnames = ['feature_01', 'feature_02', 'feature_03']

In [None]:
# For every candidate feature, build the observed and expected contingency
# tables against '__label' and run a chi-squared test of independence.
#
# Records maps feature name -> dict with the keys
#   'contigency_table'           observed counts (DataFrame, feature x label)
#   'contigency_table_expected'  expected counts under independence
#   'contigency_table_ratio'     observed / expected, cell by cell
#   'chi2_test'                  {'chi2', 'p', 'dof', 'ex'} from chi2_contingency
# (the 'contigency' spelling of the keys is kept because other cells read them)
Records = {}
for colname in colnames:
    output = {}

    # make contingency table
    # Only rows where both the feature and the label are present are counted.
    # The categories are taken from those same rows, so no all-zero row or
    # column can appear (chi2_contingency rejects zero expected frequencies).
    pairs = DF[[colname, '__label']].dropna()
    if pairs.empty:
        raise ValueError(
            "no rows with both '%s' and '__label' present" % colname)
    p_values = pairs[colname].values
    q_values = pairs['__label'].values

    # Sets are unordered and cannot be indexed; sorted lists give a stable,
    # reproducible row/column layout.
    ps = sorted(set(p_values))
    qs = sorted(set(q_values))
    p_pos = {p: i for i, p in enumerate(ps)}
    q_pos = {q: j for j, q in enumerate(qs)}

    cont_table = np.zeros([len(ps), len(qs)])
    for p, q in zip(p_values, q_values):
        cont_table[p_pos[p], q_pos[q]] += 1
    # end for

    # make expected table
    # Under independence: expected[i, j] = row_total[i] * col_total[j] / N.
    # The marginals must come from the observed table.
    psum = cont_table.sum(axis=1)
    qsum = cont_table.sum(axis=0)
    exp_table = np.outer(psum, qsum) / cont_table.sum()

    # record
    output['contigency_table'] = pd.DataFrame(
        cont_table, index=ps, columns=qs)
    output['contigency_table_expected'] = pd.DataFrame(
        exp_table, index=ps, columns=qs)

    # Ratio > 1 means the cell is over-represented relative to independence.
    odd_table = cont_table / exp_table
    output['contigency_table_ratio'] = pd.DataFrame(
        odd_table, index=ps, columns=qs)

    # run chi-squared test
    # NOTE: per the scipy documentation, chi2_contingency applies Yates'
    # continuity correction by default when dof == 1.
    chi2, p, dof, ex = chi2_contingency(cont_table)
    output['chi2_test'] = {
        'chi2': chi2, 'p': p, 'dof': dof, 'ex': ex
    } # end chi2_test

    # record
    Records[colname] = output
# end for


### Display test results

In [None]:
# Show, for each tested feature, its name in bold, the observed contingency
# table, and the raw chi-squared result dict, separated by a ruler line.
from IPython.display import Markdown, display

for colname in Records:
    record = Records[colname]
    display(Markdown("<b>" + colname + "</b>"))
    display(record['contigency_table'])
    print(record['chi2_test'])
    print('')
    print('_' * 50)
# end for

In [None]:
# Gather the chi-squared result of every feature into a single table:
# one row per feature, one column per entry of its 'chi2_test' dict.
temp = {colname: record['chi2_test'] for colname, record in Records.items()}
_df = pd.DataFrame(temp).T
_df

### Make selection here

In [None]:
# Keep the features whose p-value is strictly below the cut-off and report them.
sels = _df[_df['p'] < p_cut].index
# Single parenthesised argument: valid as both a Python 2 print statement and
# a Python 3 print call (the original line was missing its closing parenthesis).
print('Selected feature at cut-off p-value=%5.2f: [%s]' % (p_cut, ', '.join(map(str, sels))))

### Export to file

In [None]:
# Integer timestamp embedded in the names of both export files below, so the
# pickle and the xlsx written by one run share the same suffix.
# NOTE(review): utc_timestamp() is defined outside this view -- presumably it
# returns seconds since the epoch in UTC; confirm.
_timestamp = int(utc_timestamp())

In [None]:
# Persist the full Records dict (all per-feature tables and test results).
# note: should dump to excel in the next revision
# specify output
outfilename = working_dir + '/vars/chi-squared_test.rawtables.%s.pickle' % _timestamp
__nb_logger.info('write to output: %s' % outfilename)
# Pickle data must be written in binary mode; the with-block ensures the file
# is flushed and closed even if pickling fails part-way.
with open(outfilename, 'wb') as outfile:
    pickle.dump(Records, outfile)

In [None]:
# Write the per-feature summary table (_df) to an Excel workbook whose name
# carries the same _timestamp suffix as the raw-tables pickle.
# specify output
outfilename = working_dir + '/vars/chi-squared_test.results.%s.xlsx' % _timestamp
__nb_logger.info('write to output: %s' % outfilename)
# NOTE(review): if _df still holds the 'ex' entries (expected-frequency arrays),
# each of those cells is an array object rather than a scalar -- check how it
# renders in the workbook.
_df.to_excel(outfilename)