# Using Information Theory to Evaluate Features

### Loading Libraries

In [1]:
# Warnings
import warnings

# Path
from pathlib import Path

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# SciPy
from scipy.stats import spearmanr

# Scikit-Learn
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler, scale
from sklearn.feature_selection import (mutual_info_classif,
                                       mutual_info_regression)

from sklearn.metrics import (mean_squared_error,
                             mean_absolute_error,
                             mean_squared_log_error,
                             median_absolute_error,
                             explained_variance_score,
                             r2_score)

from sklearn.neighbors import (KNeighborsClassifier,
                               KNeighborsRegressor)

from sklearn.model_selection import (cross_val_score,
                                     cross_val_predict,
                                     GridSearchCV)

# YellowBrick
from yellowbrick.model_selection import ValidationCurve, LearningCurve

In [2]:
# Use seaborn's white-grid theme for the figures in this notebook.
sns.set_style(style='whitegrid')

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

In [3]:
# IPython line magic: select matplotlib's inline backend.
%matplotlib inline

### Retrieving Data

In [6]:
# Location of the HDF5 store, resolved against the current user's home
# directory instead of a single machine-specific absolute path.
DATA_STORE = Path.home() / 'desktop' / 'MLAT' / 'data' / 'assets.h5'

# Open read-only. pandas' default mode ('a') creates an empty file when the
# path does not exist, so a wrong path would only surface later as a KeyError
# on the lookup below; mode='r' fails immediately on a missing file and never
# writes to the store.
with pd.HDFStore(DATA_STORE, mode='r') as store:
    data = store['engineered_features']

### Setting Dummy Variables

In [7]:
# One-hot encode the categorical columns. The sector dummies get an empty
# prefix and separator, so their column names are the bare sector values.
# Afterwards every '.0' substring is stripped from the column names
# (presumably left over from float-typed category values - confirm).
dummy_data = pd.get_dummies(
    data,
    columns=['year', 'month', 'msize', 'age', 'sector'],
    prefix=['year', 'month', 'msize', 'age', ''],
    prefix_sep=['_', '_', '_', '_', ''],
).rename(columns=lambda name: name.replace('.0', ''))

dummy_data.info()

### Mutual Information

#### Original Data

In [8]:
# Target columns, one per horizon suffix (1, 2, 3, 6, 12).
target_labels = [f'target_{i}m' for i in [1, 2, 3, 6, 12]]

# Drop incomplete rows once and derive targets and features from the same
# frame, instead of running dropna() separately for each.
complete_data = data.dropna()
targets = complete_data.loc[:, target_labels]

# Integer-encode sector with assign(), which returns a new frame rather than
# writing to a derived frame through attribute assignment.
features = (complete_data
            .drop(target_labels, axis=1)
            .assign(sector=lambda df: pd.factorize(df.sector)[0]))

# Positional indices of the categorical columns, in the form expected by the
# `discrete_features` argument of the mutual-information estimator.
cat_cols = ['year', 'month', 'msize', 'age', 'sector']
discrete_features = [features.columns.get_loc(c) for c in cat_cols]

In [9]:
# Mutual information between every feature and the sign of each target
# (1 where the target is > 0, otherwise 0). The result has one row per
# feature and one column per target label, and is built in a single
# constructor call instead of growing an empty frame column by column.
mutual_info = pd.DataFrame(
    {
        label: mutual_info_classif(X=features,
                                   y=(targets[label] > 0).astype(int),
                                   discrete_features=discrete_features,
                                   random_state=42)
        for label in target_labels
    },
    index=features.columns,
)

In [10]:
# Column totals: summed mutual information per target label.
mutual_info.sum(axis=0)

### Normalized MI Heatmap

In [11]:
# Divide each target's column by its total so values are shares of that
# target's summed MI, then transpose so targets are rows and features columns.
fig, ax = plt.subplots(figsize=(15, 4))
sns.heatmap(data=(mutual_info / mutual_info.sum()).T, cmap='Blues', ax=ax)
plt.show()

#### Dummy Data

In [12]:
# Target columns, one per horizon suffix (1, 2, 3, 6, 12).
target_labels = [f'target_{i}m' for i in [1, 2, 3, 6, 12]]

# Drop incomplete rows once and derive targets and features from the same
# frame, instead of running dropna() separately for each.
complete_dummy_data = dummy_data.dropna()
dummy_targets = complete_dummy_data.loc[:, target_labels]
dummy_features = complete_dummy_data.drop(target_labels, axis=1)

# Treat as discrete every column that is absent from `features`, the
# non-dummy feature frame built in the "Original Data" cell, which must have
# been run first. Order follows dummy_features.columns.
cat_cols = [c for c in dummy_features.columns if c not in features.columns]
discrete_features = [dummy_features.columns.get_loc(c) for c in cat_cols]

In [13]:
# Mutual information between every dummy-encoded feature and the sign of each
# target (1 where the target is > 0, otherwise 0). The result has one row per
# feature and one column per target label, and is built in a single
# constructor call instead of growing an empty frame column by column.
mutual_info_dummies = pd.DataFrame(
    {
        label: mutual_info_classif(X=dummy_features,
                                   y=(dummy_targets[label] > 0).astype(int),
                                   discrete_features=discrete_features,
                                   random_state=42)
        for label in target_labels
    },
    index=dummy_features.columns,
)

In [14]:
# Column totals: summed mutual information per target label (dummy encoding).
mutual_info_dummies.sum(axis=0)

In [16]:
# Divide each target's column by its total so values are shares of that
# target's summed MI; features stay on the rows, hence the tall figure.
fig, ax = plt.subplots(figsize=(4, 20))
sns.heatmap(data=mutual_info_dummies / mutual_info_dummies.sum(),
            cmap='Blues',
            ax=ax)
plt.show()