# Feature Engineering
This notebook focuses on:
- Exploring data transformations
- Analysing feature importance
- Selecting features

In [None]:
# All imports live in this first code cell so the notebook survives
# "Restart Kernel -> Run All": `os` is needed right below and `pd` is used by
# the binning cells further down, but neither was imported anywhere before.
import os

import pandas as pd

# import custom libraries
# NOTE(review): `libs` must be importable from the kernel's working directory;
# if the package actually lives in `parent_dir`, that directory has to be added
# to `sys.path` first — confirm against the project layout.
from libs import data_utils

# Directory one level above the current working directory; the data-loading
# cell builds its input path from it.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

In [None]:
# Load the dataset through the project helper, pointing it at the
# `data/interim/` folder located under `parent_dir`.
df = data_utils.read_data(f'{parent_dir}/data/interim/')

In [None]:
# Display per-column summary statistics (assumes `df` is a pandas DataFrame,
# in which case this includes the quartiles and max discussed in the next note).
df.describe()

In the following columns, we can see significant differences between the <b>maximum values</b> and the respective <b>75th percentiles</b>, indicating the possibility of right-skewed distributions:
- rank_p1,
- rank_p2,
- consecutive_wins_p1,
- consecutive_wins_p2,
- days_last_win_p1 and
- days_last_win_p2.

#### Binning features
Binning is a technique used to discretize continuous numerical features into categorical bins.

In [None]:
# Features to discretize, mapped to the number of quantile bins requested for each.
binning_features = {
    'RankP1': 15,
    'RankP2': 15,
    'Rank_dif': 15,
    'Rank_ratio': 15,
    'OddP2': 15,
    'Odd_dif': 15,
    'Odd_ratio': 15
}

for feature, num_classes in binning_features.items():
    # Quantile-based edges, so each bin holds approximately the same number of rows.
    # With duplicates='drop', repeated edges are merged, so fewer than
    # `num_classes` bins may come back.
    bin_edges = pd.qcut(df[feature], q=num_classes, labels=False, retbins=True, duplicates='drop')[1]

    new_col_name = feature + '_binned'

    # Assign every row a 0-based bin label.
    # include_lowest=True is required here: the first quantile edge equals the
    # column minimum, and pd.cut's default left-open intervals would otherwise
    # leave the rows holding that minimum unbinned (NaN).
    df[new_col_name] = pd.cut(
        df[feature],
        bins=bin_edges,
        labels=range(len(bin_edges) - 1),
        include_lowest=True,
    )

    print(f'{feature} binned.')

In [None]:
# Recompute the binned columns and collect every feature's bin boundaries
# into a lookup table (`binning_df`).
# `binning_features` maps feature names to the number of requested quantile bins.
binning_features = {
    'RankP1': 15,
    'RankP2': 15,
    'Rank_dif': 15,
    'Rank_ratio': 15,
    'OddP2': 15,
    'Odd_dif': 15,
    'Odd_ratio': 15
}

# One record per (feature, bin); turned into a DataFrame once, after the loop.
binning_data = []

# Iterate through each feature and its requested number of bins
for feature, num_classes in binning_features.items():
    # Quantile-based edges, so each bin holds approximately the same number of rows.
    bin_edges = pd.qcut(df[feature], q=num_classes, labels=False, retbins=True, duplicates='drop')[1]

    # duplicates='drop' can merge repeated edges, so the real number of bins may
    # be smaller than `num_classes`; always derive it from the edges themselves.
    n_bins = len(bin_edges) - 1

    # Assign every row a 0-based bin label. include_lowest=True keeps the rows
    # equal to the column minimum (the first edge) inside the first bin instead
    # of turning them into NaN.
    df[feature + '_binned'] = pd.cut(
        df[feature],
        bins=bin_edges,
        labels=range(n_bins),
        include_lowest=True,
    )

    # Record the bounds of each bin that actually exists. Iterating over
    # `n_bins` (not `num_classes`) avoids indexing past the end of `bin_edges`
    # and avoids emitting a fictitious open-ended bin.
    for bin_num in range(n_bins):
        binning_data.append({
            'Feature': feature,
            # 1-based for display; the matching `<feature>_binned` label is `Bin - 1`.
            'Bin': bin_num + 1,
            'Lower Bound': bin_edges[bin_num],
            'Upper Bound': bin_edges[bin_num + 1],
        })

# Explicit columns keep the table well-formed even if no bins were recorded.
binning_df = pd.DataFrame(binning_data, columns=['Feature', 'Bin', 'Lower Bound', 'Upper Bound'])

# Show the bin boundaries for one feature as the cell output.
binning_df[binning_df['Feature']=='RankP1']