# Feature Engineering

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
pd.options.mode.chained_assignment = None  # to silence verbose warnings

In [2]:
# Read the two processed constituency tables from the working directory.
dcca_11c = pd.read_csv('dcca_11c_processed.csv')
dcca_16bc = pd.read_csv('dcca_16bc_processed.csv')

# Label vocabularies for the election-winner columns: one at party level,
# one at bloc level.
pol_parties = [
    'ADPL', 'BPA', 'CIV', 'CivP', 'CivPass', 'DAB', 'DP', 'Demosisto',
    'FTU', 'IND', 'LAB', 'LIB', 'LSD', 'NPP', 'NWSC', 'NeoDem', 'PathDem',
    'PeoP', 'Young',
]
bloc = ['CEN', 'DEM', 'EST', 'IND', 'LOC']

# Fit one encoder per vocabulary so winner labels can be mapped to integers.
le_legco_winner_party, le_legco_winner_bloc = (
    LabelEncoder().fit(labels) for labels in (pol_parties, bloc)
)

## Delete redundant features

In [3]:
# --- Disabled experiment (kept for reference, not executed) -----------------
# Drops every column whose name starts with one of the listed prefixes (or
# ends with one of the suffixes) and indexes the frames by 'dcca_class'.
#
# similar_features_prefix = ['age_', 'ethn_', 'ms_', 'mearn_xfdh_', 'mearn_xfdhfw_', 'wp_', 'nwp_', 'plw_', 'pls_', 'dhz_', 'dhc_', 'dhi_', 'dhr_', 'oq_', 'dh_', 'pop_', 'dhm_', 'pm_']
# # similar_features_suffix = ['_m', '_f']

# def remove_features(df, prefix=None, suffix=None):
#     filter_bool = pd.Series([True] * len(df.columns), index=df.columns)
#     if prefix is not None:
#         for pre in prefix:
#             filter_bool &= ~df.columns.str.startswith(pre)
#     if suffix is not None:
#         for suf in suffix:
#             filter_bool &= ~df.columns.str.endswith(suf)
#     return filter_bool

# filter_bool = remove_features(dcca_11c, similar_features_prefix)
# dcca_11c = dcca_11c.loc[:, filter_bool].set_index('dcca_class')
# dcca_16bc = dcca_16bc.loc[:, filter_bool].set_index('dcca_class')
# -----------------------------------------------------------------------------

# Live step: index both frames by the 'ca_chi' column; no columns are dropped.
dcca_11c, dcca_16bc = (
    frame.set_index('ca_chi') for frame in (dcca_11c, dcca_16bc)
)

## Create inferred features

In [4]:
# Numeric weight assigned to each education-level column. The hierarchical
# levels are mapped to numbers so that a weighted average can be computed.
EDU_LEVEL_WEIGHTS = {
    'edu_prepri': 0.5,
    'edu_pri': 1.,
    'edu_lsec': 1.5,
    'edu_usec': 2.,
    'edu_dip': 2.33,
    'edu_sub': 2.67,
    'edu_deg': 3.,
}


def add_edu_score(df, weights=EDU_LEVEL_WEIGHTS):
    """Collapse the ``edu_*`` columns of ``df`` into a single ``edu_score``.

    The score of each row is the weighted sum of the columns named in
    ``weights`` divided by the row total of *all* columns whose name starts
    with ``'edu_'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``weights``.
    weights : dict, optional
        Mapping of column name to numeric weight.

    Returns
    -------
    pandas.DataFrame
        A new frame without any ``edu_*`` input column and with ``edu_score``
        appended as the last column. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If a column named in ``weights`` is missing, e.g. when this step is
        run a second time on a frame that was already converted.
    """
    missing = [col for col in weights if col not in df.columns]
    if missing:
        raise KeyError(f"education columns not found: {missing}")

    is_edu = df.columns.str.startswith('edu_')
    edu = df.loc[:, is_edu]
    # Terms are accumulated in the dict's insertion order.
    weighted_total = sum(edu[col] * weight for col, weight in weights.items())
    edu_score = weighted_total / edu.sum(axis=1)
    # assign() builds a new frame, so no value is ever set on a slice.
    return df.loc[:, ~is_edu].assign(edu_score=edu_score)


# Apply the same transformation to both data sets.
dcca_11c = add_edu_score(dcca_11c)
dcca_16bc = add_edu_score(dcca_16bc)

## Select the Best Features (chi-squared score >= 3000 among the top 37)

In [6]:
# Feature matrix: every column of dcca_11c except the two winner columns.
X = dcca_11c.drop(columns=['legco2012_winner', 'legco2016_winner'])

# Target: the 2012 winner encoded as integers. The party-level encoder is
# tried first; if it raises ValueError, the bloc-level encoder is used instead.
winner_2012 = dcca_11c['legco2012_winner']
try:
    y = le_legco_winner_party.transform(winner_2012)
except ValueError:
    y = le_legco_winner_bloc.transform(winner_2012)

In [7]:
# Fit the selector only to obtain a chi-squared score per feature; its own
# top-k selection is not used.
best_feats = SelectKBest(score_func=chi2)
best_feats.fit(X, y)

# Pair each feature name with its score.
feature_scores = pd.DataFrame({
    'features': X.columns,
    'score': best_feats.scores_,
})

# Keep the 37 highest-scoring rows, then only those scoring at least 3000.
score = feature_scores.nlargest(37, 'score')
chosen_feats = score.loc[score['score'] >= 3000, 'features'].tolist()

# The two winner columns are always carried along with the chosen features.
chosen_feats.extend(['legco2012_winner', 'legco2016_winner'])

## Saving the engineered data set

In [8]:
# Write the selected columns of each frame to CSV, keeping the row index.
# `index` is a boolean switch; the header text for the index column is set
# with `index_label`. (Passing the string 'ca_chi' as `index` only worked
# because a non-empty string is truthy.)
dcca_11c[chosen_feats].to_csv('dcca_11c_engineered.csv', index=True, index_label='ca_chi')

# The dcca_16bc export omits the 2012 winner column. Note that this mutates
# `chosen_feats` in place, so this cell is not safe to run twice.
chosen_feats.remove('legco2012_winner')
dcca_16bc[chosen_feats].to_csv('dcca_16bc_engineered.csv', index=True, index_label='ca_chi')