# Feature Selection

We want to determine which features are the most relevant to display on our site.

## Setup

In [1]:
# !pip install numpy
# !pip install pandas
# !pip install scikit-learn

In [2]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Data Collection

In [3]:
def load_data() -> pd.DataFrame:
    data = pd.read_csv('../data/data.csv')
    data.columns = [col.strip() for col in data.columns]
    return data

In [4]:
df, target = load_data(), 'Bankrupt?'

## Data Cleaning

In [5]:
def data_cleaning(df: pd.DataFrame) -> None:
    # Missing values
    mv = set()
    for col in df:
        num = df[col].isna().sum()
        if num > 0:
            print(f"[!] {num} missing value{'s' if num > 1 else ''} in '{col}'")
            mv.add(col)
    if not mv:
        print('[✓] No missing values')

    # Duplicated values
    num = df.duplicated().sum()
    if num > 0:
        print(f'[!] {num} duplicated values')
    else:
        print('[✓] No duplicated values')

    # Unique values
    uv = set()
    for col in df:
        num = df[col].nunique()
        if num == 1:
            print(f"[!] {num} unique value in '{col}'")
            uv.add(col)
    if not uv:
        print('[✓] No unique values')

In [6]:
data_cleaning(df)

[✓] No missing values
[✓] No duplicated values
[!] 1 unique value in 'Net Income Flag'


In [7]:
df = df.drop('Net Income Flag', axis=1)

In [8]:
data_cleaning(df)

[✓] No missing values
[✓] No duplicated values
[✓] No unique values


## Preprocessing

**Split features and target**

In [9]:
def split_X_y(df: pd.DataFrame, target: str):
    features = [col for col in df.columns if col != target]

    X = df[features]
    y = df[target]

    return X, y

In [10]:
X, y = split_X_y(df, 'Bankrupt?')

categorical_features = X.select_dtypes(include=np.int64).columns
numerical_features = X.select_dtypes(exclude=np.int64).columns

**Split train and test data**

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42, stratify=y)

**Standardize the numeric features**

In [12]:
scaler = ColumnTransformer([('num_scaler', StandardScaler(), numerical_features)], remainder='passthrough')

In [13]:
X_train = scaler.fit_transform(X_train)

**Select the best features**

In [14]:
selection = SelectKBest(score_func=f_classif, k=15).fit(X_train, y_train)

In [15]:
best_features = selection.get_feature_names_out(X.columns).tolist()
best_features

['ROA(C) before interest and depreciation before interest',
 'ROA(A) before interest and % after tax',
 'ROA(B) before interest and depreciation after tax',
 'Persistent EPS in the Last Four Seasons',
 'Per Share Net profit before tax (Yuan ¥)',
 'Debt ratio %',
 'Net worth/Assets',
 'Borrowing dependency',
 'Net profit before tax/Paid-in capital',
 'Working Capital to Total Assets',
 'Current Liability to Assets',
 'Retained Earnings to Total Assets',
 'Current Liability to Current Assets',
 'Liability-Assets Flag',
 'Gross Profit to Sales']