In [57]:
from pathlib import Path
import json
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import gc # сборщик мусора
from imputer import FeatureImputer

In [58]:
# --- processing variant -------------------------------------------------
# identifiers of the data take and of the method used at each stage
DATA_TAKE_NO = '01_bal_smote'
IMPUTER_METHOD = '01_nan'
SCALING_METHOD = 'standard'
BALANCE_METHOD = 'smote'

# --- input locations ----------------------------------------------------
RAW_DATA_PATH = Path('./data_raw')
TRAIN_RAW_PATH = RAW_DATA_PATH.joinpath('train.csv')
SCALERS_PATH = Path('./scalers')

# --- output locations (one folder per data take) ------------------------
DATA_PATH = Path('./data/' + DATA_TAKE_NO)
TRAIN_PATH = DATA_PATH.joinpath('train.csv')
TRAIN_TARGET_PATH = DATA_PATH.joinpath('train_target.csv')
# the "test" part is written out as the validation set
TEST_PATH = DATA_PATH.joinpath('validation.csv')
TEST_TARGET_PATH = DATA_PATH.joinpath('validation_target.csv')

In [59]:
# load the feature dictionary; JSON text is UTF-8, so the encoding is given
# explicitly instead of relying on the platform's locale default
with open("feature_dict.json", "r", encoding="utf-8") as f:
    feature_dict = json.load(f)
# fill the feature name lists from the dictionary
NUM_FEATURE_NAMES = feature_dict['NUM_FEATURE_NAMES']
CAT_FEATURE_NAMES = feature_dict['CAT_FEATURE_NAMES']
TARGET_NAME = feature_dict['TARGET_NAME']
# independent copy: editing the selection must never alter NUM_FEATURE_NAMES
SELECTED_FEATURE_NAMES = list(NUM_FEATURE_NAMES)

In [60]:
def do_scaling(df, method, with_save=1):
    """Scale the numeric feature columns of ``df``.

    A fresh scaler is fitted on ``df[NUM_FEATURE_NAMES]`` and a copy of ``df``
    with those columns replaced by the scaled values is returned; the input
    frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``NUM_FEATURE_NAMES``.
    method : {'standard', 'minmax'}
        ``'standard'`` uses ``StandardScaler``, ``'minmax'`` uses
        ``MinMaxScaler``.
    with_save : bool or int, default 1
        When truthy, the fitted scaler is pickled into ``SCALERS_PATH`` as
        ``st_scaler.pkl`` or ``minmax_scaler.pkl``.

    Returns
    -------
    pandas.DataFrame
        Scaled copy of ``df`` with the index and row order of the input.

    Raises
    ------
    AssertionError
        If ``method`` is not one of the supported names.
    """
    assert method in ['standard', 'minmax'], 'Неверный метод масштабирования'
    if method == 'standard':
        scaler = StandardScaler()
        file_name = 'st_scaler.pkl'
    else:
        scaler = MinMaxScaler()
        file_name = 'minmax_scaler.pkl'

    df_scaled = df.copy()
    df_scaled[NUM_FEATURE_NAMES] = scaler.fit_transform(df_scaled[NUM_FEATURE_NAMES])

    if with_save:
        # the target folder may not exist yet on a fresh checkout
        SCALERS_PATH.mkdir(parents=True, exist_ok=True)
        with open(SCALERS_PATH / file_name, 'wb') as file:
            pickle.dump(scaler, file)

    # Rows are deliberately NOT shuffled here: the caller keeps the target in a
    # separate object and pairs it with this frame by position (for example in
    # train_test_split), so reordering the features alone would mismatch
    # features and labels.
    return df_scaled

In [61]:
def do_2class_df_balancing(df, method = 'over', target_name = TARGET_NAME, random_state = None):
    """Balance a two-class data set by resampling.

    Parameters
    ----------
    df : pandas.DataFrame
        Features together with the target column ``target_name``.
    method : {'over', 'under', 'tomek', 'smote'}, default 'over'
        * ``'over'``  - append whole copies of the minority rows; the number of
          copies is ``major_count // minor_count - 1``.
        * ``'under'`` - keep all minority rows and draw the same number of
          majority rows without replacement.
        * ``'tomek'`` - ``imblearn`` ``TomekLinks`` under-sampling.
        * ``'smote'`` - ``imblearn`` ``SMOTE`` over-sampling.
    target_name : str
        Name of the target column.
    random_state : int or None, default None
        Seed for the random steps (majority draw, SMOTE, final shuffle).
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The resampled frame with its rows shuffled.

    Raises
    ------
    AssertionError
        If ``method`` is not one of the supported names.
    """
    assert method in ['over', 'under', 'tomek', 'smote'], 'Неверный метод сэмплирования'

    # value_counts() sorts by frequency in descending order, so the first label
    # is the majority class and the last one the minority class.  Taking the
    # labels from the index (rather than argmax()/argmin(), which return
    # positions) also yields two different classes when the counts are tied.
    target_counts = df[target_name].value_counts()
    major_class_name = target_counts.index[0]
    minor_class_name = target_counts.index[-1]
    major_count = int(target_counts.iloc[0])
    minor_count = int(target_counts.iloc[-1])

    if method == 'over':
        disbalance_coeff = major_count // minor_count - 1
        if disbalance_coeff > 0:
            minor_rows = df[df[target_name] == minor_class_name]
            # one concat instead of growing the frame inside a loop
            df = pd.concat([df] + [minor_rows] * disbalance_coeff, ignore_index=True)

    elif method == 'under':
        minor_rows = df[df[target_name] == minor_class_name]
        major_rows = df[df[target_name] == major_class_name]
        # drawn without replacement so that no majority row is duplicated
        major_rows = major_rows.sample(n=minor_count, random_state=random_state)
        df = pd.concat([minor_rows, major_rows], ignore_index=True)

    elif method == 'tomek':
        # imported lazily: imblearn is only needed for this branch
        from imblearn.under_sampling import TomekLinks
        tl = TomekLinks()
        # fit_resample() is the current name of the former fit_sample()
        X_tomek, y_tomek = tl.fit_resample(df.drop(columns=target_name), df[target_name])
        df = pd.concat([X_tomek, y_tomek], axis=1)

    elif method == 'smote':
        # imported lazily: imblearn is only needed for this branch
        from imblearn.over_sampling import SMOTE
        smote = SMOTE(random_state=random_state)
        X_smote, y_smote = smote.fit_resample(df.drop(columns=target_name), df[target_name])
        df = pd.concat([X_smote, y_smote], axis=1)

    # shuffle so that the appended / synthetic rows are not grouped together
    return df.sample(frac=1, random_state=random_state)

### Из сырого train-набора данных по варианту обработки получить:
 * train-набор
 * validation-набор

In [62]:
# read the raw training set
df = pd.read_csv(TRAIN_RAW_PATH)

In [63]:
# outliers and missing values: handled by the project-local FeatureImputer,
# configured through IMPUTER_METHOD (what each method does is defined in
# imputer.py, which is not shown here)
imputer = FeatureImputer(df, IMPUTER_METHOD)
df = imputer.transform()

In [64]:
# class balancing: keep only the numeric features plus the target, resample,
# then show the resulting class counts
# NOTE(review): resampling is done before the train/validation split further
# down, so the validation part is drawn from the resampled rows as well —
# consider splitting first and balancing only the training part.
df = df[NUM_FEATURE_NAMES + [TARGET_NAME]]
df = do_2class_df_balancing(df, BALANCE_METHOD)
df[TARGET_NAME].value_counts()

1    5387
0    5387
Name: Credit Default, dtype: int64

In [65]:
# separate the balanced frame into the feature matrix and the target vector,
# then drop the combined frame and run the garbage collector
X = df[SELECTED_FEATURE_NAMES]
y = df[TARGET_NAME]
del df
gc.collect()

104

In [66]:
# standardization — currently disabled, so the features are saved unscaled
# X = do_scaling(X, SCALING_METHOD)

In [67]:
# peek at the first two rows of the feature matrix
X.head(2)

Unnamed: 0,Annual Income,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
779,1014144.0,0.0,6.0,18.6,322608.0,0.0,19.0,0.0,99999999.0,268033.0,19100.0,651.0
4816,1168386.0,0.0,9.0,10.5,192390.0,0.0,36.0,0.0,178882.0,109497.0,13315.0,731.0


In [68]:
# split into train and validation parts (30% held out), shuffled with a
# fixed seed and stratified so both parts keep the class proportions of y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    stratify=y,
    random_state=21,
)

In [69]:
# class shares in the train and validation targets; they should be nearly
# identical because the split above is stratified on y
display(y_train.value_counts(normalize=True), y_test.value_counts(normalize=True))

0    0.500066
1    0.499934
Name: Credit Default, dtype: float64

1    0.500155
0    0.499845
Name: Credit Default, dtype: float64

In [70]:
# make sure the output folder of this data take exists; to_csv does not
# create missing directories and would fail on a fresh checkout
DATA_PATH.mkdir(parents=True, exist_ok=True)

# features and targets go to separate files, without the index column
X_train.to_csv(TRAIN_PATH, index=False, header=True)
X_test.to_csv(TEST_PATH, index=False, header=True)
y_train.to_csv(TRAIN_TARGET_PATH, index=False, header=True)
y_test.to_csv(TEST_TARGET_PATH, index=False, header=True)