In [1]:
import os
import pandas as pd
import tqdm
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import dill
import warnings


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

warnings.filterwarnings('ignore')

In [2]:
# Target table. Later cells join it to the features on 'id' and take the
# label from its 'flag' column.
target = pd.read_csv("data/train_target.csv")

In [3]:
def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0,
                                     num_parts_to_read: int = 2, columns=None, verbose=False) -> pd.DataFrame:
    """
    Read a window of parquet partitions from a local directory into one DataFrame.

    Only directory entries whose name starts with ``'train'`` are treated as
    partitions; they are ordered by sorting their full paths lexicographically.

    :param path_to_dataset: directory that contains the partition files
    :param start_from: position (in the sorted list) of the first partition to
        read; negative values are clamped to 0
    :param num_parts_to_read: number of consecutive partitions to read
    :param columns: columns to load from each partition (``None`` loads all)
    :param verbose: when True, print the start position and the path of every
        partition that is read; when False nothing is printed
    :return: the selected partitions concatenated row-wise with a fresh 0..n-1 index
    :raises ValueError: if the requested window selects no partition files
    """
    dataset_paths = sorted(
        os.path.join(path_to_dataset, filename)
        for filename in os.listdir(path_to_dataset)
        if filename.startswith('train')
    )

    start_from = max(0, start_from)
    chunks = dataset_paths[start_from: start_from + num_parts_to_read]

    # pd.concat([]) would raise an opaque "No objects to concatenate"; raise the
    # same exception type with a message that points at the actual cause.
    if not chunks:
        raise ValueError(
            f"no 'train*' partitions selected in {path_to_dataset!r} "
            f"(start_from={start_from}, num_parts_to_read={num_parts_to_read}, "
            f"available={len(dataset_paths)})"
        )

    # Progress output is opt-in so that callers looping over many partitions
    # do not flood stdout.
    if verbose:
        print(start_from)
        for chunk in chunks:
            print(chunk)

    frames = [pd.read_parquet(chunk_path, columns=columns) for chunk_path in chunks]
    return pd.concat(frames).reset_index(drop=True)

In [4]:
def reduce_mem_usage(data):
    """Downcast numeric columns of a DataFrame to the narrowest dtype that fits.

    Integer columns are converted to the first of int8/int16/int32/int64 whose
    range contains the column's min and max; floating columns likewise to the
    first of float16/float32/float64. The check is on value range only, so a
    float column that is narrowed may lose precision.

    Columns are left untouched when they are not integer/floating NumPy
    dtypes (object, bool, category and other pandas extension dtypes), or when
    min/max cannot be compared to a range (e.g. an all-NaN column).

    :param data: DataFrame to optimise; its columns are reassigned in place
    :return: the same DataFrame object, so the function can be used both as
        ``reduce_mem_usage(df)`` and as ``df = reduce_mem_usage(df)`` (and as
        the callable of a ``FunctionTransformer``)
    """
    # Candidate dtypes, narrowest first, paired with their representable range.
    int_candidates = [(np.iinfo(t).min, np.iinfo(t).max, t)
                      for t in (np.int8, np.int16, np.int32, np.int64)]
    float_candidates = [(np.finfo(t).min, np.finfo(t).max, t)
                        for t in (np.float16, np.float32, np.float64)]

    for column in data.columns:
        col_type = data[column].dtype

        # np.issubdtype raises TypeError for pandas extension dtypes
        # (category, nullable ints, ...), so skip anything that is not a
        # plain NumPy dtype.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.integer):
            candidates = int_candidates
        elif np.issubdtype(col_type, np.floating):
            candidates = float_candidates
        else:
            continue

        c_min = data[column].min()
        c_max = data[column].max()
        # With NaN min/max every comparison is False, so no dtype is selected
        # and the column keeps its current type.
        dtype = next((t for lo, hi, t in candidates if lo <= c_min and hi >= c_max), None)
        if dtype is not None:
            data[column] = data[column].astype(dtype)

    return data
    
    

In [5]:
def prepare_transactions_dataset(path_to_dataset: str, num_parts_to_preprocess_at_once: int = 1, num_parts_total: int=50,
                                 save_to_path=None, verbose: bool=False, start_df = 0):
    """
    Read the partitioned dataset block by block, downcast each block and return
    all blocks concatenated into a single pd.DataFrame.

    path_to_dataset: str
        path to the directory that holds the partitions
    num_parts_to_preprocess_at_once: int
        number of partitions read and processed together as one block
    num_parts_total: int
        partition index at which reading stops (exclusive upper bound of the loop)
    save_to_path: str
        directory in which every processed block is stored as
        ``processed_chunk_NNN.parquet``; created if missing. If None (or empty),
        nothing is written.
    verbose: bool
        forwarded to read_parquet_dataset_from_local to log each block that is read
    start_df: int
        partition index at which reading starts

    Returns the concatenation of all blocks. The index is not reset here, so it
    restarts for every block. pd.concat raises ValueError when the loop yields
    no blocks (start_df >= num_parts_total).
    """
    if save_to_path:
        # to_parquet does not create missing directories.
        os.makedirs(save_to_path, exist_ok=True)

    preprocessed_frames = []
    for step in range(start_df, num_parts_total, num_parts_to_preprocess_at_once):
        transactions_frame = read_parquet_dataset_from_local(path_to_dataset, step, num_parts_to_preprocess_at_once,
                                                             verbose=verbose)

        # Works with an in-place reduce_mem_usage that returns None as well as
        # with one that returns the (possibly new) reduced frame.
        reduced = reduce_mem_usage(transactions_frame)
        if reduced is not None:
            transactions_frame = reduced

        # Write the prepared block to disk.
        if save_to_path:
            # Fixed-width, zero-padded block number keeps file names in
            # lexicographic order, including three-digit steps.
            chunk_name = f'processed_chunk_{step:03d}.parquet'
            transactions_frame.to_parquet(os.path.join(save_to_path, chunk_name))

        preprocessed_frames.append(transactions_frame)
    return pd.concat(preprocessed_frames)

In [6]:
# Directory passed to prepare_transactions_dataset as save_to_path
# (where processed chunks are written).
save_to_path = 'data/output'

In [7]:
# Directory passed to prepare_transactions_dataset as path_to_dataset
# (where the 'train*' partitions are listed from).
path = 'data'

In [8]:
# Column names 'enc_paym_0' through 'enc_paym_24', in ascending numeric order.
# NOTE(review): this list is not referenced by any other visible cell -- confirm
# whether it is still needed.
coloumns_for_drop = [f'enc_paym_{number}' for number in range(25)]

In [9]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

In [10]:
# Feature names used as the categorical feature set (see cat_features below).
# Order matters: it determines the column order handed to the ColumnTransformer.
data_ohe_list = [
    'pre_till_fclose', 'pre_till_pclose',
    'pre_fterm', 'pre_pterm',
    'pre_since_confirmed', 'pre_since_opened',
    *(f'is_zero_loans{bucket}' for bucket in ('5', '530', '3060', '6090', '90')),
    'is_zero_util', 'is_zero_over2limit', 'is_zero_maxover2limit',
    'pclose_flag', 'fclose_flag',
    *(f'pre_loans{bucket}' for bucket in ('5', '530', '3060', '6090', '90')),
    'enc_loans_account_holder_type', 'enc_loans_credit_status',
    'enc_loans_credit_type', 'enc_loans_account_cur',
    'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
    'pre_loans_outstanding', 'pre_loans_total_overdue',
    'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate',
    'pre_util', 'pre_over2limit', 'pre_maxover2limit',
    # flag_paym0 .. flag_paym24
    *(f'flag_paym{number}' for number in range(25)),
    'is_zero_loans', 'all_is_closed',
]

In [11]:
# cat_features is a second name for the same list object as data_ohe_list
# (no copy is made).
cat_features = data_ohe_list
id_feat = ['id', 'rn']
# New list: the two id columns first, then every categorical feature.
all_feat = [*id_feat, *cat_features]
print(all_feat)

['id', 'rn', 'pre_till_fclose', 'pre_till_pclose', 'pre_fterm', 'pre_pterm', 'pre_since_confirmed', 'pre_since_opened', 'is_zero_loans5', 'is_zero_loans530', 'is_zero_loans3060', 'is_zero_loans6090', 'is_zero_loans90', 'is_zero_util', 'is_zero_over2limit', 'is_zero_maxover2limit', 'pclose_flag', 'fclose_flag', 'pre_loans5', 'pre_loans530', 'pre_loans3060', 'pre_loans6090', 'pre_loans90', 'enc_loans_account_holder_type', 'enc_loans_credit_status', 'enc_loans_credit_type', 'enc_loans_account_cur', 'pre_loans_credit_limit', 'pre_loans_next_pay_summ', 'pre_loans_outstanding', 'pre_loans_total_overdue', 'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_util', 'pre_over2limit', 'pre_maxover2limit', 'flag_paym0', 'flag_paym1', 'flag_paym2', 'flag_paym3', 'flag_paym4', 'flag_paym5', 'flag_paym6', 'flag_paym7', 'flag_paym8', 'flag_paym9', 'flag_paym10', 'flag_paym11', 'flag_paym12', 'flag_paym13', 'flag_paym14', 'flag_paym15', 'flag_paym16', 'flag_paym17', 'flag_paym18', 'flag_pay

In [12]:
from sklearn.compose import ColumnTransformer, make_column_selector

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
from sklearn.pipeline import FeatureUnion

In [15]:
from sklearn.compose import ColumnTransformer

In [16]:
# NOTE(review): wildcard import. The helpers used in the pipelines below
# (new_features, filter_data, encoder, merge_flag, imput_median,
# boundaries_dataframe) are not defined in this notebook, so they presumably
# come from pip_lib. If pip_lib also exports reduce_mem_usage, this line
# replaces the version defined earlier in the notebook -- confirm, and prefer
# explicit `from pip_lib import name, ...` imports.
from pip_lib import *

In [17]:
# First pipeline stage: three function-based steps applied in order.
# FunctionTransformer forwards whatever the wrapped function returns, so each
# function here has to return the transformed frame.
transformer = Pipeline([
    ('new_features', FunctionTransformer(func=new_features)),
    ('filter', FunctionTransformer(func=filter_data)),
    ('reduce_mem_usage1', FunctionTransformer(func=reduce_mem_usage)),
])


In [18]:
# Feature pipeline applied to the selected columns: encode, merge, impute and
# bound the values, with a memory-reduction pass after each heavier step.
# The encoder, merge and imputer steps are configured to emit pandas output.
categorical_transformer = Pipeline([
    ('encoder', FunctionTransformer(func=encoder).set_output(transform="pandas")),
    ('reduce_mem_usage2', FunctionTransformer(func=reduce_mem_usage)),
    ('merge', FunctionTransformer(func=merge_flag).set_output(transform="pandas")),
    ('reduce_mem_usage3', FunctionTransformer(func=reduce_mem_usage)),
    ('imputer', FunctionTransformer(func=imput_median).set_output(transform="pandas")),
    ('boundaries_dataframe', FunctionTransformer(func=boundaries_dataframe)),
    ('reduce_mem_usage4', FunctionTransformer(func=reduce_mem_usage)),
])

In [19]:
# Route the columns listed in all_feat through categorical_transformer.
# No `remainder` is given, so ColumnTransformer's default applies to every
# column that is not listed.
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_transformer, all_feat)],
)

In [20]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

In [21]:
# Gradient-boosting classifier used as the final 'classifier' step of the
# pipeline; random_state is fixed for reproducible fits.
model = LGBMClassifier(class_weight='balanced', n_estimators=700, num_leaves=10, random_state=42,force_col_wise=True)

In [22]:

# Load partitions 0..11 one at a time; every processed chunk is also written
# to save_to_path.
df = prepare_transactions_dataset(path, num_parts_to_preprocess_at_once=1, num_parts_total=12,
                                    save_to_path=save_to_path)

# NOTE(review): the result is re-assigned, so whichever reduce_mem_usage is in
# scope here (the notebook's own or one star-imported from pip_lib) must
# return the frame rather than None -- confirm.
df = reduce_mem_usage(df)
# Left-join the target on 'id'; the 'flag' column it adds is dropped again
# when X is built below.
df = pd.merge(left=df, right=target, on='id', how='left')
df = reduce_mem_usage(df)
#df.to_csv('df.csv',index = False)
X = df.drop(['flag'], axis=1)

# NOTE(review): y comes from `target`, not from the merged `df`. It only lines
# up with the model input if the pipeline steps reduce X to one row per row of
# `target`, in the same order -- confirm against the pip_lib helpers.
y = target['flag']

# Full pipeline: feature construction/filtering -> column preprocessing -> classifier.
pipe = Pipeline(steps=[
        ('transformer', transformer),
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])


pipe.fit(X,y)

    


0
1
2
3
4
5
6
7
8
9
10
11
start enc
stop enc
merged
filled
[LightGBM] [Info] Number of positive: 106442, number of negative: 2893558
[LightGBM] [Info] Total Bins 31104
[LightGBM] [Info] Number of data points in the train set: 3000000, number of used features: 300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [23]:
# Serialize the fitted pipeline with dill, stored under the 'model' key.
# NOTE(review): 'defualt' in the file name looks like a typo of 'default'; it
# is kept unchanged because it is the artifact path other code may load.
with open('risk_of_defualt.pkl', mode='wb') as output_file:
    dill.dump(dict(model=pipe), output_file)

In [None]:
# NOTE(review): X and y are the same data the pipeline was fitted on, so this
# is a training-set ROC AUC, not a held-out estimate of model quality.
score = roc_auc_score(y, pipe.predict_proba(X)[:, 1])

print(f'model: {type(model).__name__}, roc_auc: {score:.4f}')
   