In [1]:
%pip list

Package                 Version
----------------------- -----------
asttokens               3.0.0
colorama                0.4.6
comm                    0.2.2
debugpy                 1.8.13
decorator               5.2.1
executing               2.2.0
ipykernel               6.29.5
ipython                 9.0.2
ipython_pygments_lexers 1.1.1
jedi                    0.19.2
jupyter_client          8.6.3
jupyter_core            5.7.2
lightgbm                4.6.0
matplotlib-inline       0.1.7
nest-asyncio            1.6.0
numpy                   2.2.4
packaging               24.2
pandas                  2.2.3
parso                   0.8.4
pip                     25.0.1
platformdirs            4.3.7
prompt_toolkit          3.0.50
psutil                  7.0.0
pure_eval               0.2.3
Pygments                2.19.1
python-dateutil         2.9.0.post0
pytz                    2025.2
pywin32                 310
pyzmq                   26.4.0
scipy                   1.15.2
six                 

In [1]:
%pip list

Package                 Version
----------------------- -----------
asttokens               3.0.0
colorama                0.4.6
comm                    0.2.2
debugpy                 1.8.13
decorator               5.2.1
executing               2.2.0
ipykernel               6.29.5
ipython                 9.0.2
ipython_pygments_lexers 1.1.1
jedi                    0.19.2
jupyter_client          8.6.3
jupyter_core            5.7.2
lightgbm                4.6.0
matplotlib-inline       0.1.7
nest-asyncio            1.6.0
numpy                   2.2.4
packaging               24.2
parso                   0.8.4
pip                     25.0.1
platformdirs            4.3.7
prompt_toolkit          3.0.50
psutil                  7.0.0
pure_eval               0.2.3
Pygments                2.19.1
python-dateutil         2.9.0.post0
pywin32                 310
pyzmq                   26.4.0
scipy                   1.15.2
six                     1.17.0
stack-data              0.6.3
tornado             

In [2]:
import pandas as pd
import numpy as np
import gc
from typing import List

import warnings

In [3]:
import pandas as pd
import numpy as np
import gc
from typing import List

import warnings
warnings.simplefilter(action = "ignore", category = RuntimeWarning)


def get_train_data(TRAIN_PATH: str) -> pd.DataFrame:
    '''Loads the train dataset from the parquet file at TRAIN_PATH'''
    # pd.read_parquet needs a parquet engine (pyarrow or fastparquet) installed.
    return pd.read_parquet(TRAIN_PATH)


def get_test_data(TEST_PATH: str) -> pd.DataFrame:
    '''Loads the test dataset from the parquet file at TEST_PATH'''
    # pd.read_parquet needs a parquet engine (pyarrow or fastparquet) installed.
    return pd.read_parquet(TEST_PATH)


def get_target(TARGET_PATH: str) -> pd.DataFrame:
    '''Loads the train targets from the CSV file at TARGET_PATH'''
    return pd.read_csv(TARGET_PATH)


def get_train_data_with_target_merged(df_train: pd.DataFrame, df_train_target: pd.DataFrame) -> pd.DataFrame:
    '''Left-joins the target labels onto the train dataset by customer_ID

    Every row of df_train is kept; rows whose customer_ID has no match in
    df_train_target get a missing target.
    '''
    # Counts recorded by the author on the row-level (non-aggregated) data:
    #   df_train_w_target.groupby('target', dropna=False).count()['customer_ID']
    #   target
    #   0    4153582
    #   1    1377869
    #   Name: customer_ID, dtype: int64
    return pd.merge(df_train, df_train_target, how='left', on='customer_ID')


def get_all_features(df: pd.DataFrame) -> List:
    '''Collects the column labels of df, in order, into a list'''
    return [name for name in df.columns]


def get_cat_features() -> List:
    '''Gives the fixed list of column names treated as categorical features'''
    return [
        'B_30',
        'B_38',
        'D_114',
        'D_116',
        'D_117',
        'D_120',
        'D_126',
        'D_63',
        'D_64',
        'D_66',
        'D_68',
    ]


def get_num_features(all_features: List, cat_features: List) -> List:
    '''Returns list of all numerical features from the dataset

    A feature counts as numerical when it is neither listed in cat_features
    nor one of the service columns 'customer_ID', 'S_2' and 'target'.
    The order of all_features is preserved.

    Args:
        all_features: all column names of the dataset.
        cat_features: names of the categorical columns (any iterable of names).

    Returns:
        List of the remaining column names.
    '''
    # Build the exclusion set once instead of re-concatenating a list for
    # every column; set membership also avoids a linear scan per column.
    excluded = set(cat_features) | {'customer_ID', 'S_2', 'target'}

    return [col for col in all_features if col not in excluded]


def get_df_w_aggrs(df: pd.DataFrame, numerical_features: List) ->  pd.DataFrame:
    '''Returns dataframe with generated aggregates based on numerical features

    For every feature f in numerical_features four columns are produced per
    customer: f_min, f_max, f_avg (mean) and f_last (value on the last row of
    the customer's block of rows). The result has one row per customer and a
    leading 'customer_ID' column. The shape of every intermediate frame is
    printed.

    The input frame is left unmodified.

    NOTE: the *_last columns rely on all rows of a customer being adjacent in
    df; "last" means the final row of that block in frame order (presumably
    the most recent statement - confirm the data is sorted by customer_ID
    and S_2).

    Args:
        df: row-level data containing 'customer_ID' and numerical_features.
        numerical_features: names of the columns to aggregate.

    Returns:
        DataFrame with 'customer_ID' plus 4 * len(numerical_features) columns.
    '''
    # Read (rather than pop) the key so the caller's frame stays intact and
    # the function can be re-run on the same object.
    cid = pd.Categorical(df['customer_ID'], ordered=True)

    # Mask for the last statement of every customer: a row is "last" when the
    # next row carries a different customer code. The final row is always
    # "last" (no wrap-around), so a frame whose first and last rows share a
    # customer still gets that customer's last row.
    codes = cid.codes
    last = np.ones(len(codes), dtype=bool)
    last[:-1] = codes[1:] != codes[:-1]

    # Select the numeric columns before aggregating so that S_2 and the
    # categorical columns are never aggregated (and no frame copy is needed).
    # observed=True: every category comes from the data itself, so the result
    # is unchanged while the pandas default-change warning is avoided.
    grouped = df.groupby(cid, observed=True)[numerical_features]

    frames = []
    for suffix, how in (('min', 'min'), ('max', 'max'), ('avg', 'mean')):
        df_stat = (getattr(grouped, how)()
            .rename(columns={f: f"{f}_{suffix}" for f in numerical_features})
        )
        print(df_stat.shape)
        frames.append(df_stat)

    df_last = (df
        .loc[last, numerical_features]
        .rename(columns={f: f"{f}_last" for f in numerical_features})
        .set_index(np.asarray(cid[last]))
    )
    print(df_last.shape)
    frames.append(df_last)

    # The unnamed index becomes a column called 'index' after reset_index().
    df_aggrs = (pd.concat(frames, axis=1)
        .reset_index()
        .rename(columns={'index': 'customer_ID'})
    )
    print(df_aggrs.shape)

    return df_aggrs


def check_zapolnenie(df: pd.DataFrame) -> pd.DataFrame:
    '''Builds a per-column completeness report for df

    The result has one row per column of df with the columns 'column_name'
    and 'isNotNullShare' (percentage of non-null values, 0-100), ordered from
    the most to the least filled column.
    '''
    # One [name, percentage] record per column, in the frame's column order.
    records = [
        [name, np.mean(df[name].notnull()) * 100]
        for name in df.columns
    ]
    report = pd.DataFrame(records, columns=['column_name', 'isNotNullShare'])
    return report.sort_values(by='isNotNullShare', ascending=False)

In [4]:
# Load the row-level train data.
# NOTE: pd.read_parquet needs a parquet engine (pyarrow or fastparquet) in the
# kernel's environment; without one this cell fails with ImportError.
df_train = get_train_data(TRAIN_PATH='./data/train.parquet')

all_features = get_all_features(df_train)
cat_features = get_cat_features()
num_features = get_num_features(all_features, cat_features)
# len(all_features), len(cat_features), len(num_features) -> (190, 11, 178)
# NOTE(review): 190 - 11 categorical - customer_ID - S_2 would give 177, not
# 178 - the count above looks stale; confirm by re-running.

# Aggregate to one row per customer, then attach the labels.
# df_train is rebound to the aggregated + labelled frame; later cells rely on
# that name.
df_train_agg = get_df_w_aggrs(df=df_train, numerical_features=num_features)
df_train_target = get_target(TARGET_PATH='./data/train_labels.csv')
df_train = get_train_data_with_target_merged(df_train=df_train_agg, df_train_target=df_train_target)

# Class balance recorded by the author after aggregation (kept as comments so
# the cell does not echo a stray string as its output):
#   df_train.target.value_counts()
#   target
#   0    340085
#   1    118828
#   Name: count, dtype: int64

# df_test = get_test_data(TEST_PATH='./data/test.parquet')
# df_test = get_df_w_aggrs(df=df_test, numerical_features=num_features)

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [3]:
# (rows, columns) of the frame currently bound to df_train.
df_train.shape

(458913, 710)

In [4]:
def get_amex_metric_calculated(y_true: np.array, y_pred: np.array) -> float:
    '''Calculates the metric 0.5 * (G + D) for binary labels and scores

    D is the share of all positives that fall into the top 4% of rows (by
    cumulative normalized weight) when rows are sorted by descending
    prediction. G is the weighted Gini coefficient divided by its maximum
    value. Negatives (label 0) carry weight 20, positives (label 1) weight 1.

    Inputs are converted with np.asarray, so lists and pandas Series with any
    index are handled positionally.

    Args:
        y_true: 0/1 labels, one per row.
        y_pred: predicted scores, higher meaning more likely positive.

    Returns:
        The metric as a float. If y_true holds no positives or no negatives
        the ratios divide by zero and the result is not a meaningful score.

    Raises:
        ValueError: if y_true and y_pred differ in length.
    '''
    # Work on flat numpy arrays: indexing a pandas Series with the argsort
    # result would select by label, which is only correct for a default index.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting by descending prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient (value of `gini` for a perfect ranking)
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return float(0.5 * (g + d))

In [6]:
# Sanity check: score the labels against themselves, i.e. a perfect ranking.
print(get_amex_metric_calculated(y_true=df_train.target, y_pred=df_train.target))

1.0000000000015785


In [7]:
from lightgbm import LGBMClassifier, log_evaluation

ModuleNotFoundError: No module named 'lightgbm'

In [5]:
# Preview of the target column of df_train.
df_train.target

0         0
1         0
2         0
3         0
4         0
         ..
458908    0
458909    0
458910    0
458911    1
458912    0
Name: target, Length: 458913, dtype: int64