In [41]:
import numpy as np
import pandas as pd
import os
# List every file under /kaggle/input so the dataset paths used by later cells
# can be checked against what is actually mounted.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from sklearn.metrics import auc, roc_auc_score


/kaggle/input/weights/for_different.npy
/kaggle/input/tabular-playground-series-oct-2021/sample_submission.csv
/kaggle/input/tabular-playground-series-oct-2021/train.csv
/kaggle/input/tabular-playground-series-oct-2021/test.csv


In [42]:
import datatable as dt
def data_load(
    data_dir: str = "/kaggle/input/tabular-playground-series-oct-2021/",
) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """Read the train, test and sample-submission CSV files.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``train.csv``, ``test.csv`` and
        ``sample_submission.csv``. Defaults to the Kaggle input folder that
        this notebook was written against.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame, pd.DataFrame)
        ``(train, test, sub)``, in that order.
    """
    # os.path.join works whether or not `data_dir` ends with a separator.
    # Alternative reader left disabled by the author: dt.fread(path).to_pandas()
    train = pd.read_csv(os.path.join(data_dir, "train.csv"))
    test = pd.read_csv(os.path.join(data_dir, "test.csv"))
    sub = pd.read_csv(os.path.join(data_dir, "sample_submission.csv"))

    return train, test, sub

# Bind the three tables returned by data_load() to notebook-level names used by all later cells.
train, test, sub = data_load()

In [43]:
def reduce_memory_use(df: pd.DataFrame, verbose: bool=True) -> pd.DataFrame:
    """Downcast numeric columns of ``df`` to the smallest dtype that fits them.

    Signed-integer columns are cast to the first of int8/int16/int32/int64
    whose range contains the column's min and max; float columns likewise to
    the first of float16/float32/float64. Other dtypes are left untouched.

    Note that the float downcast is lossy: float16/float32 keep fewer
    significant digits than float64, only the value *range* is checked.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to shrink. Its columns are replaced in place.
    verbose : bool, optional
        When True, print the total memory (in MiB) before and after.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, returned regardless of ``verbose``.
    """
    num_col = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32, np.float64)

    start_memory_usage = df.memory_usage(deep=True).sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in num_col:
            continue

        col_min = df[col].min()
        col_max = df[col].max()
        if str(col_type).startswith("int"):
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        # Candidates are ordered narrowest first, so the first fit is the
        # smallest one. Bounds are inclusive: the type limits themselves are
        # representable. All-NaN or infinite extremes match nothing and the
        # column keeps its dtype.
        for candidate in candidates:
            limits = limits_of(candidate)
            if limits.min <= col_min and col_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_memory_usage = df.memory_usage(deep=True).sum() / 1024 ** 2
    if verbose:
        print(f"Суммарно памяти до: {start_memory_usage}", end="\n")
        print("Суммарно памяти после: ", end_memory_usage)

    # Returned outside the `verbose` branch so a silent call does not yield None.
    return df
    
# Downcast both frames; each call prints the frame's total memory before and
# after, in MiB (memory_usage bytes / 1024**2).
print("TRAIN", end="\n")
train = reduce_memory_use(train)
print("TEST", end="\n")
test = reduce_memory_use(test)
    

TRAIN
Суммарно памяти до: 2189.6363525390625
Суммарно памяти после:  505.447509765625
TEST
Суммарно памяти до: 1091.0035400390625
Суммарно памяти после:  252.24697875976562


In [44]:
def devide_X_y(train_df: pd.DataFrame, test_df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.Series, pd.DataFrame]":
    """Split the training frame into features and target, and strip ids.

    Parameters
    ----------
    train_df : pd.DataFrame
        Training data; must contain a ``'target'`` column.
    test_df : pd.DataFrame
        Test data.

    Returns
    -------
    tuple of (pd.DataFrame, pd.Series, pd.DataFrame)
        ``(X, y, test)``: training features without ``'target'``/``'id'``,
        the ``'target'`` column, and the test features without ``'id'``.
        The inputs themselves are not modified.

    Raises
    ------
    KeyError
        If ``train_df`` has no ``'target'`` column.
    """
    y = train_df['target']
    # A missing 'id' is tolerated so the function can be re-applied to frames
    # it already produced (the notebook rebinds `test` to the id-less result).
    X = train_df.drop(columns=['target', 'id'], errors='ignore')
    test = test_df.drop(columns=['id'], errors='ignore')

    return X, y, test

# Note: this rebinds `test` to the feature-only frame (without 'id'); `train` itself is left unchanged.
X, y, test = devide_X_y(train ,test)
    

In [45]:
# Load the precomputed array shipped in the attached `weights` dataset.
# multiply_weights() below treats its first len(X) entries as predictions p
# and maps them to 1/p - 1; how the array was produced is not shown in this notebook.
weights = np.load('/kaggle/input/weights/for_different.npy')
weights

array([0.72135168, 0.66652727, 0.69653423, ..., 0.62813161, 0.62210972,
       0.60417259])

In [46]:
def multiply_weights(weights: np.ndarray, n_train=None) -> np.ndarray:
    """Turn stored predictions into mean-normalised sample weights.

    The first ``n_train`` entries ``p`` of ``weights`` are mapped to
    ``1 / p - 1`` and then divided by their mean, so the result averages 1.

    Parameters
    ----------
    weights : np.ndarray
        Array of predictions; only the first ``n_train`` values are used.
        NOTE(review): presumably probabilities in (0, 1] - a value of 0 would
        produce an infinite weight; confirm against how the file was built.
    n_train : int, optional
        Number of leading entries that belong to the training rows. When
        omitted, falls back to ``len(X)`` of the notebook-level ``X`` (the
        original behaviour).

    Returns
    -------
    np.ndarray
        New array of length ``n_train``; the input array is not modified.
    """
    if n_train is None:
        # Backward-compatible default: rely on the notebook's global training frame.
        n_train = len(X)

    pred_train = weights[:n_train]
    sample_weights = (1. / pred_train) - 1.
    sample_weights /= np.mean(sample_weights)

    return sample_weights

In [47]:
 # NOTE(review): rebinds `weights` to its transformed version, so running this cell twice
 # transforms the values twice; re-run the np.load cell before re-running this one.
 weights = multiply_weights(weights)

In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
def model_rfc(X: pd.DataFrame, test: pd.DataFrame, y: pd.Series, weights: np.ndarray, n_splits=5, random_state=1) -> "tuple[np.ndarray, list]":
    """Cross-validated, sample-weighted RandomForest with averaged test predictions.

    For every stratified fold a ``RandomForestClassifier(max_depth=5)`` is
    fitted on the training part using the matching sample weights, scored by
    ROC AUC on the held-out part, and its test-set probabilities are added to
    a running average.

    Parameters
    ----------
    X : pd.DataFrame
        Training features.
    test : pd.DataFrame
        Test features to predict.
    y : pd.Series
        Training target, aligned row-by-row with ``X``.
    weights : array-like
        One sample weight per row of ``X``, matched by position.
    n_splits : int, optional
        Number of stratified folds.
    random_state : int, optional
        Seed used for both the fold shuffling and the forests, so repeated
        runs give the same AUCs and predictions. The default keeps the fold
        assignment of the original code.

    Returns
    -------
    tuple of (np.ndarray, list)
        ``(rfc_prediction, AUC)``: mean over folds of the last-column class
        probability for each test row, and the per-fold validation ROC AUC.
    """
    rfc_prediction = np.zeros(test.shape[0])
    AUC = []
    # Wrapped once, outside the loop, and under a new name so the `weights`
    # argument is not rebound on every fold.
    weight_series = pd.Series(weights)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for trn_idx, val_idx in tqdm(skf.split(X, y), total=n_splits):
        x_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
        x_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]
        rfc = RandomForestClassifier(n_jobs=-1, max_depth=5, random_state=random_state)
        rfc.fit(x_train,
                y_train,
                sample_weight=weight_series.iloc[trn_idx]
                )
        # Each fold contributes 1/n_splits of the final test prediction.
        rfc_prediction += rfc.predict_proba(test)[:, -1] / n_splits
        AUC.append(roc_auc_score(y_valid, rfc.predict_proba(x_valid)[:, -1]))

    return rfc_prediction, AUC

In [65]:
# Results recorded by the author from a previous run of this cell:
#   fold AUCs: [0.8380873673564039, 0.8378374448212519, 0.8374627953687441, 0.83885619107979, 0.8390132342275519]
#   mean AUC:  0.8382514065707484
#   score = 0.84143 (presumably the leaderboard score - TODO confirm)
rfc_prediction, AUC = model_rfc(X, test, y, weights, n_splits=5)
from statistics import mean
print(AUC, end='\n')
# The printed label is Russian for "Mean AUC value".
print("Среднее значение AUC: ", mean(AUC))

5it [18:02, 216.47s/it]

[0.8380873673564039, 0.8378374448212519, 0.8374627953687441, 0.83885619107979, 0.8390132342275519]
Среднее значение AUC:  0.8382514065707484





In [None]:
def to_csv_file(sub: pd.DataFrame, *args: np.ndarray, filename: str = 'Baseline_3.csv') -> pd.DataFrame:
    """Blend prediction vectors into ``sub['target']`` and save the frame as CSV.

    Parameters
    ----------
    sub : pd.DataFrame
        Submission frame. Its ``'target'`` column is overwritten in place.
    *args : np.ndarray
        One or more prediction vectors, each with one value per row of
        ``sub``; they are averaged element-wise with equal weight.
    filename : str, optional
        Path of the CSV to write (keyword-only). Defaults to the file name
        the original code always wrote.

    Returns
    -------
    pd.DataFrame
        The same ``sub`` object, with the blended ``'target'`` column.

    Raises
    ------
    ValueError
        If no prediction vector is given.
    """
    if not args:
        raise ValueError("to_csv_file needs at least one prediction array")

    # Each vector becomes a column; the row-wise mean is the blend.
    sub['target'] = np.column_stack(args).mean(axis=1)
    sub.to_csv(filename, index=False)

    return sub

# Write the RandomForest-only submission. `baseline_3` is the same object as `sub`,
# whose 'target' column to_csv_file overwrites in place.
baseline_3 = to_csv_file(sub, rfc_prediction)
# score = 0.84143 (recorded by the author; presumably the leaderboard score - TODO confirm)

In [83]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
def model_cat(X: pd.DataFrame, test: pd.DataFrame, y: pd.Series, n_splits=10) -> list:
    """Cross-validate a default CatBoostClassifier and average its test predictions.

    A ``CatBoostClassifier()`` with default constructor arguments is fitted
    on each stratified fold (shuffled, ``random_state=1``), scored by ROC AUC
    on the held-out rows, and its last-column test probabilities are
    accumulated with weight ``1 / n_splits``.

    Returns ``(cat_prediction, AUC)``: the averaged test probabilities and
    the list of per-fold validation AUCs. Note that this is a 2-tuple even
    though the signature is annotated ``-> list``.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)
    fold_scores = []
    test_proba = np.zeros(test.shape[0])

    for fit_rows, holdout_rows in tqdm(splitter.split(X, y)):
        features_fit, target_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        features_holdout, target_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        booster = CatBoostClassifier()
        booster.fit(features_fit, target_fit, verbose=0)

        # Running mean over folds of the test-set probability.
        test_proba += booster.predict_proba(test)[:, -1] / n_splits

        holdout_proba = booster.predict_proba(features_holdout)[:, -1]
        fold_scores.append(roc_auc_score(target_holdout, holdout_proba))

    return test_proba, fold_scores

In [71]:
# Train the 10-fold CatBoost model (the recorded output below shows roughly 1h54m for the run).
# `AUC` is rebound here, replacing the RandomForest fold scores from the earlier cell.
cat_prediction, AUC = model_cat(X, test, y, n_splits=10)

from statistics import mean
print(AUC, end='\n')
# The printed label is Russian for "Mean AUC value".
print("Среднее значение AUC: ", mean(AUC))

10it [1:54:10, 685.01s/it]

[0.8548804402271748, 0.8530394320590706, 0.8566705611464069, 0.8539963621781063, 0.8532784610886341, 0.854513350719504, 0.8564033220613015, 0.8568425924661333, 0.8555991161201454, 0.8553218246645937]
Среднее значение AUC:  0.8550545462731071





In [73]:
# NOTE(review): this cell re-defines to_csv_file, which is already defined in the
# RandomForest section above; keep a single definition when tidying the notebook.
def to_csv_file(sub: pd.DataFrame, *args: np.ndarray, filename: str = 'Baseline_3.csv') -> pd.DataFrame:
    """Blend prediction vectors into ``sub['target']`` and save the frame as CSV.

    Parameters
    ----------
    sub : pd.DataFrame
        Submission frame. Its ``'target'`` column is overwritten in place.
    *args : np.ndarray
        One or more prediction vectors, each with one value per row of
        ``sub``; they are averaged element-wise with equal weight.
    filename : str, optional
        Path of the CSV to write (keyword-only). Defaults to the file name
        the original code always wrote.

    Returns
    -------
    pd.DataFrame
        The same ``sub`` object, with the blended ``'target'`` column.

    Raises
    ------
    ValueError
        If no prediction vector is given.
    """
    if not args:
        raise ValueError("to_csv_file needs at least one prediction array")

    # Each vector becomes a column; the row-wise mean is the blend.
    sub['target'] = np.column_stack(args).mean(axis=1)
    sub.to_csv(filename, index=False)

    return sub

# NOTE(review): this call does not choose a file name, so it writes 'Baseline_3.csv' again,
# overwriting any file produced by the earlier RandomForest cell. `baseline_4` is also the
# same object as `sub` (and therefore as `baseline_3`), since to_csv_file returns its argument.
baseline_4 = to_csv_file(sub, cat_prediction)
# score = 0.85583 (recorded by the author; presumably the leaderboard score - TODO confirm)