# Imports

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from catboost.metrics import TotalF1



# Constants

In [None]:
# Label names collected under TARGETS, one entry per line.
TARGETS = [
    'main_screen',
    'invest',
    'statement',
    'phone_money_transfer',
    'own_transfer',
    'credit_info',
    'chat',
    'card2card_transfer',
    'mobile_recharge',
    'card_recharge',
]

# Data Load

In [None]:
# Read the training targets and order the rows by the "timestamp" column.
train_target = pd.read_csv("alfabattle2_abattle_train_target.csv").sort_values("timestamp")
# Notebook display: first rows of the sorted frame.
train_target.head()

In [None]:
# Summary statistics of the number of non-null session ids per client.
train_target.groupby("client_pin")["session_id"].count().describe()

In [None]:
# Number of non-null session ids per client; this is what the box plot shows.
sessions_per_client = train_target.groupby("client_pin")["session_id"].count()

plt.figure(figsize=(40, 3))
# Ticks every 15 below 100, then every 100 up to 1900.
plt.gca().set_xticks(list(range(0, 100, 15)) + list(range(100, 2000, 100)))
# Pass the values as `x=` explicitly. A positional argument is interpreted as
# wide-form `data` by seaborn >= 0.12 and drawn vertically, which does not
# match the wide figure and the x ticks configured above.
sns.boxplot(x=sessions_per_client)

# Data Train/Val split

In [None]:
# Notebook display: first rows of train_target.
train_target.head()

In [None]:
class ExtractorLastNTarget():
    """Feature generator that adds the previous ``num`` values of a column.

    For every row, column ``f_last_<i>`` (``i`` = 1..``num``) holds the value
    that ``target`` had ``i`` rows earlier within the same ``by`` group, in
    the current row order of the frame. Positions with no earlier row are
    filled with the string ``"NaN"``.
    """

    def __init__(self, by: str, target: str, num: int=10):
        """
        Parameters
        ----------
        by: str
         column used to group the rows
        target: str
         column whose earlier values are copied into the new features
        num: int
         how many lagged columns to create
        """
        self.num = num
        self.by = by
        self.target = target

    def __call__(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with ``f_last_1`` .. ``f_last_<num>`` added.

        The input frame is not modified.
        """
        X_new = X.copy()
        # Build the group-by once; every lag is a shift of the same selection.
        grouped_target = X_new.groupby(self.by)[self.target]
        for lag in range(1, self.num + 1):
            # fillna (instead of replacing np.NaN) works on NumPy 2.x, where
            # the np.NaN alias no longer exists.
            X_new[f"f_last_{lag}"] = grouped_target.shift(lag).fillna("NaN")
        return X_new

    def __repr__(self):
        return f"LastNTarget {self.num}"
    
    
class ExtractorClientTargetFreq():
    """Feature generator adding per-client label frequencies "till the moment".

    For every row it adds

    * ``f_total``   - number of earlier rows of the same client;
    * ``f_freq_<t>`` - share of those earlier rows whose ``column`` value is
      ``t`` (0 when the client has no earlier rows),

    for each distinct value ``t`` of ``column``. Only rows strictly before the
    current one are counted, so the features do not depend on the current
    row's own label.
    """

    def __init__(self, column: str, index: str = "client_pin"):
        """
        Parameters
        ----------
        column: str
         column holding the label whose frequencies are computed
        index: str
         column identifying the client (group key)
        """
        self.column = column
        self.index = index

    def __call__(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with ``f_total`` and ``f_freq_*`` columns added."""
        X_new = X.copy()
        client_freq, columns = self.prepare_user_freq_matrix(X_new)
        X_new[columns] = client_freq[columns]
        return X_new

    def prepare_user_freq_matrix(self, X: pd.DataFrame) -> tuple:
        """Compute the running counts and frequencies, in current row order.

        Returns
        -------
        tuple
         ``(frame, columns_names)``: a copy of ``X`` extended with the helper
         columns ``count_per_target`` and ``cum_<t>`` plus the feature
         columns, and the list of feature column names
         (``"f_total"`` followed by one ``"f_freq_<t>"`` per label).
        """
        X_new = X.copy()
        client_key = X_new[self.index]
        # cumcount is 0-based, i.e. the number of earlier rows in the group.
        X_new["f_total"] = pd.to_numeric(
            X_new.groupby(self.index).cumcount(), downcast="unsigned")
        X_new["count_per_target"] = pd.to_numeric(
            X_new.groupby([self.index, self.column]).cumcount(), downcast="unsigned")
        columns_names = ["f_total"]
        for t in X_new[self.column].unique():
            is_t = (X_new[self.column] == t).astype("int64")
            # Running count of label t per client, minus the current row's own
            # contribution. Forward-filling counts taken only at rows of
            # class t (the previous approach) lagged every other class by one
            # occurrence, which revealed the current row's label.
            X_new[f"cum_{t}"] = is_t.groupby(client_key).cumsum() - is_t
            # 0 / 0 on a client's first row yields NaN; report it as 0.
            X_new[f"f_freq_{t}"] = X_new[f"cum_{t}"].div(X_new["f_total"]).fillna(0)
            columns_names.append(f"f_freq_{t}")
        return X_new, columns_names

    def __repr__(self):
        return "ClientTargetFreq"
    
    
class FilterMinTotal():
    """Keep only the rows whose ``target`` value is at least ``filter_min_target``.

    Some ``index`` groups (clients) may have no row reaching the threshold.
    With ``remove_index=True`` they disappear from the result entirely.
    With ``remove_index=False`` (default) each of them is kept, represented
    by its row(s) holding the group's maximum ``target`` value.
    """

    def __init__(self, index: str="client_pin", target: str="f_total", filter_min_target: int=20, remove_index: bool=False):
        """
        Parameters
        ----------
        index: str
         column identifying the group (client)
        target: str
         column compared against the threshold
        filter_min_target: int
         minimum ``target`` value a row needs to be kept
        remove_index: bool
         whether groups without any qualifying row are dropped completely
        """
        self.index = index
        self.target = target
        self.filter_min_target = filter_min_target
        self.remove_index = remove_index

    def __call__(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return the filtered rows of ``X``; the input frame is not modified."""
        X_copy = X.copy()
        reaches_threshold = X_copy[self.target] >= self.filter_min_target
        X_cleared = X_copy[reaches_threshold]
        # Groups present in the input but absent after thresholding.
        removed_count = (X_copy[self.index].nunique(dropna=False)
                         - X_cleared[self.index].nunique(dropna=False))
        if removed_count == 0:
            return X_cleared
        if self.remove_index:
            print(f"{self.__class__.__name__}: {removed_count} clients were removed")
            return X_cleared
        # transform("max") broadcasts each group's maximum back onto its rows.
        # A plain .max() is indexed by the group key and would not align with
        # the row index, so no row would ever be recognised as a group's maximum.
        group_max = X_copy.groupby(self.index)[self.target].transform("max")
        X_filtered = X_copy[reaches_threshold | (group_max == X_copy[self.target])]
        print(f"{self.__class__.__name__}: {removed_count} clients were removed and merged back")
        return X_filtered

    def __repr__(self):
        return f"FilterMinTotal: \n {self.target} >= {self.filter_min_target} by={self.index}"
        
def create_feature(X: pd.DataFrame, feature_gen_list: list) -> pd.DataFrame:
    """ Run a chain of feature generators over a copy of the data

    Attributes
    ----------
    X: pd.DataFrame
     initial data; left unmodified
    feature_gen_list: list
     callables taking a DataFrame and returning a DataFrame,
     applied one after another in list order

    Returns
    -------
    pd.DataFrame
     output of the last generator (a plain copy of X for an empty list)
    """
    featured = X.copy()
    for generator in feature_gen_list:
        # Announce each step using the generator's repr.
        print(f"Adding feature with {generator}:-------")
        featured = generator(featured)
    return featured

def filter_feature(X: pd.DataFrame, feature_filter_list: list) -> pd.DataFrame:
    """ Run a chain of filters over a copy of the data

    Each entry of feature_filter_list is a callable taking a DataFrame and
    returning a DataFrame; its repr is printed before it is applied.
    X itself is left unmodified.
    """
    remaining = X.copy()
    for row_filter in feature_filter_list:
        print(f"{row_filter}")
        remaining = row_filter(remaining)
    return remaining

In [None]:
# Feature generators; create_feature applies them in list order.
last_targets_extractor = ExtractorLastNTarget(
    by="client_pin", target="multi_class_target", num=30)
client_freq_extractor = ExtractorClientTargetFreq(column="multi_class_target")
feature_generator = [last_targets_extractor, client_freq_extractor]

train_target_featured = create_feature(X=train_target, feature_gen_list=feature_generator)

In [None]:
# No filter is active: the only candidate entry is commented out.
feature_generator_filter = [
    #FilterMinTotal(target="f_total", filter_min_target=20)
]

train_target_featured_filtered = filter_feature(
    X=train_target_featured,
    feature_filter_list=feature_generator_filter,
)

In [None]:
# Notebook display of the featured (and filtered) training frame.
train_target_featured_filtered

In [None]:
# Notebook display of the column names of the featured training frame.
train_target_featured_filtered.columns

# Learn a model

In [None]:
# Model inputs: every column whose name starts with "f_".
features_list = [
    name
    for name in train_target_featured_filtered.columns
    if name.startswith("f_")
]

# Columns named "f_last*" are the ones later passed to CatBoost as categorical.
cat_features = [name for name in features_list if name.startswith("f_last")]

# Fixed list of the numeric feature names.
num_features = ['f_total'] + [
    f"f_freq_{label}"
    for label in (
        'mobile_recharge',
        'statement',
        'phone_money_transfer',
        'chat',
        'invest',
        'main_screen',
        'own_transfer',
        'card_recharge',
        'credit_info',
        'card2card_transfer',
    )
]

In [None]:
# The split below is positional (shuffle=False): the first 80% of the rows
# become the training part and the rest the validation part, so the rows must
# already be ordered by timestamp.
# `Series.is_monotonic` was removed in pandas 2.0; `is_monotonic_increasing`
# is the equivalent check available in both old and new versions.
assert train_target_featured_filtered["timestamp"].is_monotonic_increasing
X_train, X_val, y_train, y_val = train_test_split(train_target_featured_filtered[features_list],
                                                  train_target_featured_filtered.multi_class_target,
                                                  train_size=0.8, random_state=42, shuffle=False)

In [None]:
# Classifier configuration; the alternative loss is kept commented out.
model = CatBoostClassifier(
    iterations=200, learning_rate=0.1, save_snapshot=True
    # loss_function='CrossEntropy'
)
# Fit on the training part, using the validation part as the evaluation set.
model.fit(X_train, y_train,
          cat_features=cat_features,
          eval_set=(X_val, y_val),
          verbose=False,
          plot=True)
print(f"Model is fitted: {model.is_fitted()}")
print('Model params:')
print(model.get_params())

In [None]:
# Validation pool with the same categorical columns used for fitting.
pool = Pool(X_val, label=y_val, cat_features=cat_features)

# Macro-averaged TotalF1 on the validation pool, plotted in the notebook.
metrics = model.eval_metrics(
    pool,
    ['TotalF1:average=Macro'],
    ntree_start=0, ntree_end=0, eval_period=1,
    plot=True,
)

In [None]:
# Validation pool with the same categorical columns used for fitting.
pool = Pool(X_val, label=y_val, cat_features=cat_features)

# Same metric as before, with the use_weights=False option set explicitly.
metrics = model.eval_metrics(
    pool,
    ['TotalF1:average=Macro;use_weights=False'],
    ntree_start=0, ntree_end=0, eval_period=1,
    plot=True,
)

In [None]:
# Notebook display of the fitted model's feature importances.
model.feature_importances_

# Make Prediction

In [None]:
# Load the rows to predict and preview the first ones.
test_target = pd.read_csv("alfabattle2_prediction_session_timestamp.csv")
test_target.head()

In [None]:
# Reload the rows to predict and flag them, then stack them on top of the
# training rows. Training rows get no "test" value at this point.
test_target = pd.read_csv("alfabattle2_prediction_session_timestamp.csv")
test_target["test"] = True
# ignore_index renumbers the combined rows 0..n-1 before they are reordered,
# so the flagged rows keep labels equal to their position in the CSV.
test_target = pd.concat(
    [test_target, train_target], axis=0, ignore_index=True
).sort_values("timestamp")

In [None]:
# Rows that came from the training frame have a missing "test" flag after the
# concat; comparing with True turns them into False and yields a real boolean
# column. Assigning the result back, instead of an in-place replace on the
# attribute, guarantees the frame itself is updated, and it does not rely on
# the np.NaN alias that NumPy 2.x removed.
test_target["test"] = test_target["test"].eq(True)
# Placeholder label for the rows to predict.
# NOTE(review): presumably "statement" is arbitrary and only gives the feature
# generators a non-missing value - confirm.
test_target.loc[test_target["test"], "multi_class_target"] = "statement"
test_target

In [None]:
# Give the flagged rows consecutive session ids 0..k-1, in current row order.
is_test_row = test_target.test
test_target.loc[is_test_row, "session_id"] = np.arange(
    len(test_target.loc[is_test_row, "session_id"])
)
test_target

In [None]:
# ExtractorClientTargetFreq(column="multi_class_target").prepare_user_freq_matrix(test_target)

In [None]:
# Build the same features on the combined (train + rows to predict) frame.
test_featured = create_feature(test_target, feature_generator)
# test_featured_filtered = filter_feature(test_featured, feature_generator_filter)

In [None]:
# Feature matrix of the rows to predict. The explicit copy makes X_test an
# independent frame, so the fill below is an ordinary assignment rather than
# a write to a slice of test_featured.
X_test = test_featured.loc[test_featured.test, features_list].copy()
# Categorical columns must not contain missing values; use the same "NaN"
# string the lag extractor writes. fillna avoids the np.NaN alias, which no
# longer exists in NumPy 2.x.
X_test[cat_features] = X_test[cat_features].fillna("NaN")
best_model = model

test_pool = Pool(data=X_test, cat_features=cat_features)
contest_predictions = best_model.predict_proba(test_pool)
print('Predictions:')
print(contest_predictions)

In [None]:
prediction = pd.read_csv("alfabattle2_abattle_sample_prediction.csv")
predicted_classes = best_model.predict(test_pool, prediction_type="Class").reshape(-1)
# X_test is ordered by timestamp, while its index labels are the row numbers
# assigned to the rows to predict before the combined frame was sorted.
# Labelling the predictions with X_test.index lets pandas align each one with
# the matching row number, instead of pairing rows by position.
# NOTE(review): assumes the sample-prediction file lists rows in the same order
# as alfabattle2_prediction_session_timestamp.csv - confirm.
prediction["prediction"] = pd.Series(predicted_classes, index=X_test.index)
prediction

In [None]:
# Write the predictions to submission.csv without the index column.
prediction.to_csv("submission.csv", index=False)

In [None]:
# Notebook display of the raw prediction file; the result is not stored.
pd.read_csv("alfabattle2_prediction_session_timestamp.csv")

In [None]:
def preprocess_test(X: pd.DataFrame) -> pd.DataFrame:
    """Flag every row of ``X`` as a test row.

    Sets the column ``"test"`` to ``True`` on ``X`` itself (the frame is
    modified in place) and returns the same object so the call can be chained.
    """
    X["test"] = True
    return X
    

# Lost in time and space

In [None]:
    
#     def prepare_user_freq_matrix(self, X: pd.DataFrame) -> pd.DataFrame:
#         client_freq = pd.pivot_table(X, index=self.index,
#                        columns=self.column,
#                       values="session_id",
#                       aggfunc="count")
#         client_freq.replace(np.NaN, 0, inplace=True)
#         client_freq["total"] = client_freq.sum(axis=1)
#         client_freq = client_freq.div(client_freq["total"], axis=0)
#         client_freq.drop("total", axis=1, inplace=True)
#         client_freq.columns = [f"f_freq_{c}"for c in client_freq.columns]
#         client_freq.columns.name = None
#         client_freq = client_freq.reset_index()        
#         return client_freq