In [26]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
import numpy as np

%matplotlib inline

In [27]:
def data_preprocessed(data = "train"):
    """Load one of the competition CSV files and prepare it for modelling.

    Parameters
    ----------
    data : str, default "train"
        "train" reads ``train_dataset_train.csv``; any other value reads
        ``test_dataset_test.csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with
        * missing values in numeric columns replaced by 0,
        * the combined ``"oper_type + oper_attr"`` column split on ``'_'`` into
          the new string columns ``oper_attr`` and ``oper_type``,
        * rows whose ``oper_type`` / ``oper_attr`` is in the exclusion lists removed,
        * the combined column and ``name_mfi`` dropped; ``id`` is dropped too
          for the training file and kept for the test file.

    Raises
    ------
    ValueError
        If the split of the combined column does not yield exactly two parts
        (raised by pandas when the two column names are assigned).
    """
    is_train = data == "train"
    df = pd.read_csv("train_dataset_train.csv" if is_train else "test_dataset_test.csv")

    # DataFrame.fillna returns a new frame; the result has to be assigned or the
    # call has no effect. Only numeric columns are filled so that text columns
    # do not end up holding a mix of strings and the integer 0.
    numeric_cols = df.select_dtypes(include="number").columns
    df = df.fillna(dict.fromkeys(numeric_cols, 0))

    # "<oper_type>_<oper_attr>" -> two separate string columns.
    parts = df["oper_type + oper_attr"].str.split('_', expand=True)
    parts.columns = ["oper_type", "oper_attr"]
    df["oper_attr"] = parts["oper_attr"]
    df["oper_type"] = parts["oper_type"]

    # Operation codes excluded from modelling (compared as strings).
    excluded_attr = ['51', '1002']
    excluded_type = ['10', '34', '128']
    keep_rows = ~df["oper_type"].isin(excluded_type) & ~df["oper_attr"].isin(excluded_attr)
    df = df[keep_rows]

    drop_cols = ["oper_type + oper_attr", 'name_mfi']
    if is_train:
        # The row identifier is not a feature; the test frame keeps it because
        # the submission file needs it.
        drop_cols.append('id')
    return df.drop(columns=drop_cols)

In [37]:
def my_predict(model, name_file_predict):
    """Predict labels for the test file and write them to a submission CSV.

    Parameters
    ----------
    model : object with a ``predict`` method
        Receives the preprocessed test frame without the ``id`` column.
    name_file_predict : str
        Path of the CSV file to write. The file has the columns ``id`` and
        ``label``, no index column, and ``'\\n'`` line endings.
    """
    test = data_preprocessed("test")
    feat = ['index_oper', 'type', 'priority', 'is_privatecategory', 'class', 'mailtype', 'mailctg', 'mailrank', 'directctg',
            'transport_pay', 'postmark', 'is_in_yandex', "is_return", "oper_type", 'is_wrong_sndr_name',
            'is_wrong_rcpn_name', 'is_wrong_phone_number', 'is_wrong_address', "oper_attr"]
    # Same string cast that is applied to the training frame before fitting.
    test[feat] = test[feat].astype('string')
    prediction = model.predict(test.drop(['id'], axis = 1))
    sub = pd.DataFrame({'id': test["id"], 'label': prediction})
    try:
        # pandas >= 1.5 spells the keyword `lineterminator`; the old
        # `line_terminator` spelling was removed in pandas 2.0.
        sub.to_csv(name_file_predict, lineterminator = '\n', index=False)
    except TypeError:
        # Older pandas only accepts the previous keyword name.
        sub.to_csv(name_file_predict, line_terminator = '\n', index=False)

In [29]:
# Load and preprocess the training CSV ("train" is also the function's default).
train = data_preprocessed("train")

In [31]:
# Columns that are cast to pandas' "string" dtype before the split and the fit.
feat = [
    'index_oper', 'type', 'priority', 'is_privatecategory', 'class',
    'mailtype', 'mailctg', 'mailrank', 'directctg', 'transport_pay',
    'postmark', 'is_in_yandex', "is_return", "oper_type",
    'is_wrong_sndr_name', 'is_wrong_rcpn_name', 'is_wrong_phone_number',
    'is_wrong_address', "oper_attr",
]
train[feat] = train.loc[:, feat].astype('string')

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=['label']),
    train['label'],
    test_size=0.33,
    random_state=42,
)

In [39]:
from imblearn.over_sampling import RandomOverSampler

# Resample the training split only; X_test / y_test are not passed to the sampler.
OverS = RandomOverSampler(random_state=42)
X_Over, Y_Over = OverS.fit_resample(X=X_train, y=y_train)

In [34]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,index_oper,type,priority,is_privatecategory,class,is_in_yandex,is_return,weight,mailtype,mailctg,...,total_qty_oper_login_1,total_qty_oper_login_0,total_qty_over_index_and_type,total_qty_over_index,is_wrong_sndr_name,is_wrong_rcpn_name,is_wrong_phone_number,is_wrong_address,oper_attr,oper_type
4521366,102971.0,Цех,7506.0,N,0.0,N,N,26.0,5.0,0.0,...,60613352.0,10648.0,60624000.0,75592387.0,0,0,1,0,-1,1
5748280,102971.0,Цех,7506.0,N,0.0,N,N,130.0,5.0,0.0,...,67541214.0,7870.0,67549084.0,72981134.0,0,1,1,0,-1,1022
2464773,102976.0,ММПО,7503.0,N,0.0,N,N,459.0,5.0,1.0,...,64270133.0,116432632.0,180702765.0,188407812.0,0,0,0,0,6,8
1530661,102976.0,ММПО,7503.0,N,0.0,N,N,9.0,5.0,0.0,...,64270133.0,116432632.0,180702765.0,188407812.0,0,0,1,0,-1,1014
5566679,630302.0,Цех,7503.0,N,0.0,N,N,181.0,5.0,1.0,...,25083674.0,33874.0,25117548.0,82035345.0,0,1,0,0,-1,1018


In [None]:
from catboost import CatBoostClassifier

# Columns handed to CatBoost as categorical features.
cat_features = [
    'index_oper', 'type', 'priority', 'is_privatecategory', 'class',
    'mailtype', 'mailctg', 'mailrank', 'directctg', 'transport_pay',
    'postmark', 'is_in_yandex', "is_return", "oper_type",
    'is_wrong_sndr_name', 'is_wrong_rcpn_name', 'is_wrong_phone_number',
    'is_wrong_address', "oper_attr",
]

model = CatBoostClassifier(
    iterations=150,
    learning_rate=0.2,
    eval_metric='Recall',
    early_stopping_rounds=20,
    random_seed=42,
    task_type="GPU",
    # metric_period = 20  (left disabled)
)

# Fit on the oversampled training data; the held-out split is passed as the eval set.
model.fit(
    X_Over,
    Y_Over,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
    verbose=True,
)

In [None]:
# Write the predictions for the test file to "model.csv".
my_predict(model, name_file_predict="model.csv")

In [43]:
# Predictions for the held-out split; they feed the metric report in the next cell.
y_pred = model.predict(X_test)

In [44]:
# Ingredients of the weighted score are computed first, then everything is printed.
score_recall = recall_score(y_test, y_pred, average="macro")
# NOTE(review): roc_auc_score is given the hard class predictions (y_pred), not
# probabilities. For a binary target that value is the mean of the per-class
# recalls, i.e. it coincides with the macro recall above - confirm this is the
# intended definition of the score.
score_auc = roc_auc_score(y_test, y_pred, multi_class='ovo')

print("F1: ", f1_score(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Recall: ", score_recall)
print("ROC_AUC: ", score_auc)
# Weighted score: 10% macro recall + 90% ROC AUC.
print("Score: ", 0.1 * score_recall + 0.9 * score_auc)

F1:  0.5596950976484656
Accuracy:  0.9553646239215272
Recall:  0.9743119550881396
ROC_AUC:  0.9743119550881396
Score:  0.9743119550881396


F1:  0.23920931091816
ROC_AUC:  0.8839866552620088
Recall:  0.9429362073987766
Accuracy:  0.82841
Score = 0.845385