## Предсказание пола клиента по транзакционным данным - [kaggle](https://www.kaggle.com/c/python-and-analyze-data-final-project/overview)

**Описание задачи**

Одним из самых ценных источников информации о клиенте являются данные о банковских транзакциях. В этом соревновании участникам предлагается ответить на вопрос: возможно ли предсказать пол клиента, используя сведения о поступлениях и тратах по банковской карте? И если возможно, то какая точность такого предсказания?

**Формальная постановка задачи**

Необходимо предсказать вероятность пола "1" для каждого "customer_id".

**Полезная информация**

Для понимания представленных данных будет полезна следующая [статья](https://www.banki.ru/wikibank/mcc-kod/)

**Метрика качества**

В качестве метрики качества предсказания была выбрана ROC AUC (area under the receiver operating characteristic curve).

**Описание датасета**

* customer_id - id пользователя
* tr_datetime - время транзакции
* mcc_code - код вида торговой точки
* tr_type - тип транзакции
* amount - сумма транзакции
* term_id - id терминала оплаты

* **gender - целевая переменная, пол**

In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sklearn.inspection import permutation_importance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict
from gensim.models import word2vec
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
import dill

In [2]:
def roc_auc(y_proba, y_true):
    '''
    Plot an approximate ROC curve and report the area under it.

    The curve is sampled at 20 evenly spaced thresholds on [0, 1] and
    integrated with the trapezoidal rule, so the reported value is a coarse
    approximation of the exact ROC AUC.

    NOTE(review): the figure is drawn through `plt`, which has to be bound to
    matplotlib.pyplot in the enclosing namespace - confirm it is imported.

    arguments:
    y_proba: array-like - predicted probabilities
    y_true: array-like - true classes, same length as y_proba

    returns:
    str - message 'ROC AUC score = <area>' with the area shown to 4 decimals
    '''

    # Accept lists as well as arrays: a plain list cannot be compared
    # element-wise with a float threshold.
    y_proba = np.asarray(y_proba)
    y_true = np.asarray(y_true)

    TPR = []
    FPR = []

    for threshold in np.linspace(0, 1, 20):
        y_pred = np.where(y_proba >= threshold, 1, 0)
        hit = y_true == y_pred      # prediction agrees with the label
        flagged = y_pred == 1       # predicted as class 1

        TP = np.sum(hit & flagged)
        FP = np.sum(~hit & flagged)
        TN = np.sum(hit & ~flagged)
        FN = np.sum(~hit & ~flagged)

        # A rate is NaN when its denominator is zero (e.g. y_true holds a
        # single class); the NaN then propagates into the reported area.
        TPR.append(TP / (TP + FN))
        FPR.append(FP / (FP + TN))

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; prefer the new name
    # when it exists and fall back to the old one otherwise.
    integrate = getattr(np, 'trapezoid', None) or np.trapz
    # FPR decreases as the threshold grows, so the signed integral is
    # negative; the absolute value gives the area.
    area_under_curve = np.abs(integrate(TPR, FPR))

    # Diagonal reference line y = x.
    x = np.linspace(0, 1, 50)
    y = x

    plt.figure(figsize=(10, 6))
    plt.plot(FPR, TPR, linewidth=1.5, color='#0B032D')
    plt.plot(x, y, linestyle='--', linewidth=1.5, color='#e36414')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC AUC curve', size=20)

    return f'ROC AUC score = {area_under_curve:.4f}'

In [3]:
def save(df, PATH):
    """Write ``df`` to ``PATH`` as a semicolon-separated UTF-8 CSV without the index."""
    csv_options = {'index': False, 'encoding': 'utf-8', 'sep': ';'}
    df.to_csv(PATH, **csv_options)

In [4]:
# Base directories for the data files and the serialized model.
DATA_ROOT = Path('./data2/')
MODELS_PATH = Path('./models2/')

# Input files.
TRANSACTIONS_DATASET_PATH = DATA_ROOT.joinpath('transactions.csv')
GENDER_TRAIN_DATASET_PATH = DATA_ROOT.joinpath('gender_train.csv')
Y_TEST = DATA_ROOT.joinpath('gender_test_kaggle_sample_submission.csv')

# Output files.
PREPARED_TRAIN_DATASET_PATH = DATA_ROOT.joinpath('train_prepared.csv')
PREPARED_TEST_DATASET_PATH = DATA_ROOT.joinpath('test_prepared.csv')

VALID_DATASET_PATH = DATA_ROOT.joinpath('df_valid.csv')

MODEL_FILE_PATH = MODELS_PATH.joinpath('model.pkl')

In [5]:
# Load the transaction log, the labelled customers and the sample submission.
transactions = pd.read_csv(TRANSACTIONS_DATASET_PATH)
gender = pd.read_csv(GENDER_TRAIN_DATASET_PATH)
test_id = pd.read_csv(Y_TEST)

# Left join: every transaction is kept; customers absent from `gender`
# end up with a missing label.
df = transactions.merge(gender, on='customer_id', how='left')

# Hold out 10% of the labelled customers; they take no part in training.
customer_train, customer_valid = train_test_split(
    gender['customer_id'],
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

is_holdout = df['customer_id'].isin(customer_valid)
df_valid = df[is_holdout]

save(df_valid, VALID_DATASET_PATH)

In [6]:
def _sequence_per_customer(frame, column):
    """Collect the values of ``column`` into one list per ``customer_id``."""
    grouped = frame.groupby('customer_id')[column]
    return grouped.apply(list).reset_index()


# Split the transaction rows into training customers and submission customers.
in_train = df['customer_id'].isin(customer_train)
in_test = df['customer_id'].isin(test_id.customer_id.values)
df_train = df.loc[in_train]
df_test = df.loc[in_test]

# One row per customer with the list of MCC codes ...
users_train = _sequence_per_customer(df_train, 'mcc_code')
users_test = _sequence_per_customer(df_test, 'mcc_code')

# ... and with the list of transaction types.
type_train = _sequence_per_customer(df_train, 'tr_type')
type_test = _sequence_per_customer(df_test, 'tr_type')

df_train = users_train.merge(type_train, on='customer_id', how='left')
df_test = users_test.merge(type_test, on='customer_id', how='left')

# Bring the label back: the per-customer aggregation above dropped it.
df_train = df_train.merge(gender, on='customer_id', how='left')
df_train.sample(5)

Unnamed: 0,customer_id,mcc_code,tr_type,gender
5824,77436906,"[6011, 6011, 6011, 6011, 6011, 4814, 6011, 592...","[7010, 7010, 7010, 7010, 7010, 1030, 7010, 111...",1
3544,47746315,"[6011, 6011, 6011, 6011, 6011, 4814, 6011, 601...","[2010, 4010, 2010, 4010, 2010, 1030, 2010, 701...",1
3773,50544213,"[6011, 5942, 5621, 6011, 6010, 6011, 5691, 601...","[7010, 1010, 1110, 2010, 7030, 2210, 1210, 421...",0
291,4208081,"[6011, 5945, 4829, 6010, 5732, 5411, 4814, 601...","[2010, 1010, 2370, 7070, 1110, 1110, 1030, 201...",1
2979,40224781,"[6011, 4814, 4814, 4814, 6011, 6011, 6011, 601...","[7010, 1030, 1030, 1030, 2010, 2010, 2010, 201...",0


In [7]:
# Separate the target column from the feature columns.
y_train = df_train['gender']
X_train = df_train.drop('gender', axis=1)

In [8]:
# Class counts of the target over all labelled customers.
target_values = gender['gender'].value_counts()
target_values

0    4687
1    3713
Name: gender, dtype: int64

In [9]:
# Number of labelled customers and of distinct customers in the transaction log.
n_labelled = gender.gender.count()
n_customers = len(transactions.customer_id.unique())

# Ratio of the class-0 count to the class-1 count.
disbalance = target_values[0] / target_values[1]

print(f'Количество наблюдений в обучающей выборке: {n_labelled}',
      f'Количество наблюдений в тестовой выборке: {n_customers - n_labelled}',
      f'Дисбаланс целевой переменной: {disbalance:.2f}', sep='\n')

Количество наблюдений в обучающей выборке: 8400
Количество наблюдений в тестовой выборке: 6600
Дисбаланс целевой переменной: 1.26


In [26]:
# A small column-picking transformer is needed to assemble the pipeline.

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that passes a single column of its input downstream.

    Parameters:
    column - key used to index the input in ``transform`` (``X[column]``).
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.column]``."""
        selected = X[self.column]
        return selected
    
# Transformer that embeds each token sequence as an IDF-weighted mean of
# word2vec vectors.

class NewTfidfVectorizer(BaseEstimator, TransformerMixin):
    """Turn sequences of tokens into fixed-size dense vectors.

    ``fit`` trains a word2vec model and a TfidfVectorizer on the same
    sequences; ``transform`` scales each known token's vector by that token's
    IDF and averages the scaled vectors over the sequence.

    Parameters:
    vec_size - dimensionality passed to Word2Vec as ``vector_size``.
    """

    def __init__(self, vec_size):
        self.vec_size = vec_size
        # State filled in by fit().
        self.word2vec = None      # token -> embedding vector
        self.word2weight = None   # token -> IDF weight
        self.tfidf = None         # fitted TfidfVectorizer
        self.dim = None           # length of one embedding vector

    def fit(self, X, y=None):
        """Learn token embeddings and IDF weights from the sequences in ``X``."""
        w2v_model = word2vec.Word2Vec(X, vector_size=self.vec_size, window=3, workers=4)
        keyed_vectors = w2v_model.wv
        # Mapping from each token to its embedding vector.
        self.word2vec = {
            token: vector
            for token, vector in zip(keyed_vectors.index_to_key, keyed_vectors.vectors)
        }
        first_vector = next(iter(self.word2vec.values()))
        self.dim = len(first_vector)

        # The identity analyzer makes the vectorizer treat each input
        # sequence as already tokenized.
        self.tfidf = TfidfVectorizer(analyzer=lambda x: x)
        self.tfidf.fit(X)
        idf = self.tfidf.idf_
        max_idf = max(idf)
        known_weights = {token: idf[index]
                         for token, index in self.tfidf.vocabulary_.items()}
        # Tokens missing from the TF-IDF vocabulary default to the largest IDF.
        self.word2weight = defaultdict(lambda: max_idf, known_weights)

        return self

    def transform(self, X, y=None):
        """Return one row per sequence in ``X``.

        Tokens without a word2vec vector are skipped; a sequence with no
        known tokens becomes a zero vector of length ``self.dim``.
        """
        embeddings = self.word2vec
        weights = self.word2weight

        rows = []
        for words in X:
            scaled = [embeddings[w] * weights[w] for w in words if w in embeddings]
            if not scaled:
                scaled = [np.zeros(self.dim)]
            rows.append(np.mean(scaled, axis=0))
        return np.array(rows)

In [27]:
# One branch per transaction attribute: select the column, then embed it.
mcc_steps = [
    ('mcc_selector', FeatureSelector(column='mcc_code')),
    ('mcc_tfidf', NewTfidfVectorizer(vec_size=80)),
]
type_steps = [
    ('types_selector', FeatureSelector(column='tr_type')),
    ('types_tfidf', NewTfidfVectorizer(vec_size=20)),
]
mcc_code = Pipeline(steps=mcc_steps)
tr_type = Pipeline(steps=type_steps)

# Concatenate the outputs of both branches.
feats = FeatureUnion(transformer_list=[
    ('mcc_codes', mcc_code),
    ('tr_types', tr_type),
])

# Class 1 is weighted by the class-0 / class-1 count ratio.
classifier = CatBoostClassifier(
    n_estimators=350,
    depth=4,
    class_weights=[1, disbalance],
    custom_metric='AUC',
    random_state=42,
    verbose=False,
)

catboost = Pipeline(steps=[
    ('features', feats),
    ('classifier', classifier),
])

In [28]:
# Run 5-fold cross-validation of the whole pipeline, scored by ROC AUC.
cv_scores = cross_val_score(catboost, X_train, y_train, cv=5, scoring='roc_auc')
cv_score = cv_scores.mean()
print(f'CV score is {cv_score}')

# Refit the pipeline on the full training set, then take the class-1
# probability for every test customer.
catboost.fit(X_train, y_train)
test_proba = catboost.predict_proba(df_test)
y_score = test_proba[:, 1]

CV score is 0.8242363407657562


In [29]:
# Display the pipeline's steps: the feature union and the classifier.
catboost.steps

[('features',
  FeatureUnion(transformer_list=[('mcc_codes',
                                  Pipeline(steps=[('mcc_selector',
                                                   FeatureSelector(column='mcc_code')),
                                                  ('mcc_tfidf',
                                                   NewTfidfVectorizer(vec_size=80))])),
                                 ('tr_types',
                                  Pipeline(steps=[('types_selector',
                                                   FeatureSelector(column='tr_type')),
                                                  ('types_tfidf',
                                                   NewTfidfVectorizer(vec_size=20))]))])),
 ('classifier', <catboost.core.CatBoostClassifier at 0x12c1e7d00>)]

In [30]:
# Display the sample-submission frame as loaded from Y_TEST.
test_id

Unnamed: 0,customer_id,probability
0,75562265,0.152664
1,84816985,0.302357
2,54129921,0.062036
3,23160845,0.459860
4,45646491,0.835253
...,...,...
3595,89647691,0.242247
3596,53818229,0.145849
3597,88924303,0.471155
3598,28118312,0.526089


In [32]:
# Attach the predicted class-1 probabilities to the test customers.
# `y_score` was produced from `df_test`, so the rows line up positionally.
df_test['gender'] = y_score
predicted = df_test[['customer_id', 'gender']].rename(columns={'gender': 'probability'})

# Swap the placeholder `probability` column of the sample submission for the
# predictions. The left join keeps the submission's row order; a customer with
# no row in `df_test` gets a missing probability.
test_id = test_id.drop(columns='probability').merge(predicted, on='customer_id', how='left')

# Write next to the input data via DATA_ROOT: the previously hard-coded
# 'Data2/' differs in case from './data2/' and fails on case-sensitive
# filesystems.
test_id.to_csv(DATA_ROOT / 'Sokolova_predictions.csv', index=False, encoding='utf-8', sep=',')
test_id

Unnamed: 0,customer_id,probability
0,75562265,0.290222
1,84816985,0.788754
2,54129921,0.320163
3,23160845,0.342247
4,45646491,0.438755
...,...,...
3595,89647691,0.216668
3596,53818229,0.722846
3597,88924303,0.362166
3598,28118312,0.348233


### Сохранение модели

In [34]:
# Create the models directory first: open() raises FileNotFoundError when the
# parent directory is missing.
MODEL_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): dill is presumably used instead of pickle because the pipeline
# holds lambdas (see NewTfidfVectorizer.fit) - confirm. Only load the resulting
# file from trusted sources: unpickling executes arbitrary code.
with open(MODEL_FILE_PATH, "wb") as f:
    dill.dump(catboost, f)