Соревнование https://inclass.kaggle.com/c/telecom-clients-churn-prediction

In [1]:
import warnings
import os

import pandas as pd
import numpy as np

from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import FeatureUnion, make_pipeline
from xgboost import XGBClassifier

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas / sklearn / xgboost.
warnings.simplefilter("ignore")

# Directory (relative to the notebook) that holds the competition CSV files.
PATH_TO_DATA = '../data'

In [2]:
# Read both competition files and discard the "ID" column right away,
# so that only the variables (plus whatever else the file carries) remain.
train_path = os.path.join(PATH_TO_DATA, 'orange_small_churn_train_data.csv')
test_path = os.path.join(PATH_TO_DATA, 'orange_small_churn_test_data.csv')

train_df = pd.read_csv(train_path).drop("ID", axis=1)
test_df = pd.read_csv(test_path).drop("ID", axis=1)

### Удаление пустых столбцов

In [3]:
# Columns of the training frame that contain no observed value at all.
empty_cols = train_df.columns[train_df.isnull().all(axis=0)].tolist()
print(len(empty_cols))

# Remove those columns from both frames, reporting the shape before and after.
for frame in (train_df, test_df):
    print(frame.shape)
    frame.drop(empty_cols, axis=1, inplace=True)
    print(frame.shape)

18
(40000, 231)
(40000, 213)
(10000, 230)
(10000, 212)


In [4]:
# Positional split of the training columns:
#   everything before the last 39 columns -> numeric features
#   the 38 columns that follow            -> categorical features
#   the final column is left out of both groups
#   (presumably the `labels` target -- TODO confirm against the CSV header)
N_TRAILING_COLS = 39

num_cols = train_df.columns[:-N_TRAILING_COLS]
cat_cols = train_df.columns[-N_TRAILING_COLS:-1]

In [5]:
# Numeric gaps are filled with the TRAIN column means in both frames, so that
# train and test go through exactly the same transformation. Using the test
# set's own means would shift the features between fit and predict, and would
# leave NaN behind whenever a test column is entirely missing.
num_fill_values = train_df[num_cols].mean(axis=0)

# Placeholder category for missing categorical values. The raw string yields
# the same three characters as before (N, backslash, A) without relying on the
# invalid "\A" escape sequence.
CAT_FILL_VALUE = r'N\A'

train_df[num_cols] = train_df[num_cols].fillna(num_fill_values, axis=0)
train_df[cat_cols] = train_df[cat_cols].fillna(CAT_FILL_VALUE)

test_df[num_cols] = test_df[num_cols].fillna(num_fill_values, axis=0)
test_df[cat_cols] = test_df[cat_cols].fillna(CAT_FILL_VALUE)

### Подготовка категориальных признаков к OHE

In [6]:
%%time
all_cats = pd.concat([train_df[cat_cols], test_df[cat_cols]], axis=0)
le_dict = {c:LabelEncoder().fit(all_cats[c].values) for c in cat_cols}

train_df[cat_cols] = train_df[cat_cols].apply(lambda x: le_dict[x.name].transform(x))
test_df[cat_cols] = test_df[cat_cols].apply(lambda x: le_dict[x.name].transform(x))

Wall time: 5.3 s


In [7]:
# Boolean masks, one entry per column of train_df, marking which positions
# belong to the numeric and to the categorical feature groups.
num_ids = train_df.columns.isin(num_cols)
cat_ids = train_df.columns.isin(cat_cols)

### Разбиение на обучающую и тестовую выборки

In [8]:
# Feature columns only: the target must not be part of the design matrix.
# Keeping `labels` inside X_train made it one column wider than X_test, so a
# mask built for the train layout could not index the test matrix (NumPy
# raises IndexError on a boolean mask whose length differs from the axis).
feature_cols = train_df.columns.drop('labels')

X_train = train_df[feature_cols].values
y_train = train_df['labels'].values

# Select the test columns by name so they come in exactly the same order as
# the training features.
X_test = test_df[feature_cols].values

# Column masks used by the pipeline's selectors, computed against feature_cols
# so their length always equals the number of columns in X_train / X_test.
num_ids = feature_cols.isin(num_cols)
cat_ids = feature_cols.isin(cat_cols)

assert X_train.shape[1] == X_test.shape[1] == len(num_ids) == len(cat_ids)

### Построение и обучение модели

In [9]:
# Gradient-boosted tree classifier; hyper-parameters are kept exactly as tuned.
# NOTE(review): `silent`, `nthread`, `seed` and `missing=None` follow an older
# xgboost API -- verify against the installed version before upgrading.
# NOTE(review): scale_pos_weight < 1 down-weights the positive class; confirm
# this is intended.
model = XGBClassifier(max_depth=7, learning_rate=0.05, n_estimators=250, silent=True, objective='binary:logistic', 
                        nthread=-1, gamma=0, min_child_weight=2,subsample=0.6, colsample_bytree = 0.6, reg_alpha=0, reg_lambda=1,
                        scale_pos_weight=0.04, base_score=0.5, seed=0, missing=None)


def _select_numeric(X):
    """Return the numeric feature columns of X (positions flagged in `num_ids`)."""
    return X[:, num_ids]


def _select_categorical(X):
    """Return the label-encoded categorical columns of X (positions flagged in `cat_ids`)."""
    return X[:, cat_ids]


# Numeric branch: scale to unit variance without centring.
# `with_mean` is a boolean flag; pass False rather than the integer 0, which
# recent scikit-learn releases reject during parameter validation.
numeric_pipe = make_pipeline(FunctionTransformer(_select_numeric), StandardScaler(with_mean=False))

# Categorical branch: one-hot encode the integer codes; categories not seen
# during fit are encoded as all zeros instead of raising.
cat_pipe = make_pipeline(FunctionTransformer(_select_categorical), OneHotEncoder(handle_unknown = 'ignore'))

# Concatenate both branches side by side and feed the result to the classifier.
pipe = make_pipeline(FeatureUnion([('numbers', numeric_pipe), ('categories', cat_pipe)]), model)

In [10]:
%%time
# Fit the preprocessing branches and the classifier on the training data.
# The return value is bound to `_` so the cell does not echo the pipeline repr.
_ = pipe.fit(X_train, y_train)

Wall time: 1min 43s


### Формирование результата

In [11]:
# Second column of predict_proba: probability of the second class known to the
# model (presumably "churn" -- TODO confirm against the label encoding).
churn_proba = pipe.predict_proba(X_test)[:, 1]

# One row per test sample; the row index is written out as the "Id" column.
result_df = pd.DataFrame({'result': churn_proba})
result_df.to_csv('result.csv', index_label = 'Id')