In [41]:
import pandas as pd
import numpy as np

from scipy.signal import argrelextrema

from lightgbm import LGBMClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
train = pd.read_csv('data/Train.csv')
test = pd.read_csv('data/Test.csv')
sub = pd.read_csv('data/Samplesubmission.csv')

In [26]:
target = train['CHURN']
train['REGULARITY'] = train['REGULARITY'].astype('float')
train = train.drop(['user_id', 'MRG', 'CHURN', 'TOP_PACK'], axis=1)
test = test.drop(['user_id', 'MRG', 'TOP_PACK'], axis=1)

In [27]:
for col in train.columns:
    if train[col].isna().sum() == 0:
        continue
        
    filler = 'NAN'
    if train[col].dtype != 'object':
        train[col + '_isna'] = train[col].isna().astype('int')
        test[col + '_isna'] = test[col].isna().astype('int')
        filler = train[col].median()

    train[col] = train[col].fillna(filler)
    test[col] = test[col].fillna(filler)

In [28]:
# def get_bins(data):
#     x, y = sns.kdeplot(data[data>0], log_scale=True, color='b').get_lines()[0].get_data()
#     plt.close()
#     ext, = argrelextrema(y, np.less)
#     return [min(data.min(), x.min()) - 1, *x[ext], max(data.max(), x.max()) + 1]

# for col in tqdm(train.columns[train.dtypes=='float']):
#     bins = get_bins(train[col])
#     train[col + '_minimums'] = pd.cut(train[col], bins)
#     test[col + '_minimums'] = pd.cut(test[col], bins)

ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe_train = ohe.fit_transform(train[train.columns[train.dtypes=='object']])
ohe_test = ohe.transform(test[test.columns[test.dtypes=='object']])

train = np.hstack([ohe_train, train[train.columns[train.dtypes!='object']].values])
test = np.hstack([ohe_test, test[test.columns[test.dtypes!='object']].values])

In [47]:
def make_submission(prediction, filename):
    sub['CHURN'] = prediction
    sub.to_csv('data/submissions/{}.csv'.format(filename), index=False)

In [29]:
X_train, X_val, y_train, y_val = train_test_split(train, target)

In [30]:
lr = RidgeClassifier(class_weight='balanced')
lr.fit(X_train, y_train)

roc_auc_score(y_val, lr.decision_function(X_val))

0.9250693117958613

In [49]:
lr = RidgeClassifier(class_weight='balanced')
lr.fit(train, target)

prediction = lr.decision_function(test)

In [53]:
train = np.hstack([train, lr.decision_function(train).reshape(-1, 1)])
test = np.hstack([test, lr.decision_function(test).reshape(-1, 1)])

In [54]:
X_train, X_val, y_train, y_val = train_test_split(train, target)

In [58]:
lgbm = LGBMClassifier()

lgbm.fit(X_train, y_train)
roc_auc_score(y_val, lgbm.predict_proba(X_val)[:, 1])

0.9310806320470547

In [51]:
make_submission(prediction, 'ridge_nans')

In [None]:

# mask = np.random.choice(X_train.index, size=100000, replace=False)
# knn = KNeighborsClassifier()
# knn.fit(X_train.loc[mask], y_train.loc[mask])

# roc_auc_score(y_test, knn.predict_proba(X_test))

In [36]:
# quantiles = [0, *np.arange(0.05, 1, 0.1), 1]
# bins = train.MONTANT.quantile(quantiles)
# train.groupby(pd.cut(train.MONTANT, bins, duplicates='drop', include_lowest=True)).CHURN.mean()

MONTANT
(9.999, 500.0]         0.127763
(500.0, 1000.0]        0.079491
(1000.0, 2000.0]       0.055419
(2000.0, 3000.0]       0.378253
(3000.0, 4600.0]       0.030392
(4600.0, 7900.0]       0.022264
(7900.0, 15350.0]      0.014272
(15350.0, 470000.0]    0.010045
Name: CHURN, dtype: float64

In [68]:
positive = target.sum()
negative = target.shape[0] - positive
need = negative - positive

new_idx = np.hstack([np.random.choice(target[target==1].index, need), target[target==1].index])

X_oversampled = pd.get_dummies(pd.concat([train[target==0], train.loc[new_idx]]))
y_oversampled = pd.concat([target[target==0], target.loc[new_idx]])

In [69]:
idx = np.random.choice(target[target==1].index, int(positive * 0.3), replace=False)

In [70]:
test_idx1 = np.random.choice(target[target==1].index, int(positive * 0.3), replace=False)
test_idx0 = np.random.choice(target[target==0].index, int(negative * 0.3), replace=False)
full_test_idx = np.hstack([test_idx0, test_idx1])
full_train_idx = np.setdiff1d(target.index, full_test_idx)

X_train, X_test, y_train, y_test = X_oversampled.loc[full_train_idx], X_oversampled.loc[full_test_idx],\
                                    y_oversampled.loc[full_train_idx], y_oversampled.loc[full_test_idx]

In [71]:
lr = RidgeClassifier()

lr.fit(X_train, y_train)
roc_auc_score(y_test, lr.decision_function(X_test))

0.9254100729100593

In [42]:
estimators = [
    ('lgbm1', LGBMClassifier(random_state=1)),
    ('lgbm2', LGBMClassifier(random_state=2)),
    ('lgbm3', LGBMClassifier(random_state=3))
 ]

stacking = StackingClassifier(
    estimators=estimators, final_estimator=RidgeClassifier()
)

In [43]:
stacking.fit(X_train, y_train)

StackingClassifier(estimators=[('lgbm1', LGBMClassifier(random_state=1)),
                               ('lgbm2', LGBMClassifier(random_state=2)),
                               ('lgbm3', LGBMClassifier(random_state=3))],
                   final_estimator=RidgeClassifier())

In [46]:
roc_auc_score(y_val, stacking.decision_function(X_val))

0.9308577263466179

In [28]:
mapper = {x[1] : x[0] + 1 for x in enumerate(
    [x[0] for x in sorted(train.TOP_PACK.value_counts().items(), key=lambda x: -x[1])]
)}
quantiles = [0, *np.arange(0.05, 1, 0.1), 1]
bins = train.TOP_PACK.map(mapper).quantile(quantiles)

quantiled = pd.cut(train.TOP_PACK.map(mapper), bins, duplicates='drop')

train.groupby(train.TOP_PACK.map(quantiled)).CHURN.mean()

TOP_PACK
(1.0, 2.0]      NaN
(2.0, 3.0]      NaN
(3.0, 5.0]      NaN
(5.0, 7.0]      NaN
(7.0, 9.0]      NaN
(9.0, 13.0]     NaN
(13.0, 21.0]    NaN
(21.0, 140.0]   NaN
Name: CHURN, dtype: float64