In [1]:
import os
import gc
import time
import shutil
import feather
import numpy as np
import pandas as pd
from scipy.stats import norm, rankdata
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [2]:
# Read the training and test tables from the input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

Separating the 'target' and 'ID_code' columns from the data and saving them for later use.

In [3]:
# Detach the identifier columns and the label from the frames and keep them
# in their own variables; pop() removes each column from its frame.
test_ids = test.pop('ID_code')
train_ids = train.pop('ID_code')
target = train.pop('target')

In [4]:
# Number of training rows; used later to split the merged frame back into
# its train and test parts.
len_train = train.shape[0]

In [5]:
# Stack the train rows followed by the test rows so that the feature
# engineering is applied to both at once.
# ignore_index=True gives the combined frame a fresh 0..n-1 row index instead
# of carrying over each source frame's own row labels, so no two rows share a
# label and later label-based operations cannot confuse a train row with a
# test row. Positional slicing (iloc) is unaffected.
merged = pd.concat([train, test], ignore_index=True)

In [6]:
# Snapshot the column labels now, before any engineered columns are added,
# so the original features can be told apart from the derived ones later.
original_features = merged.columns.copy()

In [7]:
# Build the engineered feature set from the original columns.
#
# Each original column is standardised (mean 0, standard deviation 1, computed
# over the train and test rows together) so that it can be passed to
# norm.cdf() as though it were a standard normal variable. Five derived
# columns are then produced per feature:
#   <col>^2, <col>^3, <col>^4 : powers of the standardised value
#   <col>_cp                  : rank of the value within the column (not normalised)
#   <col>_cnp                 : standard normal CDF of the standardised value
#
# All columns are collected first and the frame is assembled in a single step:
# inserting five new columns per feature one at a time fragments the DataFrame
# and is slow. Iterating over `original_features` (rather than the live
# `merged.columns`) also means that re-running this cell never derives
# features from previously derived columns.
base_columns = list(original_features)
derived_names = []
column_data = {}

for col in base_columns:
    raw = merged[col]
    # Standardise with pandas' mean/std, then work on the float32 array.
    z = ((raw - raw.mean()) / raw.std()).astype('float32').values
    column_data[col] = z

    # Powers, built up by repeated multiplication.
    z2 = z * z
    z3 = z2 * z
    column_data[col + '^2'] = z2
    column_data[col + '^3'] = z3
    column_data[col + '^4'] = z3 * z

    # Cumulative percentile (not normalized)
    column_data[col + '_cp'] = rankdata(z).astype('float32')

    # Cumulative normal percentile
    column_data[col + '_cnp'] = norm.cdf(z).astype('float32')

    derived_names.extend(
        col + suffix for suffix in ('^2', '^3', '^4', '_cp', '_cnp')
    )

# Column order: all original features first, then the derived columns grouped
# per feature. `columns=` pins that order explicitly.
merged = pd.DataFrame(
    column_data,
    index=merged.index,
    columns=base_columns + derived_names,
)
del column_data  # release the intermediate arrays' container

In [8]:
# Names of the engineered columns: everything now in `merged` that was not
# among the original features.
new_features = set(merged.columns).difference(original_features)

In [9]:
# Standardise every engineered column as well (subtract its mean, divide by
# its standard deviation) and store the result as float32.
for feature in new_features:
    column = merged[feature]
    centred = column - column.mean()
    merged[feature] = (centred / column.std()).astype('float32')

In [10]:
# Logistic regression with the added features: cross-validation settings,
# design matrices and prediction buffers.
NFOLDS = 15
RANDOM_STATE = 871972

# The first len_train rows of the merged frame are the training rows,
# the remaining rows are the test rows.
train = merged.iloc[:len_train]
test = merged.iloc[len_train:]

# Select the test columns by the training column labels so both matrices
# share the same column order.
feature_list = train.columns
test = test.loc[:, feature_list]

# Plain float64 matrices for scikit-learn.
X = train.values.astype(np.float64)
X_test = test.values.astype(np.float64)

folds = StratifiedKFold(
    n_splits=NFOLDS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Column vectors: out-of-fold predictions for the training rows and the
# running sum of per-fold predictions for the test rows.
oof_preds = np.zeros((train.shape[0], 1))
test_preds = np.zeros((test.shape[0], 1))
roc_cv = []

Modeling.

In [11]:
# Cross-validated logistic regression.
#
# For every fold a fresh model is fitted on the training part; its predicted
# probabilities for the held-out part are written into `oof_preds`, its
# predictions for the test matrix are added to `test_preds` (a running sum
# that is averaged afterwards), and the fold's AUC is appended to `roc_cv`.
y = target

# Start from clean accumulators so that re-running this cell does not add to
# results left over from a previous run.
oof_preds = np.zeros((len(X), 1))
test_preds = np.zeros((len(X_test), 1))
roc_cv = []

# The stratified split is driven by y; the real feature matrix is passed as X
# instead of a stand-in so the call reads as intended.
for fold_, (trn_, val_) in enumerate(folds.split(X, y)):
    print("Current Fold: {}".format(fold_))
    # trn_ / val_ are positional row indices, so the target is indexed by
    # position (iloc) rather than by label.
    trn_x, trn_y = X[trn_, :], y.iloc[trn_]
    val_x, val_y = X[val_, :], y.iloc[val_]

    clf = Pipeline([
        ('lr_clf', LogisticRegression(solver='lbfgs', max_iter=4000, C=10))
    ])

    clf.fit(trn_x, trn_y)

    # Probability of the positive class (second column of predict_proba).
    val_pred = clf.predict_proba(val_x)[:, 1]
    test_fold_pred = clf.predict_proba(X_test)[:, 1]

    # Score the fold once and reuse the value for both the log and the list.
    fold_auc = roc_auc_score(val_y, val_pred)
    roc_cv.append(fold_auc)

    print("AUC = {}".format(fold_auc))
    oof_preds[val_, :] = val_pred.reshape((-1, 1))
    test_preds += test_fold_pred.reshape((-1, 1))

Current Fold: 0
AUC = 0.8933077235134981
Current Fold: 1
AUC = 0.9078332076486003
Current Fold: 2
AUC = 0.8922540872426263
Current Fold: 3
AUC = 0.888915850960306
Current Fold: 4
AUC = 0.9071914688687627
Current Fold: 5
AUC = 0.8974788389219484
Current Fold: 6
AUC = 0.8976365048195739
Current Fold: 7
AUC = 0.9022757056043887
Current Fold: 8
AUC = 0.9008628167426024
Current Fold: 9
AUC = 0.8990194528898076
Current Fold: 10
AUC = 0.8967333556514933
Current Fold: 11
AUC = 0.8940150411122907
Current Fold: 12
AUC = 0.893734155869531
Current Fold: 13
AUC = 0.9006149778558279
Current Fold: 14
AUC = 0.8917885694711012


Predicting.

In [12]:
# Turn the sum of the per-fold test predictions into their mean.
# Note: each execution of this line divides the current values again.
test_preds = test_preds / NFOLDS

Evaluating the cross-validation AUC score (we compute both the average AUC for all folds and the AUC for combined folds).  

In [13]:
# Summarise the cross-validation results, each rounded to 5 decimals.
fold_aucs = np.array(roc_cv)

# Mean and spread of the per-fold AUCs.
roc_score = round(sum(roc_cv) / len(roc_cv), 5)
st_dev = round(fold_aucs.std(), 5)

# AUC computed once over all out-of-fold predictions together.
roc_score_1 = round(roc_auc_score(y, oof_preds.ravel()), 5)

for template, value in (
    ("Average of the folds' AUCs = {}", roc_score),
    ("Combined folds' AUC = {}", roc_score_1),
    ("The standard deviation = {}", st_dev),
):
    print(template.format(value))

Average of the folds' AUCs = 0.89758
Combined folds' AUC = 0.89757
The standard deviation = 0.00532


Creating the submission file.

In [14]:
# Write the averaged test predictions to submission.csv, using the sample
# submission file as the template for the output layout.
print("Saving submission file")
sample = pd.read_csv('../input/sample_submission.csv')

# Bracket assignment always targets a DataFrame column, whereas attribute
# assignment (sample.target = ...) only does so when the column already exists.
# Plain 1-D arrays are assigned by position: test_preds is an (n, 1) column
# vector and is flattened first, and the IDs are stripped of their index so no
# label alignment takes place. pandas is expected to reject an array whose
# length differs from the template's row count rather than pad it.
sample['ID_code'] = np.asarray(test_ids)
sample['target'] = np.asarray(test_preds, dtype=float).ravel()
sample.to_csv('submission.csv', index=False)

Saving submission file


The LB score is now 0.985 versus 0.984 for linear regression. The improvement of 0.001 is obviously very small. It looks like linear and logistic regression work equally well for this data! Moving forward, I think it would be interesting to see how the feature engineering presented here would affect other classification models (e.g. Gaussian Naive Bayes, LDA, LightGBM, XGBoost, CatBoost).