In [None]:
import regex as re
import gc
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the pickled train and test datasets back from disk.
# NOTE: only unpickle files that come from a trusted source.
TRAIN_PICKLE_PATH = 'advanced_fe_train.pkl'
TEST_PICKLE_PATH = 'advanced_fe_test.pkl'

train_data = pd.read_pickle(TRAIN_PICKLE_PATH)
test_data = pd.read_pickle(TEST_PICKLE_PATH)

In [None]:
# Show the (rows, columns) shape of both datasets between two banner lines.
banner_line = "*"*35
train_shape_line = "\n Train Data Shape : {} \n".format(train_data.shape)
test_shape_line = "\n Test Data Shape : {} \n".format(test_data.shape)

print(banner_line, train_shape_line, test_shape_line, banner_line, sep="\n")

***********************************

 Train Data Shape : (590538, 313) 


 Test Data Shape : (506691, 312) 

***********************************


## Utility Functions
<br>

In [None]:
def cat_num_features(df):
    
    '''
        Utility Function to get the names of Categorical Features and 
        Numerical Features of the given Dataset.

        Parameters
        ----------
        df : pandas.DataFrame
            Dataset whose columns are to be classified.

        Returns
        -------
        (catf, numf) : tuple of two lists
            catf - the known categorical feature names that are present in
                   `df`, in the order of the candidate list below.
            numf - every other column of `df` (in column order), excluding
                   the target column 'isFraud'.
    '''
    
    # Given Categorical Features 
    candidates = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 
                  'card6', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 
                  'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'DeviceType', 'DeviceInfo',
                  'TransactionWD', 'card1_div_1000', 'card2_div_10', 'P_parent_domain', 
                  'P_domain_name', 'P_top_level_domain', 'R_parent_domain', 'R_domain_name', 
                  'R_top_level_domain', 'device_name', 'device_version', 'os_name', 'os_version', 
                  'screen_width', 'screen_height', 'card_intr1', 'card_intr2', 'card1_addr1', 
                  'card1_addr2', 'card2_addr1', 'card2_addr2', 'card3_addr1', 'card3_addr2', 
                  'card5_addr1', 'card5_addr2', 'card6_addr1', 'card6_addr2', 'ProductCD_addr1', 
                  'ProductCD_addr2', 'card1_ProductCD', 'card2_ProductCD', 'card5_ProductCD', 
                  'card6_ProductCD', 'addr1_P_emaildomain',
                  # 'card1_P_emaildoman' looks like a misspelling; it is kept so frames that
                  # carry the misspelt column keep working, and the correct spelling is
                  # accepted as well. Names absent from `df` are filtered out below.
                  'card1_P_emaildoman', 'card1_P_emaildomain',
                  'card1_addr1_P_emaildomain',
                  ]
    # id_12 ... id_38 are treated as categorical as well
    candidates += ['id_' + str(i) for i in range(12, 39)]

    # Updating the Categorical Feature Names List based on the columns present in the dataframe
    # (sets give constant-time membership checks instead of scanning a list per column)
    present = set(df.columns)
    catf = [feature for feature in candidates if feature in present]

    categorical = set(catf)
    numf = [feature for feature in df.columns
            if feature not in categorical and feature != 'isFraud']
    
    return (catf, numf)  

In [None]:
def covariate_shift(train_df, test_df, feature, catf, tree_method='gpu_hist'):

    '''
        Utility Function to measure the covariate shift of a single feature.

        Rows taken from `train_df` are labelled isTest=0 and rows taken from
        `test_df` are labelled isTest=1. An XGBoost classifier is trained to
        predict isTest from `feature` alone on 67% of the stacked rows
        (stratified split) and is scored on the remaining 33%.

        Parameters
        ----------
        train_df, test_df : pandas.DataFrame
            Both must contain the column `feature`.
        feature : str
            Name of the column to test.
        catf : list of str
            Names of the categorical features. A feature listed here (or any
            feature with a non-numeric dtype) is label encoded before training.
        tree_method : str, default 'gpu_hist'
            Forwarded to XGBClassifier. NOTE(review): 'gpu_hist' needs a GPU
            enabled XGBoost build; pass e.g. 'hist' to train on CPU.

        Returns
        -------
        float
            ROC AUC of the classifier on the held-out rows. Values close to
            0.5 mean the feature does not separate train from test; values
            close to 1.0 mean it separates them almost perfectly.
    '''

    train = pd.DataFrame(data={feature: train_df[feature], 'isTest': 0})
    test = pd.DataFrame(data={feature: test_df[feature], 'isTest': 1})

    df = pd.concat([train, test], ignore_index=True)
    del train, test

    # The original check compared the dtype name (e.g. 'object') against the list of
    # feature names, so it never matched. Test the feature name itself, and also
    # encode any non-numeric column since the classifier needs numeric input.
    if feature in catf or not pd.api.types.is_numeric_dtype(df[feature]):
        df[feature] = LabelEncoder().fit_transform(df[feature].astype(str))
    
    X_train, X_test, y_train, y_test = train_test_split(df[feature], df['isTest'], test_size=0.33,
                                                        random_state=3, stratify=df['isTest'])
    clf = XGBClassifier(
      objective='binary:logistic',
      eval_metric='auc',
      n_estimators=500,
      tree_method=tree_method,
      random_state=3,
    )

    # Single-feature model: reshape the 1-D column into an (n_rows, 1) matrix
    clf.fit(X_train.values.reshape(-1,1), y_train.values, verbose=1)
    
    # Score with the predicted probability of the positive (isTest=1) class
    roc_auc =  roc_auc_score(y_test.values, clf.predict_proba(X_test.values.reshape(-1,1))[:, 1])

    return roc_auc

In [None]:
# https://www.kaggle.com/c/ieee-fraud-detection/discussion/111696

def adversarial_validation(train_df, test_df, threshold, catf):

    '''
        Utility Function to run the covariate shift test on every feature.

        Each column shared by `train_df` and `test_df` (the target 'isFraud'
        excluded) is passed to covariate_shift(), and the resulting ROC AUC
        is recorded. Features scoring above `threshold` are printed as they
        are found.

        Parameters
        ----------
        train_df, test_df : pandas.DataFrame
        threshold : float
            A feature with AUC strictly greater than this is discarded.
        catf : list of str
            Names of the categorical features (forwarded to covariate_shift).

        Returns
        -------
        (cov, discarded_features)
            cov - pandas.Series of AUC values indexed by feature name,
                  sorted in ascending order.
            discarded_features - list of feature names whose AUC is above
                  `threshold`, in the same ascending order.
    '''

    # Only columns present in both frames can be compared. Filtering (instead of
    # list.remove) also avoids a ValueError when 'isFraud' is not in train_df.
    cols = [col for col in train_df.columns
            if col != 'isFraud' and col in test_df.columns]

    list_auc_value = []
    for f in cols:

        # covariate_shift expects (train, test); the frames used to be passed swapped,
        # which inverted its isTest labelling.
        auc = covariate_shift(train_df, test_df, f, catf)
        list_auc_value.append(auc)
        if auc > threshold:
            print('feature:', f, 'covariate shift:', auc)

    # Explicit dtype keeps an empty result a float Series
    cov = pd.Series(list_auc_value, index = cols, dtype = float).sort_values() 
    discarded_features = list(cov[cov > threshold].index)
        
    return cov, discarded_features

## Testing each Feature for Covariate Shift
<br>

We will find all the features that make the train data very different from the test data. We will store these columns separately and will not use them in the final modeling.
<br><br>

In [None]:
# Split the columns of the train data into categorical (catf) and numerical (numf)
# feature-name lists; catf is passed to the adversarial validation call further down.

catf, numf = cat_num_features(train_data)

In [None]:
# Considering every feature whose train-vs-test classifier reaches a held-out
# ROC AUC greater than 0.7 to be covariate shifted.
# cov            -> per-feature AUC values, sorted in ascending order
# list_discarded -> names of the features whose AUC is above the threshold
cov, list_discarded = adversarial_validation(train_data, test_data, 0.7, catf)

feature: TransactionID covariate shift: 0.9999914634147976
feature: TransactionDT covariate shift: 0.9999914634147976
feature: D1 covariate shift: 0.9143981259080995
feature: D2 covariate shift: 0.7230931320205236
feature: D3 covariate shift: 0.8812546211710214
feature: D4 covariate shift: 0.8202221166684895
feature: D5 covariate shift: 0.8368906385231879
feature: D10 covariate shift: 0.8707121247425403
feature: D11 covariate shift: 0.730581062379851
feature: D15 covariate shift: 0.850172822904419
feature: TransactionDay covariate shift: 0.9999898440887305
feature: DT_M covariate shift: 1.0


In [None]:
# Features having Covariate Shift (AUC above the threshold), listed in ascending order of AUC

list_discarded

['D2',
 'D11',
 'D4',
 'D5',
 'D15',
 'D10',
 'D3',
 'D1',
 'TransactionDay',
 'TransactionDT',
 'TransactionID',
 'DT_M']

In [None]:
# Save the list of discarded feature names to disk as a pickle
with open("features_having_covariate_shift.pkl", 'wb') as pkl_file:
    pickle.dump(list_discarded, pkl_file)