In [8]:
import pandas as pd
import numpy as np
import preprocess as pp

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
import os
import time

In [9]:
# Debug check: show which file the `pp` (preprocess) module was imported from.
# NOTE(review): the recorded output is an absolute local path; clear it before sharing.
print(pp.__file__)

/Users/HongX/Desktop/Documents/Self study/Data Science/Kaggle/Fraud Detection/Fraud-Detection/preprocess.py


In [2]:
def feature_creation(categorical, numerical, method, df):
    """Build one group-based feature, aligned row-for-row with ``df``.

    Parameters
    ----------
    categorical : str
        Name of the column in ``df`` whose values define the groups.
    numerical : str
        Name of the column in ``df`` that is aggregated within each group.
        It is not read when ``method == 'counts'``.
    method : str
        One of:
        * ``'counts'``   - number of rows sharing the row's category value
        * ``'means'``    - group mean of ``numerical``
        * ``'stds'``     - group sample standard deviation of ``numerical``
        * ``'devs'``     - ``numerical`` minus its group mean
        * ``'std_devs'`` - ``devs`` divided by the group standard deviation
    df : pandas.DataFrame
        Frame holding both columns.

    Returns
    -------
    pandas.Series
        The requested feature, indexed like ``df``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this
        silently returned ``None``).
    """
    if method == 'counts':
        # Counts only need the categorical column, so no group statistics are computed.
        counts_temp = df[categorical].value_counts().to_dict()
        return df[categorical].map(counts_temp)

    supported = ('means', 'stds', 'devs', 'std_devs')
    if method not in supported:
        raise ValueError(
            "Unknown method %r; expected one of %r" % (method, ('counts',) + supported)
        )

    grouped = df.groupby(categorical)[numerical]

    # Only compute the statistics the requested method actually needs.
    if method == 'stds':
        return df[categorical].map(grouped.std().to_dict())

    means = df[categorical].map(grouped.mean().to_dict())
    if method == 'means':
        return means

    devs = df[numerical] - means
    if method == 'devs':
        return devs

    # method == 'std_devs'
    # Groups with a zero or undefined standard deviation yield inf/NaN here.
    stds = df[categorical].map(grouped.std().to_dict())
    return devs / stds
    
def feature_aggregation_creation(combination_list, df):
    """Create one feature column per (categorical, numerical, method) combination.

    Parameters
    ----------
    combination_list : sequence of sequences of str
        Each item holds, in order, the categorical column name, the numerical
        column name and the method name passed on to ``feature_creation``.
    df : pandas.DataFrame
        Frame the features are computed from.

    Returns
    -------
    pandas.DataFrame
        One column per combination, named ``"<categorical>.<numerical>.<method>"``.
        The frame shares ``df``'s index, so it can be concatenated column-wise
        with ``df`` without misalignment (the previous version always used a
        fresh 0..n-1 index, which produced NaNs for any other index).
    """
    # Start from an empty frame on df's own index instead of a throw-away 'temp' column.
    out_df = pd.DataFrame(index=df.index)

    for combination in combination_list:
        # Progress output: one line per combination being built.
        print(combination)

        categorical, numerical, method = combination[:3]
        name = categorical + '.' + numerical + '.' + method
        out_df[name] = feature_creation(
            categorical=categorical,
            numerical=numerical,
            method=method,
            df=df)

    return out_df

In [3]:
# Read the raw training and test transaction tables, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/train_transaction.csv', 'Data/test_transaction.csv')
)

In [4]:
# Pull the target out of the training frame, then remove the target and the
# row identifier so that only predictor columns remain in `train`.
y = train['isFraud']
train.drop(['TransactionID','isFraud'],axis=1,inplace=True)

# Remember each frame's own (rows, columns); train_shape[0] is used later to
# split the combined frame back into its training and test rows.
train_shape = train.shape
test_shape = test.shape  # was train.shape, which mis-recorded the test dimensions

In [5]:
# Stack training rows on top of test rows so both receive identical preprocessing.
# sort=False keeps the existing column order when the two frames' columns differ,
# which is the behaviour the pandas FutureWarning asks callers to opt into.
# NOTE(review): 'TransactionID' is dropped from `train` only, so `test` presumably
# still carries it and it ends up as a column of X - confirm this is intended.
X = pd.concat([train, test], ignore_index=True, sort=False)
del train, test

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [6]:
# Split X by dtype: object columns get the literal string 'NaN' for missing
# values, everything else is handed to the project's pp.fill_nans helper
# (its fill strategy is defined in preprocess.py, not shown here).
strings = X.select_dtypes(include='object').fillna('NaN')
numerics = X.select_dtypes(exclude='object')
del X

# Re-assemble with the non-object columns first, then the object columns.
X = pd.concat([pp.fill_nans(numerics), strings], axis=1)
del numerics, strings

Ok, pretty standard stuff so far. Now what we're going to do differently is create our new features and add them to our existing variables.

In [7]:
# Load the feature rankings and keep only features whose weight is more than
# three standard deviations above zero.
new_ranks = (
    pd.read_csv('Data/new_rankings.csv')
    # weight expressed in units of its own standard deviation
    .assign(standardised_weight=lambda ranks: ranks['weight'] / ranks['std'])
    # a zero std produces +/-inf; treat those as missing and discard them
    .replace([-np.inf, np.inf], np.nan)
    .dropna()
    .loc[lambda ranks: ranks['standardised_weight'] > 3]
)
new_ranks

Unnamed: 0.1,Unnamed: 0,feature,weight,std,standardised_weight
1,3,C4,0.002896,4.3e-05,67.077305
2,0,C14,0.005882,8.8e-05,67.049029
3,2,M6.C1.std_devs,0.004056,0.000155,26.124825
4,4,M4.V198.devs,0.0014,5.4e-05,25.732512
5,1,C1,0.00464,0.000192,24.174537
6,7,V58,0.000662,3.8e-05,17.592452
7,20,ProductCD.V63.means,0.000112,7e-06,14.96663
8,9,R_emaildomain.V108.means,0.0005,3.3e-05,14.940358
9,5,ProductCD,0.000838,6.2e-05,13.495081
10,8,R_emaildomain.V129.std_devs,0.000632,5.2e-05,12.171865


In [8]:
# Engineered features are named "<categorical>.<numerical>.<method>", so a full stop identifies them
feature_names = new_ranks['feature'].tolist()
new_feature_names = [name for name in feature_names if '.' in name]

# Ranking rows that belong to those engineered features
features_to_deploy = new_ranks[new_ranks['feature'].isin(new_feature_names)]

# Break each name into its [categorical, numerical, method] parts for the feature creation function
combinations = [name.split('.') for name in new_feature_names]
print(combinations)

[['M6', 'C1', 'std_devs'], ['M4', 'V198', 'devs'], ['ProductCD', 'V63', 'means'], ['R_emaildomain', 'V108', 'means'], ['R_emaildomain', 'V129', 'std_devs'], ['R_emaildomain', 'V129', 'devs'], ['R_emaildomain', 'V58', 'means'], ['R_emaildomain', 'V276', 'stds'], ['M4', 'V188', 'means'], ['M4', 'V198', 'std_devs'], ['M5', 'C4', 'std_devs'], ['M6', 'V278', 'std_devs']]


In [9]:
# Build every engineered feature listed in `combinations` from the current X
new_features = feature_aggregation_creation(combination_list=combinations, df=X)

# Append the engineered columns to the right of the existing ones
X = pd.concat((X, new_features), axis=1)


['M6', 'C1', 'std_devs']
['M4', 'V198', 'devs']
['ProductCD', 'V63', 'means']
['R_emaildomain', 'V108', 'means']
['R_emaildomain', 'V129', 'std_devs']
['R_emaildomain', 'V129', 'devs']
['R_emaildomain', 'V58', 'means']
['R_emaildomain', 'V276', 'stds']
['M4', 'V188', 'means']
['M4', 'V198', 'std_devs']
['M5', 'C4', 'std_devs']
['M6', 'V278', 'std_devs']


I'm also going to weed out numeric features whose weights are zero or negative. These weights were determined by permutation importance in another notebook.

In [10]:
# Load the permutation-importance rankings of the numeric features
numerics_rankings = pd.read_csv('Data/numerics_rankings.csv')

# Features whose weight is zero or negative are treated as weak
weak_numerics = numerics_rankings.loc[numerics_rankings['weight'].le(0)]
weak_numerics_names = weak_numerics['feature'].tolist()

# Remove the weak columns from X in place
X.drop(columns=weak_numerics_names, inplace=True)


In [11]:
# Encode the string columns as integer labels. This could not be done earlier
# because the aggregation features needed the original categorical values.
numerics = X.select_dtypes(exclude='object')
strings = X.select_dtypes(include='object')
del X

# Each object column is label-encoded independently (the encoder is refit per column).
dummies = strings.apply(LabelEncoder().fit_transform)
encoded = dummies
del strings

# Non-object columns first, encoded columns after.
X = pd.concat([numerics, dummies], axis=1)
del numerics

In [12]:
# The first train_shape[0] rows of X are the training rows; the remainder are the test rows.
X_train, X_test = X.iloc[:train_shape[0]], X.iloc[train_shape[0]:]
del X

# Hold out the final 20% of the training rows, in their existing order, for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y, shuffle=False, test_size=0.2
)

In [13]:
# Setting KMP_DUPLICATE_LIB_OK stops xgboost from killing the kernel;
# see https://stackoverflow.com/questions/51164771/python-xgboost-kernel-died
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# The validation split is the only evaluation set; it drives early stopping below.
eval_set = [(X_val, y_val)]

# Gradient-boosted classifier for the binary fraud target.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    max_depth=10,
    n_estimators=300,
    learning_rate=0.2,
)

# Track validation AUC and stop once it has not improved for 30 rounds.
model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric="auc",
    early_stopping_rounds=30,
)

[0]	validation_0-auc:0.826488
Will train until validation_0-auc hasn't improved in 30 rounds.
[1]	validation_0-auc:0.838582
[2]	validation_0-auc:0.840039
[3]	validation_0-auc:0.843782
[4]	validation_0-auc:0.84546
[5]	validation_0-auc:0.849732
[6]	validation_0-auc:0.850215
[7]	validation_0-auc:0.851663
[8]	validation_0-auc:0.85374
[9]	validation_0-auc:0.855649
[10]	validation_0-auc:0.860912
[11]	validation_0-auc:0.864926
[12]	validation_0-auc:0.864868
[13]	validation_0-auc:0.866153
[14]	validation_0-auc:0.870309
[15]	validation_0-auc:0.871149
[16]	validation_0-auc:0.872979
[17]	validation_0-auc:0.874732
[18]	validation_0-auc:0.876853
[19]	validation_0-auc:0.879509
[20]	validation_0-auc:0.879983
[21]	validation_0-auc:0.883976
[22]	validation_0-auc:0.887279
[23]	validation_0-auc:0.889778
[24]	validation_0-auc:0.892345
[25]	validation_0-auc:0.894108
[26]	validation_0-auc:0.89503
[27]	validation_0-auc:0.895977
[28]	validation_0-auc:0.897334
[29]	validation_0-auc:0.897848
[30]	validation_0-a

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.2,
       max_delta_step=0, max_depth=10, min_child_weight=1, missing=None,
       n_estimators=300, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [14]:
# Predicted class probabilities for the test rows from the fitted model.
preds = model.predict_proba(X_test)

In [15]:
# Fill the sample submission's isFraud column with the second probability
# column of `preds` and write the result out for submission.
submission = pd.read_csv('Data/sample_submission.csv').assign(isFraud=preds[:, 1])
submission.to_csv('Data/submission.csv', index=False)

Ok! This seems to do better than my previous Random Forests attempt: this time we got a public AUC of 0.916935 vs. 0.894957 with Random Forests. It is difficult to tell what this improvement was due to, as I changed a lot of things at once, i.e. switching to xgboost, creating new features, and filtering out bad features using permutation importance. This is obviously not in line with the scientific method, but I did it because training on the full data set takes a lot of computational time. If I'm serious about getting good results, I should learn to avoid this in future. It might be worth going back and cross-checking to see which changes actually had an impact.

In [19]:
# Stitch the training and validation parts back together (features and target
# separately), renumbering rows from zero so the two line up.
X_train = pd.concat((X_train, X_val), ignore_index=True)
y = pd.concat((y_train, y_val), ignore_index=True)

# Target first, then the feature columns.
train = pd.concat((y, X_train), axis=1)

del X_train, X_val, y

# Persist the preprocessed frames without their index columns.
train.to_csv('Data/train_preprocessed_1.csv', index=False)
X_test.to_csv('Data/test_preprocessed_1.csv', index=False)