In [2]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
import matplotlib.pyplot as plt
from skopt import gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Integer

%matplotlib inline

In [3]:
# Load the four training tables. Paths are relative to the notebook's working
# directory; index_col=False keeps pandas from taking the index from a CSV column.
performance  = pd.read_csv('train/performance_train.csv', index_col= False)
facturation  = pd.read_csv('train/facturation_train.csv', index_col= False)
paiements    = pd.read_csv('train/paiements_train.csv', index_col= False)
transactions = pd.read_csv('train/transactions_train.csv', index_col= False)
# Load the corresponding test-set tables from the test/ directory.
performance_test  = pd.read_csv('test/performance_test.csv', index_col= False)
facturation_test  = pd.read_csv('test/facturation_test.csv', index_col= False)
paiements_test    = pd.read_csv('test/paiements_test.csv', index_col= False)
transactions_test = pd.read_csv('test/transactions_test.csv', index_col= False)



In [4]:
def _category_shares(subframe, column, keys, prefix):
    """Map ``prefix + str(key)`` to the share of ``subframe[column]`` equal to ``key``.

    Shares are fractions of the non-null values of the column (as computed by
    ``value_counts(normalize=True)``). Keys that never occur in ``subframe``
    map to 0.
    """
    # One value_counts per column; a plain dict lookup replaces the former
    # bare ``except:`` that silently turned any error into a 0.
    shares = subframe[column].value_counts(normalize=True).to_dict()
    return {prefix + str(key): shares.get(key, 0) for key in keys}


def summarize_by_ID(dataframe):
    """Summarise categorical transaction columns per account.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``ID_CPTE``, ``DECISION_XCD``, ``TRANSACTION_CATEGORY_XCD``,
        ``TRANSACTION_TYPE_XCD``, ``SICGROUP`` and
        ``cred_minus_transaction_net_positive``.

    Returns
    -------
    dict
        ``{ID_CPTE value: [decision, category, type, sicgroup, net_positive]}``
        where each list item is a dict mapping ``<prefix><value>`` to the share
        of that account's rows holding ``<value>`` in the corresponding column.
        Every value observed anywhere in ``dataframe`` gets a key for every
        account (0 when the account never has it).
    """
    # (source column, key prefix) in the order the result lists are built.
    # The "SCIGROUP_" spelling is kept as-is so existing column names do not change.
    specs = (
        ("DECISION_XCD", "DECISION_XCD_"),
        ("TRANSACTION_CATEGORY_XCD", "TRANSACTION_C_"),
        ("TRANSACTION_TYPE_XCD", "TRANSACTION_T_"),
        ("SICGROUP", "SCIGROUP_"),
        ("cred_minus_transaction_net_positive", "cred_minus_transaction_net_positive"),
    )
    keys_by_column = {
        column: dataframe[column].value_counts().keys() for column, _ in specs
    }

    # Group once instead of re-filtering the whole frame for every account.
    grouped = dataframe.groupby("ID_CPTE")

    output = {}
    # Iterate in value_counts order (most frequent account first), as before.
    for account_id in dataframe["ID_CPTE"].value_counts().keys():
        subframe = grouped.get_group(account_id)
        output[account_id] = [
            _category_shares(subframe, column, keys_by_column[column], prefix)
            for column, prefix in specs
        ]
    return output
def summarize_by_ID_2(dataframe):
    """Summarise payments per account.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``ID_CPTE``, ``TRANSACTION_AMT`` and
        ``PAYMENT_REVERSAL_XFLG``.

    Returns
    -------
    dict
        ``{ID_CPTE value: [amount_summary, flag_shares]}`` where
        ``amount_summary`` is ``{"TRANSACTION_AMT_sum": <sum for the account>}``
        and ``flag_shares`` maps ``"PAYMENT_REVERSAL_XFLG_key_<flag>"`` to the
        share of the account's non-null flags equal to ``<flag>`` (0 when the
        account never has that flag).
    """
    flag_keys = dataframe["PAYMENT_REVERSAL_XFLG"].value_counts().keys()

    # Group once instead of re-filtering the whole frame for every account.
    grouped = dataframe.groupby("ID_CPTE")

    output = {}
    # Iterate in value_counts order (most frequent account first), as before.
    for account_id in dataframe["ID_CPTE"].value_counts().keys():
        subframe = grouped.get_group(account_id)

        amount_summary = {"TRANSACTION_AMT_sum": subframe["TRANSACTION_AMT"].sum()}

        # Single value_counts + dict lookup; replaces the bare ``except:`` that
        # silently mapped any failure to 0.
        shares = subframe["PAYMENT_REVERSAL_XFLG"].value_counts(normalize=True).to_dict()
        flag_shares = {
            "PAYMENT_REVERSAL_XFLG_key_" + str(flag): shares.get(flag, 0)
            for flag in flag_keys
        }

        output[account_id] = [amount_summary, flag_shares]
    return output

In [5]:
# ID_CPTE values of the rows in `performance` whose "Default" column equals 1.
ID_of_defaults = performance.loc[performance["Default"].eq(1), "ID_CPTE"]
# Original note: "Uniform across all years so drop".
# NOTE(review): it is not clear from this cell which column that note refers to — confirm.

In [6]:
def _flatten_summaries(summaries):
    """Turn ``{account_id: [dict, dict, ...]}`` into a DataFrame indexed by account ID.

    The dicts of each account are merged into a single row; the merge starts
    from an empty dict for every account so no value can leak from one account
    into the next.
    """
    rows = {}
    for account_id, feature_dicts in summaries.items():
        merged = {}
        for feature_dict in feature_dicts:
            merged.update(feature_dict)
        rows[account_id] = pd.Series(merged)
    return pd.DataFrame.from_dict(rows, orient='index')


def feature_engineering(performance,paiements,transactions,test):
    """Build the per-account feature table used for modelling.

    Parameters
    ----------
    performance : pandas.DataFrame
        Needs ``ID_CPTE`` and ``PERIODID_MY`` (and ``Default`` when ``test`` is
        falsy). The caller's frame is not modified.
    paiements : pandas.DataFrame
        Payments table; ``TRANSACTION_DTTM`` is dropped and the rest is passed
        to ``summarize_by_ID_2``.
    transactions : pandas.DataFrame
        Transactions table; merchant/timestamp columns are dropped, the credit
        limit and amount are replaced by a boolean flag, and the result is
        passed to ``summarize_by_ID``.
    test : bool
        When truthy the returned frame has no ``Default`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by account ID with columns
        ``cred_minus_transaction_net_positiveTrue``, ``Default`` (training only)
        and ``PAYMENT_REVERSAL_XFLG_key_Q``.

    Raises
    ------
    KeyError
        If one of the selected output columns is missing from the combined
        table (for example when no payment carries the ``Q`` flag).
    """
    # assign() returns a new frame: the original code overwrote the caller's
    # PERIODID_MY column, so running the cell twice re-parsed year integers.
    performance = performance.assign(
        PERIODID_MY=pd.to_datetime(performance["PERIODID_MY"]).dt.year
    )

    # Drop the merchant and timestamp columns, which are not used as features.
    # Keyword `columns=` is required: the positional axis argument was removed
    # in pandas 2.0.
    transaction_dropped = transactions.drop(
        columns=["MERCHANT_CITY_NAME", "MERCHANT_CATEGORY_XCD",
                 "MERCHANT_COUNTRY_XCD", "TRANSACTION_DTTM"]
    )

    # Replace credit limit and transaction amount by a single boolean:
    # is (prior credit limit - transaction amount) >= 0 ?
    cred_minus_transaction = transaction_dropped["PRIOR_CREDIT_LIMIT_AMT"].sub(
        transaction_dropped["TRANSACTION_AMT"]
    )
    transaction_dropped = transaction_dropped.drop(
        columns=["PRIOR_CREDIT_LIMIT_AMT", "TRANSACTION_AMT"]
    )
    transaction_dropped["cred_minus_transaction_net_positive"] = cred_minus_transaction.ge(0)

    # Per-account summary of the transactions table.
    final = _flatten_summaries(summarize_by_ID(transaction_dropped))

    # Per-account summary of the payments table.
    paiements_drop = paiements.drop(columns=["TRANSACTION_DTTM"])
    final2 = _flatten_summaries(summarize_by_ID_2(paiements_drop))

    # Performance table indexed by account ID, with an unnamed index so it
    # lines up with the two summary frames. (`del index.name` fails where
    # `name` is a property without a deleter.)
    temp = performance.set_index("ID_CPTE")
    temp.index.name = None

    # Union of the three tables; payments take precedence, then transactions,
    # then performance.
    combined = final2.combine_first(final.combine_first(temp))

    if not test:
        selected = ["cred_minus_transaction_net_positiveTrue", "Default", "PAYMENT_REVERSAL_XFLG_key_Q"]
    else:
        selected = ["cred_minus_transaction_net_positiveTrue", "PAYMENT_REVERSAL_XFLG_key_Q"]
    combined_drop_features = combined[selected]

    return combined_drop_features

In [7]:
def imputing(dataset_train_x, imputee):
    """Fill missing values in ``imputee`` using an imputer fitted on ``dataset_train_x``.

    A fresh ``Imputer`` with ``strategy='mean'`` and ``axis=0`` is fitted on
    ``dataset_train_x`` on every call and then applied to ``imputee``; the
    transformed data is returned.

    NOTE(review): ``sklearn.preprocessing.Imputer`` was removed in scikit-learn
    0.22 in favour of ``sklearn.impute.SimpleImputer`` — confirm the pinned
    scikit-learn version before upgrading.
    """
    mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    return mean_imputer.fit(dataset_train_x).transform(imputee)

In [8]:
def gradient_boosting_classifier(train,test):
    """Tune a ``GradientBoostingClassifier`` with ``gp_minimize`` and return it unfitted.

    Parameters
    ----------
    train : array-like of shape (n_samples, n_features)
        Feature matrix used for cross-validation.
    test : array-like of shape (n_samples,)
        Target labels for ``train``. Despite its name this is passed as ``y``
        to ``cross_val_score``.

    Returns
    -------
    GradientBoostingClassifier
        A new, unfitted classifier (``n_estimators=50``, ``random_state=0``)
        configured with the best ``max_depth``, ``learning_rate``,
        ``max_features``, ``min_samples_split`` and ``min_samples_leaf`` found.
    """
    # The upper bound of max_features must come from the data passed in; the
    # original read the notebook-global `train_x`, which only worked when that
    # variable happened to exist with the same number of columns.
    n_features = np.shape(train)[1]

    space  = [Integer(2, 200, name='max_depth'),
              Real(10**-5, 10**0, "log-uniform", name='learning_rate'),
              Integer(1, n_features, name='max_features'),
              Integer(2, 100, name='min_samples_split'),
              Integer(1, 100, name='min_samples_leaf')]

    reg = GradientBoostingClassifier(n_estimators=50, random_state=0)

    @use_named_args(space)
    def objective(**params):
        reg.set_params(**params)
        # cross_val_score returns negated MAE; negate again so gp_minimize
        # minimises the mean absolute error over the 5 folds.
        return -np.mean(cross_val_score(reg, train, test, cv=5, n_jobs=-1,
                                        scoring="neg_mean_absolute_error"))

    res_gp = gp_minimize(objective, space, n_calls=50, random_state=0)

    # res_gp.x lists the best values in the same order as `space`.
    max_depth, learning_rate, max_features, min_samples_split, min_samples_leaf = res_gp.x
    return GradientBoostingClassifier(n_estimators=50, random_state=0,
                                      max_depth=max_depth,
                                      learning_rate=learning_rate,
                                      max_features=max_features,
                                      min_samples_split=min_samples_split,
                                      min_samples_leaf=min_samples_leaf)
    

In [9]:
def prediction(classifier, X):
    """Return the result of ``classifier.predict(X)``."""
    predicted = classifier.predict(X)
    return predicted
    

In [10]:
def submission_creator(ID, default):
    """Place ``ID`` and ``default`` side by side in one DataFrame.

    Each argument is wrapped in its own ``pandas.DataFrame`` and the two are
    concatenated column-wise, so rows are matched on their index labels.
    """
    id_frame = pd.DataFrame(ID)
    default_frame = pd.DataFrame(default)
    return pd.concat([id_frame, default_frame], axis="columns")
    

In [None]:
def csv_write(dataframe, path="submission.csv"):
    """Write a two-column submission frame to CSV.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with exactly two columns; they are written under the headers
        ``ID_CPTE`` and ``Default`` regardless of their current names.
    path : str or path-like, optional
        Destination file. Defaults to ``"submission.csv"`` in the working
        directory, which was previously hard-coded.
    """
    dataframe.to_csv(path, index=False, header=['ID_CPTE', 'Default'])

In [None]:

# Per-account feature table for the training data (includes the "Default" target).
dataset_train = feature_engineering(performance, paiements, transactions, False)

# Whole training set split into features (x) and target (y).
# `columns=` is used because the positional axis argument of DataFrame.drop
# was removed in pandas 2.0.
dataset_train_x = dataset_train.drop(columns=["Default"])
dataset_train_y = dataset_train["Default"]

##### Training dataset created #####
# 80/20 train/validation split; seeded so that Restart & Run All reproduces
# the same split (0 matches the random_state used for the model search).
train, valid = train_test_split(dataset_train, test_size=0.2, random_state=0)
train_y = train["Default"]
train_x = train.drop(columns=["Default"])
valid_y = valid["Default"]
valid_x = valid.drop(columns=["Default"])

# Imputation: every split is imputed with an imputer fitted on the full
# training feature table (dataset_train_x).
train_x_imp = imputing(dataset_train_x, train_x)
valid_x_imp = imputing(dataset_train_x, valid_x)
dataset_train_x_imp = imputing(dataset_train_x, dataset_train_x)

In [None]:
# Feature table for the test data; test=True selects the columns without "Default".
dataset_test = feature_engineering(
    performance_test,
    paiements_test,
    transactions_test,
    test=True,
)
# Impute the test features with an imputer fitted on the training features.
dataset_test_imputed = imputing(dataset_train_x, imputee=dataset_test)

In [None]:
## Gradient boosting model selection and validation predictions
# Hyper-parameter search runs on the full imputed training set.
bestGBclassifier = gradient_boosting_classifier(dataset_train_x_imp, dataset_train_y)

# Account IDs of the test set (index of the test feature table); used when
# the test-set submission is built.
ID = pd.Series(dataset_test.index)

# Fit the tuned model on the training split and predict the validation split.
bestGBclassifier.fit(train_x_imp, train_y)
GB_prediction = prediction(bestGBclassifier, valid_x_imp)

# Pair validation predictions with the validation account IDs. The original
# paired them with the test-set IDs, which have a different length and refer
# to different accounts, producing a misaligned frame.
submission_GB = submission_creator(pd.Series(valid_x.index), GB_prediction)

In [None]:
# Predict on the imputed test features and pair the predictions with the test IDs.
GB_prediction = prediction(classifier=bestGBclassifier, X=dataset_test_imputed)
submission_GB = submission_creator(ID=ID, default=GB_prediction)


In [None]:
## Write the gradient-boosting submission frame to disk via csv_write
csv_write(dataframe=submission_GB)