In [1]:
#### Util temp

import gc

import numpy as np
import pandas as pd


def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.

    Parameters
    --------
        df (dataframe):
            the dataframe to calculate the statistics on; it is not modified
        group_var (string):
            the variable by which to group df
        df_name (string):
            the prefix used to rename the columns

    Return
    --------
        agg (dataframe):
            a dataframe with one row per value of `group_var`. `group_var` is
            the first column, followed by one column named
            `<df_name>_<column>_<stat>` for every numeric column and every
            statistic in count, mean, max, min, sum (in that order).

    """
    # Id variables other than the grouping variable are not features
    id_cols = [col for col in df.columns if col != group_var and 'SK_ID' in col]

    # Take an explicit copy before adding the grouping column: assigning into
    # the frame returned by select_dtypes can raise SettingWithCopyWarning.
    numeric_df = df.drop(columns=id_cols).select_dtypes('number').copy()
    numeric_df[group_var] = df[group_var]

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(group_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Name every column from its actual (variable, stat) pair so the labels
    # cannot drift from the data, whatever order the MultiIndex levels are in.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]

    # Expose the grouping variable as the first column
    return agg.reset_index()

def count_categorical(df, group_var, df_name):
    """One-hot encodes the object columns of `df` and summarizes them per `group_var`.

    Parameters
    --------
    df : dataframe
        The dataframe whose object-dtype columns are counted.

    group_var : string
        The variable by which to group the dataframe. The result has one row
        for each unique value of this variable.

    df_name : string
        Prefix added to the front of every output column name.

    Return
    --------
    categorical : dataframe
        Indexed by `group_var`. For every category of every object column there
        are two columns: `<df_name>_<column>_<category>_count` (sum of the
        indicator) and `<df_name>_<column>_<category>_count_norm` (its mean).

    """
    # Indicator columns for every category, tagged with the grouping id
    dummies = pd.get_dummies(df.select_dtypes('object'))
    dummies[group_var] = df[group_var]

    # sum -> how many rows had the category, mean -> share of rows that had it
    grouped = dummies.groupby(group_var).agg(['sum', 'mean'])

    # Flatten the two-level header into single descriptive names
    grouped.columns = [
        '%s_%s_%s' % (df_name, var, stat)
        for var in grouped.columns.levels[0]
        for stat in ('count', 'count_norm')
    ]

    return grouped

def aggregate_client(df, group_vars, df_names):
    """Aggregate a dataframe with data at the loan level
    at the client level

    Args:
        df (dataframe): data at the loan level
        group_vars (list of two strings): grouping variables for the loan
        and then the client (example ['SK_ID_PREV', 'SK_ID_CURR'])
        df_names (list of two strings): names to call the resulting columns
        (example ['cash', 'client'])

    Returns:
        df_by_client (dataframe): aggregated numeric stats at the client level.
        Each client will have a single row with all the numeric data aggregated
    """
    loan_var, client_var = group_vars[0], group_vars[1]
    loan_name, client_name = df_names[0], df_names[1]

    # Aggregate the numeric columns for each loan
    df_by_loan = agg_numeric_new(df, parent_var=loan_var, df_name=loan_name)

    # If there are categorical variables, add their per-loan counts
    if any(df.dtypes == 'category'):
        df_counts = agg_categorical_new(df, parent_var=loan_var, df_name=loan_name)
        df_by_loan = df_counts.merge(df_by_loan, on=loan_var, how='outer')

    # Attach the client id to each loan. The id pairs are de-duplicated first:
    # `df` holds many rows per loan, and merging against all of them would
    # repeat each loan's aggregate once per row, weighting the client-level
    # statistics by the number of child records.
    loan_to_client = df[[loan_var, client_var]].drop_duplicates()
    df_by_loan = df_by_loan.merge(loan_to_client, on=loan_var, how='left')

    # Remove the loan id
    df_by_loan = df_by_loan.drop(columns=[loan_var])

    # Aggregate the per-loan stats for each client
    df_by_client = agg_numeric_new(df_by_loan, parent_var=client_var, df_name=client_name)

    return df_by_client

def convert_types(df, print_info=False):
    """Convert the columns of `df` to smaller dtypes to reduce memory usage.

    The columns are converted in place and the same dataframe is returned.
    The first matching rule is applied to each column:

    * name contains 'SK_ID': missing values become 0, then cast to int32
    * object/string dtype with fewer unique values than rows: category
    * values are exactly {0, 1} (no missing values): bool
    * float64: float32
    * int64: int32, only when every value fits in the int32 range

    Parameters
    --------
        df (dataframe):
            the dataframe to convert (modified in place)
        print_info (bool):
            when True, print the memory usage before and after

    Return
    --------
        df (dataframe):
            the same dataframe object with converted columns
    """
    original_memory = df.memory_usage().sum()
    int32_range = np.iinfo(np.int32)

    # Iterate through each column
    for c in df:
        col = df[c]

        # Convert ids to integers
        if 'SK_ID' in c:
            df[c] = col.fillna(0).astype(np.int32)

        # Convert objects (and dedicated string dtypes) to category
        elif pd.api.types.is_string_dtype(col.dtype) and (col.nunique() < df.shape[0]):
            df[c] = col.astype('category')

        # Floats and ints are downcast below; 0/1 flags become booleans first
        else:
            uniques = col.unique()

            # Order-independent check: a column whose first value is 0 is as
            # much a flag as one whose first value is 1. A missing value makes
            # the set differ, so such columns are left for the rules below.
            if len(uniques) == 2 and set(uniques) == {0, 1}:
                df[c] = col.astype(bool)

            # Float64 to float32 (explicit width: builtin `float` is float64)
            elif col.dtype == np.float64:
                df[c] = col.astype(np.float32)

            # Int64 to int32. Compare against np.int64 explicitly because the
            # builtin `int` maps to a platform-dependent width, and only
            # downcast when no value would overflow.
            elif col.dtype == np.int64:
                if col.empty or (col.min() >= int32_range.min and col.max() <= int32_range.max):
                    df[c] = col.astype(np.int32)

    new_memory = df.memory_usage().sum()

    if print_info:
        print(f'Original Memory Usage: {round(original_memory / 1e9, 2)} gb.')
        print(f'New Memory Usage: {round(new_memory / 1e9, 2)} gb.')

    return df



def agg_numeric_new(df, parent_var, df_name):
    """
    Summarizes the numeric columns of a child dataframe for each value
    of the parent variable.

    Parameters
    --------
        df (dataframe):
            the child dataframe to calculate the statistics on
        parent_var (string):
            the parent variable used for grouping and aggregating
        df_name (string):
            the prefix used to rename the columns

    Return
    --------
        agg (dataframe):
            indexed by `parent_var`, with one row per parent value. Columns are
            named `<df_name>_<column>_<stat>` for the stats count, mean, max,
            min and sum. Columns whose values duplicate another column are
            removed, and the remaining columns come back in the order that
            `np.unique` returns them.

    """
    # Id columns other than the grouping variable are not features
    other_ids = [name for name in df if name != parent_var and 'SK_ID' in name]
    df = df.drop(columns=other_ids)

    # Numeric columns only, with the parent id attached for grouping
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[parent_var] = df[parent_var].copy()

    # Per-parent statistics
    stats = ['count', 'mean', 'max', 'min', 'sum']
    agg = numeric_df.groupby(parent_var).agg(stats)

    # Flatten the (variable, stat) header into single names
    agg.columns = [
        '%s_%s_%s' % (df_name, var, stat)
        for var in agg.columns.levels[0] if var != parent_var
        for stat in agg.columns.levels[1]
    ]

    # Keep one representative of each group of identical columns
    _, first_seen = np.unique(agg, axis=1, return_index=True)
    return agg.iloc[:, first_seen]


def agg_categorical_new(df, parent_var, df_name):
    """
    One-hot encodes the category-dtype columns of a child dataframe and
    summarizes them for each value of the parent variable.

    Parameters
    --------
    df : dataframe
        The child dataframe whose category-dtype columns are aggregated.

    parent_var : string
        The variable by which to group and aggregate the dataframe. The
        result has one row for each unique value of this variable.

    df_name : string
        Prefix added to the front of every output column name.

    Return
    --------
    categorical : dataframe
        Indexed by `parent_var`, with columns named
        `<df_name>_<column>_<category>_<stat>` for the stats sum, count and
        mean. Columns whose values duplicate another column are removed.

    """
    # Indicator columns for every category, tagged with the parent id
    dummies = pd.get_dummies(df.select_dtypes('category'))
    dummies[parent_var] = df[parent_var]

    # Per-parent statistics of each indicator
    stats = ['sum', 'count', 'mean']
    grouped = dummies.groupby(parent_var).agg(stats)

    # Flatten the (indicator, stat) header into single names
    grouped.columns = [
        '%s_%s_%s' % (df_name, var, stat)
        for var in grouped.columns.levels[0]
        for stat in stats
    ]

    # Keep one representative of each group of identical columns
    _, first_seen = np.unique(grouped, axis=1, return_index=True)
    return grouped.iloc[:, first_seen]

## Start

In [5]:
parentFolder = "D:/Dataset/home-credit-default-risk/"
print("Reading bureau and bureau_balance")
bureau = pd.read_csv(parentFolder+'bureau.csv')
bureau_balance = pd.read_csv(parentFolder+'bureau_balance.csv')

# Client-level counts and normalized counts of the bureau categorical columns
bureau_counts = count_categorical(bureau, group_var='SK_ID_CURR', df_name='bureau')

# Client-level numeric statistics of bureau; missing statistics become 0
bureau_agg = agg_numeric(bureau.drop(columns=['SK_ID_BUREAU']), group_var='SK_ID_CURR', df_name='bureau')
bureau_agg = bureau_agg.fillna(value=0)

# Loan-level (SK_ID_BUREAU) categorical counts and numeric statistics of bureau_balance
bureau_balance_counts = count_categorical(bureau_balance, group_var='SK_ID_BUREAU', df_name='bureau_balance')
bureau_balance_agg = agg_numeric(bureau_balance, group_var='SK_ID_BUREAU', df_name='bureau_balance')

# Dataframe grouped by the loan
bureau_by_loan = bureau_balance_agg.merge(bureau_balance_counts, right_index=True, left_on='SK_ID_BUREAU', how='outer')

# Merge to include the SK_ID_CURR
bureau_by_loan = bureau[['SK_ID_BUREAU', 'SK_ID_CURR']].merge(bureau_by_loan, on='SK_ID_BUREAU', how='left')

# Aggregate the stats for each client
bureau_balance_by_client = agg_numeric(bureau_by_loan.drop(columns=['SK_ID_BUREAU']), group_var='SK_ID_CURR',
                                       df_name='client')

# Combine the three client-level tables into one feature table keyed by SK_ID_CURR
df = bureau_counts.merge(bureau_agg, on ='SK_ID_CURR', how='left')
df = df.merge(bureau_balance_by_client, on='SK_ID_CURR', how='left')

# del bureau_agg, bureau_balance_by_client, bureau_counts, bureau, bureau_by_loan, bureau_balance, bureau_balance_agg, bureau_balance_counts
# import gc; gc.collect()

Reading bureau and bureau_balance


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [24]:
# Print the dimensions of the raw bureau table and of the per-loan status counts.
# app = pd.read_csv(parentFolder+'application_train.csv')
print(bureau.shape)
print(bureau_balance_counts.shape)
# print(app.shape)

(1716428, 17)
(817395, 16)


In [31]:
# Preview the first rows of the per-loan (SK_ID_BUREAU) status counts and normalized counts.
bureau_balance_counts.head()

Unnamed: 0_level_0,bureau_balance_STATUS_0_count,bureau_balance_STATUS_0_count_norm,bureau_balance_STATUS_1_count,bureau_balance_STATUS_1_count_norm,bureau_balance_STATUS_2_count,bureau_balance_STATUS_2_count_norm,bureau_balance_STATUS_3_count,bureau_balance_STATUS_3_count_norm,bureau_balance_STATUS_4_count,bureau_balance_STATUS_4_count_norm,bureau_balance_STATUS_5_count,bureau_balance_STATUS_5_count_norm,bureau_balance_STATUS_C_count,bureau_balance_STATUS_C_count_norm,bureau_balance_STATUS_X_count,bureau_balance_STATUS_X_count_norm
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5001709,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,86,0.886598,11,0.113402
5001710,5,0.060241,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,48,0.578313,30,0.361446
5001711,3,0.75,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,0.25
5001712,10,0.526316,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,9,0.473684,0,0.0
5001713,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,22,1.0


In [21]:
# Number of distinct SK_ID_CURR values present in the bureau table.
print(len(bureau['SK_ID_CURR'].unique()))

305811


In [13]:
# Missing-value count for each column of the per-loan status counts.
bureau_balance_counts.isnull().sum()

bureau_balance_STATUS_0_count         0
bureau_balance_STATUS_0_count_norm    0
bureau_balance_STATUS_1_count         0
bureau_balance_STATUS_1_count_norm    0
bureau_balance_STATUS_2_count         0
bureau_balance_STATUS_2_count_norm    0
bureau_balance_STATUS_3_count         0
bureau_balance_STATUS_3_count_norm    0
bureau_balance_STATUS_4_count         0
bureau_balance_STATUS_4_count_norm    0
bureau_balance_STATUS_5_count         0
bureau_balance_STATUS_5_count_norm    0
bureau_balance_STATUS_C_count         0
bureau_balance_STATUS_C_count_norm    0
bureau_balance_STATUS_X_count         0
bureau_balance_STATUS_X_count_norm    0
dtype: int64

In [9]:
# Missing-value count for each column of the merged client-level feature table.
df.isnull().sum()

SK_ID_CURR                                              0
bureau_CREDIT_ACTIVE_Active_count                       0
bureau_CREDIT_ACTIVE_Active_count_norm                  0
bureau_CREDIT_ACTIVE_Bad debt_count                     0
bureau_CREDIT_ACTIVE_Bad debt_count_norm                0
                                                    ...  
client_bureau_balance_STATUS_X_count_norm_count         0
client_bureau_balance_STATUS_X_count_norm_mean     171269
client_bureau_balance_STATUS_X_count_norm_max      171269
client_bureau_balance_STATUS_X_count_norm_min      171269
client_bureau_balance_STATUS_X_count_norm_sum           0
Length: 212, dtype: int64

In [None]:



# Each child table is keyed by SK_ID_PREV (loan) and SK_ID_CURR (client). It is
# read, downcast, aggregated to one row per client and merged into `df`.
# Entries are (progress message, csv file name, column-name prefix).
child_tables = [
    ("Reading POS_CASH_BALANCE", 'POS_CASH_balance.csv', 'cash'),
    ("Reading Credit_card_balance", 'credit_card_balance.csv', 'credit'),
    ("Reading installment_payments", 'installments_payments.csv', 'installments'),
]

for message, file_name, table_name in child_tables:
    print(message)
    child = pd.read_csv(parentFolder+file_name)
    child = convert_types(child, print_info=True)
    child_by_client = aggregate_client(child, group_vars=['SK_ID_PREV', 'SK_ID_CURR'],
                                       df_names=[table_name, 'client'])

    df = df.merge(child_by_client, on='SK_ID_CURR', how='left')

    # Release the large intermediates before loading the next table
    del child, child_by_client
    gc.collect()

# No imputation is applied here, so the merged frame is saved as it is.
# (The previous pd.DataFrame(data=df, columns=cols) referenced `cols`, whose
# definition was commented out, and failed with a NameError.)
df_final = df
del df
gc.collect()

df_final.to_csv("./Res/PastMerged1.csv", index=False)

## final1

In [63]:
from collections import Counter
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler, NearMiss


# Directory that the cells below read the feature csv and the pickled labels from.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
parent = 'D:/Projects/Spyder/LoanDef/Res/'

## imports

In [2]:
# Load the feature table from final1.csv (rebinds the name `df`).
# NOTE(review): the feature-building cell above writes PastMerged1.csv, not
# final1.csv — presumably final1.csv is produced elsewhere; confirm.
df = pd.read_csv(parent+'final1.csv')

In [5]:
label_path = parent+'training_lbl1.pkl'
# Use a context manager so the file handle is closed after loading.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(label_path, 'rb') as label_file:
    label = pickle.load(label_file)
y = np.array(label)

In [7]:
# Drop the saved index column and the client id so only features remain.
# errors='ignore' keeps the cell re-runnable: the in-place drop raised a
# KeyError when executed a second time because the columns were already gone.
df = df.drop(columns=['Unnamed: 0', 'SK_ID_CURR'], errors='ignore')

In [16]:
# Number of non-zero entries in the label array.
np.count_nonzero(y)

24825

In [18]:
# Four times the non-zero label count printed by the previous cell.
# NOTE(review): presumably a candidate size for a resampled majority class — confirm.
24825*4

99300

In [20]:
# NOTE: np.arange yields 0..len(df)-1 in increasing order, so despite its name
# this index leaves the row order unchanged (nothing is shuffled).
rand_shuffle = np.arange(len(df))

In [22]:
# Reorder rows by position. `df[rand_shuffle]` would look the integers up as
# column labels instead of selecting rows, so positional .iloc is required.
x = df.iloc[rand_shuffle]
y = y[rand_shuffle]
# Positions of the positive (label == 1) rows
all_pos = np.where(y == 1)

In [25]:
# np.where(y == 1)[0]

array([     0,     26,     40, ..., 307481, 307489, 307509], dtype=int64)

In [33]:
# Hold out 20% of the rows for testing, stratified on y and with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=101, test_size=.20, stratify=y )

In [10]:
from sklearn.neural_network import MLPClassifier

In [36]:
# Settings for the multi-layer perceptron classifier.
solver='lbfgs'
alpha=1e-4
hls=(150, 100, 50)  # sizes of the three hidden layers
rand_state=229  # fixed seed for the classifier
clf_nn = MLPClassifier(
        solver=solver, alpha=alpha, hidden_layer_sizes=hls,
        random_state=rand_state, verbose=True)

In [37]:
# Fit the network on the training split.
# NOTE: the recorded run stopped at the lbfgs iteration limit; the warning
# below suggests increasing max_iter or scaling the data.
clf_nn.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(hidden_layer_sizes=(150, 100, 50), random_state=229,
              solver='lbfgs', verbose=True)

In [55]:

# Per-class precision, recall and F1 of the network on the held-out split.
print(classification_report(y_test.reshape(-1,1), clf_nn.predict(X_test)))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56538
           1       0.39      0.03      0.06      4965

    accuracy                           0.92     61503
   macro avg       0.66      0.51      0.51     61503
weighted avg       0.88      0.92      0.88     61503



In [57]:
# Logistic regression with class_weight='balanced', fitted on the training split.
# NOTE(review): the saved output below shows class_weight='Balanced' (capital B),
# so it appears to come from an earlier version of this cell — re-run to refresh
# it and the metrics that follow.
lr = LogisticRegression(C=1.0, class_weight='balanced', max_iter=1000 )
lr.fit(X_train, y_train)

LogisticRegression(class_weight='Balanced', max_iter=1000)

In [61]:
# Predict once and reuse the result for both reports instead of scoring the
# test split twice.
lr_pred = lr.predict(X_test)
print(confusion_matrix(y_test.reshape(-1,1), lr_pred))
print(classification_report(y_test.reshape(-1,1), lr_pred))

[[56504    34]
 [ 4916    49]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56538
           1       0.59      0.01      0.02      4965

    accuracy                           0.92     61503
   macro avg       0.76      0.50      0.49     61503
weighted avg       0.89      0.92      0.88     61503



In [None]:
# Under-sample with NearMiss using its default settings.
# NOTE(review): this resamples the full df/y rather than X_train/y_train; if the
# result is meant for training, resample the training split only — confirm.
nm = NearMiss()
X_res, y_res = nm.fit_resample(df, y)


In [66]:
# Number of samples per class after resampling.
print('Resampled dataset shape %s' % Counter(y_res))

In [1]:
# Reference: https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model

SyntaxError: invalid syntax (<ipython-input-1-e4c3ef879cde>, line 1)