In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import time
import seaborn as sns
sns.set()

In [2]:
# Global parameters for easy access. Ideally these would be the same but that's not a requirement.
# NUMBER_OF_CLUSTERS: number of KMeans clusters fitted on the invoices further down.
# NUMBER_OF_CATEGORIES: number of lateness classes; presumably this has to match the codes that
# categorize(..., "number") returns (0-4) -- TODO confirm whenever the category bins are changed.
NUMBER_OF_CLUSTERS = 10
NUMBER_OF_CATEGORIES = 5

In [3]:
pd.set_option('display.max_columns', None)
#Load data and keep index id as "ind_id"
data = pd.read_csv("invoice_data_with_features.csv", sep=",", index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: 'invoice_data_with_features.csv'

In [None]:
# Parse the three date columns into datetimes.
for date_col in ('TransDate', 'DueDate', 'SettleTransDate'):
    data[date_col] = pd.to_datetime(data[date_col])

# Identifier / status / flag columns are stored as pandas categoricals.
for cat_col in ('Customer_NKey', 'last_invoice_paid_status', 'Invoice', 'is_late'):
    data[cat_col] = data[cat_col].astype('category')

In [None]:
data.info()

## Mini-exploration

In [None]:
data.groupby("Customer_NKey").count()

In [None]:
def visualize_plots(dataset, column_name):
    """Plot the distribution of one column of `dataset` in two side-by-side panels.

    Left panel: seaborn histogram with a KDE overlay.
    Right panel: pandas histogram with 50 bins.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains `column_name`.
    column_name : str
        Name of the (numeric) column to plot.

    Returns
    -------
    tuple
        ``(fig, axes)`` so the caller can tweak or save the figure.
    """
    # Two panels are requested explicitly; with subplots(1, 1) matplotlib hands back a
    # single Axes object that cannot be indexed.
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    sns.histplot(data=dataset, x=column_name, kde=True, ax=axes[0])
    axes[0].set_title(f"{column_name} (histogram + KDE)")

    # Look the column up by name; attribute access would read a column literally called "column".
    dataset[column_name].plot.hist(bins=50, ax=axes[1])
    axes[1].set_title(f"{column_name} (50 bins)")
    axes[1].set_xlabel(column_name)

    fig.tight_layout()
    return fig, axes

In [None]:
data[(data['days_late']>=(-90) ) & (data['days_late']<=(90))]["days_late"].hist(bins=50)

In [None]:
data.groupby("Customer_NKey").count().ind_id.plot.hist()

In [None]:
data.groupby("Customer_NKey").count().ind_id.plot.density()

In [None]:
data.groupby("Customer_NKey").count().ind_id.sort_values(ascending=False).plot.bar()

In [None]:
data.describe()

## Creating categories

In [None]:
def categorize(x, info):
    """Map a lateness value (in days) to a coarse payment-timing category.

    Parameters
    ----------
    x : int or float
        Days late; negative values mean the invoice was paid early.
    info : str
        "number" returns the integer category code, "label" returns the
        human-readable label (currently a placeholder string).

    Returns
    -------
    int or str
        For "number": 0 (x < -7), 1 (-7 <= x <= 7), 2 (7 < x <= 30),
        3 (30 < x <= 90), 4 (x > 90).
        For "label": the placeholder "Disabled for testing".

    Raises
    ------
    ValueError
        If `info` is neither "number" nor "label".
    """
    if info == "number":
        # Plain comparisons instead of `x in range(...)`: a range only contains whole numbers,
        # so a fractional value such as 7.5 used to fall through to category 4.
        if x < -7:
            return 0
        #### ***How to separate values around 0? Depends. This can be changed depending on the need.
        #### ***FOR NOW CONSIDERING 0-7 days early and late as one category, separating them at 0
        #### might not be best. Change if needed.
        if x <= 7:
            return 1
        if x <= 30:
            return 2
        if x <= 90:
            return 3
        # Everything above 90 ends up here; NaN does too, because every comparison above is False.
        return 4

    if info == "label":
        return "Disabled for testing"
        # Use this when the optimal categories are decided
        #
        #         if x<(-90):
        #             return('Over 90 D early')
        #         elif x in range(-90,-60):
        #             return('61-90 D early')
        #         elif x in range(-60,-30):
        #             return('31-60 D early')
        #         elif x in range(-30,-7):
        #             return('8-30 D early')
        #         #### ***FOR NOW CONSIDERING 0-7 days early and late as one category, separating them at 0 might not be best
        #         #Change if needed
        #         elif x in range(-7, 1):
        #             return('-7 to 0 days')
        #         elif x in range(1, 8):
        #             return('1 to 7 days')
        #         ####
        #         elif x in range(8, 31):
        #             return('8-30 D late')
        #         elif x in range(31, 61):
        #             return('31-60 D late')
        #         elif x in range(61, 91):
        #             return('61-90 D late')
        #         else:
        #             return('Over 90 D late')

    raise ValueError(f"info must be 'number' or 'label', got {info!r}")

In [None]:
## Categorize every invoice

In [None]:
#Get category labels
data["category"] = data.days_late.apply(lambda x: categorize(x, "label"))

In [None]:
#Get category numbers
data["category_no"] = data.days_late.apply(lambda x: categorize(x, "number"))

## One-hot coding categorical features

In [None]:
#Getting dummies for feature #10 last_invoice_paid_status
data = pd.get_dummies(data, columns=['last_invoice_paid_status'], prefix = ['last_invoice_paid_status'])

## Separating train, validation & test data

### Based on recency

In [None]:
#CHANGE PERCENTAGES OF TRAIN/VAL/SPLIT HERE
#Splitting the dataframe (must be already sorted by date)
def train_validate_test_split(df, train_percent=.7, validate_percent=.15):
    """Cut a frame into three consecutive positional slices: train, validation, test.

    The rows are taken in their current order, so `df` must already be sorted
    (by date) before calling. The first ``int(train_percent * n)`` rows go to
    train, the next ``int(validate_percent * n)`` rows to validation and the
    remaining rows to test.

    Returns
    -------
    tuple
        ``(train, validate, test)`` frames.
    """
    total_rows = len(df.index)
    first_cut = int(train_percent * total_rows)
    second_cut = first_cut + int(validate_percent * total_rows)
    return df.iloc[:first_cut], df.iloc[first_cut:second_cut], df.iloc[second_cut:]

In [None]:
def split_dataframes(dataset):
    """Split every customer's invoices chronologically and stack the pieces.

    For each distinct ``Customer_NKey`` (in order of first appearance) the
    customer's rows are sorted by ``TransDate`` ascending and passed to
    ``train_validate_test_split``; the per-customer train, validation and
    test pieces are then concatenated into three frames.

    Returns
    -------
    tuple
        ``(train, validate, test)`` frames covering all customers.
    """
    per_customer_splits = [
        train_validate_test_split(
            dataset[dataset["Customer_NKey"] == customer].sort_values(by="TransDate", ascending=True)
        )
        for customer in dataset["Customer_NKey"].unique()
    ]

    train_parts = [pieces[0] for pieces in per_customer_splits]
    validate_parts = [pieces[1] for pieces in per_customer_splits]
    test_parts = [pieces[2] for pieces in per_customer_splits]

    return pd.concat(train_parts), pd.concat(validate_parts), pd.concat(test_parts)

In [None]:
# Splitting data into train, validate, and test dataframes. 
# Splitting based: percentage (%) of most recent invoices per each customer
train_df, validate_df, test_df = split_dataframes(data)
train_val_combined_df = pd.concat([train_df, validate_df])

In [None]:
#Compare
train_df.describe()

In [None]:
test_df.describe()

### Late days between -90 and 90 distribution comparison

In [None]:
train_df[(train_df['days_late']>=(-90) ) & (train_df['days_late']<=(90))]["days_late"].hist(bins=50)

In [None]:
test_df[(test_df['days_late']>=(-90) ) & (test_df['days_late']<=(90))]["days_late"].hist(bins=50)

### Density comparison

In [None]:
# NOTE(review): value_counts() turns days_late into "how many invoices share each value", so this
# draws the density of those counts rather than of days_late itself. If the lateness distribution
# is what should be compared, value_counts() probably has to go -- TODO confirm intent.
train_df["days_late"].value_counts().plot.density()

In [None]:
# NOTE(review): value_counts() turns days_late into "how many invoices share each value", so this
# draws the density of those counts rather than of days_late itself. If the lateness distribution
# is what should be compared, value_counts() probably has to go -- TODO confirm intent.
test_df["days_late"].value_counts().plot.density()

# Preparation for training

In [None]:
#For training the models
import xgboost as xgb
import sklearn.metrics as sm
from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.linear_model import LinearRegression,Ridge,Lasso, LogisticRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,cross_val_predict,cross_validate,RandomizedSearchCV
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score,max_error,r2_score,median_absolute_error,mean_squared_log_error
from sklearn.preprocessing import MinMaxScaler,normalize,StandardScaler,RobustScaler, Normalizer
from sklearn.metrics import f1_score, accuracy_score
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier


In [None]:
# #Optional (depending if training based on clusters or clientID)
# #Getting dummies for Customer_NKey (one column per client ID)
# data = pd.get_dummies(data, columns=['Customer_NKey'], prefix = ['Customer'])

In [None]:
#Categorical values for training
categorical_cols = ['last_invoice_paid_status_-1.0','last_invoice_paid_status_0.0', 'last_invoice_paid_status_1.0']
# Continuous/ordinal values for training
cont_cols = ['AmountEUR','days_between_created_and_due',
       'total_number_invoices', 'total_sum_invoices',
       'average_days_late_and_early', 'average_days_late_total',
       'average_days_early_total', 'total_paid_invoices',
       'sum_amount_paid_invoices', 'total_invoices_late',
       'total_invoices_early', 'sum_amount_late_invoices',
       'sum_amount_early_invoices', 'total_outstanding_invoices',
       'total_outstanding_late', 'sum_total_outstanding',
       'sum_late_outstanding', 'average_days_late', 'average_days_early',
       'average_days_outstanding_late', 'std_days_late', 'std_days_early',
       'std_days_outstanding_late', 'ratio_1_late', 'ratio_1_early',
       'ratio_2_late', 'ratio_2_early', 'ratio_3_late', 'ratio_4_late']

#Combined list
column_names_for_training = categorical_cols + cont_cols

## Clustering invoices and clients

In [None]:
# # #Test different numbers of clusters (for exploration purpose)
# # #This is commented out so it wouldn't run every time. 

# from sklearn.cluster import KMeans
# # Finding optimal number of clusters using elbow method 
# minmaxscaler = MinMaxScaler()
# kmeans_df_train = minmaxscaler.fit_transform(train_val_combined_df[column_names_for_training])
# # Finding optimal K value
# possible_K_values = [i for i in range(2,40)]
# # Inertia list
# inertia = []
# for each_value in possible_K_values:
#     # iterate through, taking each value from 
#     KMeans_model = KMeans(n_clusters=each_value, random_state=1)
#     # fit it
#     KMeans_model.fit(kmeans_df_train)
#     # append the inertia to our array
#     inertia.append(KMeans_model.inertia_)
    
# plt.plot(possible_K_values, inertia)
# plt.title('The Elbow Method')
# plt.xlabel('Number of Clusters')
# plt.ylabel('Inertia')
# plt.show()

In [None]:
#Fitting KMeans predefined number of clusters. 10 would be reasonable, but can be changed. 
from sklearn.cluster import KMeans
minmaxscaler = MinMaxScaler()
kmeans_df_train = minmaxscaler.fit_transform(train_val_combined_df[column_names_for_training])
KMeans_model = KMeans(n_clusters=NUMBER_OF_CLUSTERS, random_state=1)
# fit it
KMeans_model.fit(kmeans_df_train)

In [None]:
# Assigning clusters to each invoice
k_means_df = train_val_combined_df.copy(deep=True)
k_means_df["invoice_cluster"] = KMeans_model.labels_

In [None]:
# Assign each customer the most frequent cluster among its train+validation invoices.
# observed=True: Customer_NKey is a categorical column, so without it the groupby also produces
# groups for customer categories that have no rows in k_means_df; such empty groups have no
# "most frequent cluster" and would surface as errors or NaN entries in the mapping.
customer_clusters = (
    k_means_df
    .groupby("Customer_NKey", observed=True)["invoice_cluster"]
    .agg(lambda clusters: clusters.value_counts().index[0])
)

In [None]:
def add_cluster_labels(dataset, customerclusters, unknown_cluster=None):
    """Return a copy of `dataset` with an int64 ``client_cluster`` column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``Customer_NKey`` column. It is not modified.
    customerclusters : pandas.Series
        Cluster id per customer, indexed by customer key.
    unknown_cluster : int, optional
        Cluster id to use for customers that have no (non-null) entry in
        `customerclusters`. When left at None such customers raise an error
        instead of silently receiving a meaningless id.

    Returns
    -------
    pandas.DataFrame
        Deep copy of `dataset` plus the ``client_cluster`` column.

    Raises
    ------
    ValueError
        If a customer has no cluster and `unknown_cluster` is None.
    """
    labelled = dataset.copy(deep=True)
    cluster_by_customer = customerclusters.to_dict()

    # One vectorised lookup instead of one boolean scan per customer.
    # astype(object): mapping a categorical column maps its categories and may hand back a
    # categorical/Index result; going through plain objects keeps the result an ordinary Series.
    cluster_ids = labelled["Customer_NKey"].astype(object).map(cluster_by_customer)

    missing = cluster_ids.isna()
    if missing.any():
        if unknown_cluster is None:
            unknown_customers = sorted(set(labelled.loc[missing, "Customer_NKey"].astype(str)))
            raise ValueError(
                f"No cluster available for customers: {unknown_customers}. "
                "Pass unknown_cluster=<id> to assign them a fallback cluster."
            )
        cluster_ids = cluster_ids.fillna(unknown_cluster)

    # Cast once, after every row has a value; casting a column that still holds NaN to int64
    # produces garbage ids.
    labelled["client_cluster"] = cluster_ids.astype("int64")
    return labelled

In [None]:
# Add customer clusters to each dataset
train_df = add_cluster_labels(train_df, customer_clusters)
validate_df = add_cluster_labels(validate_df, customer_clusters)
test_df = add_cluster_labels(test_df, customer_clusters)
train_val_combined_df = add_cluster_labels(train_val_combined_df, customer_clusters)

## Separate train, val and test dataframes for cluster based or clientID based training

### For clusters

In [None]:
cluster_train_df = train_df.copy(deep=True)
cluster_validate_df = validate_df.copy(deep=True)
cluster_test_df = test_df.copy(deep=True)
cluster_train_val_combined_df = train_val_combined_df.copy(deep=True)

In [None]:
# One-hot encode CLIENT CLUSTERS.
# Every split is encoded against the same fixed list of cluster ids (0 .. NUMBER_OF_CLUSTERS-1),
# so each frame ends up with the identical NUMBER_OF_CLUSTERS dummy columns in the same order,
# even when some cluster does not occur in that particular split.
_client_cluster_ids = list(range(NUMBER_OF_CLUSTERS))


def _client_cluster_dummies(frame):
    """Return `frame` with `client_cluster` replaced by one dummy column per possible cluster id."""
    encoded = frame.assign(
        client_cluster=pd.Categorical(frame["client_cluster"], categories=_client_cluster_ids)
    )
    return pd.get_dummies(encoded, columns=['client_cluster'], prefix=['client_cluster'])


cluster_train_df = _client_cluster_dummies(cluster_train_df)
cluster_validate_df = _client_cluster_dummies(cluster_validate_df)
cluster_test_df = _client_cluster_dummies(cluster_test_df)
cluster_train_val_combined_df = _client_cluster_dummies(cluster_train_val_combined_df)

In [None]:
# Names of the client-cluster dummy columns.
# They are selected by prefix rather than by taking the last NUMBER_OF_CLUSTERS columns: when fewer
# distinct client clusters exist than NUMBER_OF_CLUSTERS, a positional slice would also pick up
# unrelated trailing columns (including target columns) as features.
colnames_client_clusters = [
    col for col in cluster_train_df.columns if str(col).startswith("client_cluster_")
]

In [None]:
cols_for_cluster_training = categorical_cols + cont_cols + colnames_client_clusters

In [None]:
#Creating x and y data
### Train
#for linear models
cluster_train_df_linear_x = cluster_train_df[cols_for_cluster_training]
cluster_train_df_linear_y = cluster_train_df["days_late"]
#for logistic models
cluster_train_df_logistic_x = cluster_train_df[cols_for_cluster_training]
cluster_train_df_logistic_y = cluster_train_df["category_no"]

In [None]:
#Creating x and y data
### Validate
#for linear models
cluster_validate_df_linear_x = cluster_validate_df[cols_for_cluster_training]
cluster_validate_df_linear_y = cluster_validate_df["days_late"]
#for logistic models
cluster_validate_df_logistic_x = cluster_validate_df[cols_for_cluster_training]
cluster_validate_df_logistic_y = cluster_validate_df["category_no"]

In [None]:
#Creating x and y data
### Test
#for linear models
cluster_test_df_linear_x = cluster_test_df[cols_for_cluster_training]
cluster_test_df_linear_y = cluster_test_df["days_late"]
#for logistic models
cluster_test_df_logistic_x = cluster_test_df[cols_for_cluster_training]
cluster_test_df_logistic_y = cluster_test_df["category_no"]

In [None]:
#Creating x and y data
### Train_val combined (just in case for CV)
#for linear models
cluster_train_val_combined_linear_x = cluster_train_val_combined_df[cols_for_cluster_training]
cluster_train_val_combined_linear_y = cluster_train_val_combined_df["days_late"]
#for logistic models
cluster_train_val_combined_logistic_x = cluster_train_val_combined_df[cols_for_cluster_training]
cluster_train_val_combined_logistic_y = cluster_train_val_combined_df["category_no"]

### For Client-ID based

In [None]:
clientid_train_df = train_df.copy(deep=True)
clientid_validate_df = validate_df.copy(deep=True)
clientid_test_df = test_df.copy(deep=True)
clientid_train_val_combined_df = train_val_combined_df.copy(deep=True)

In [None]:
#Getting dummies for CLIENT ID
clientid_train_df = pd.get_dummies(clientid_train_df, columns=['Customer_NKey'], prefix = ['customer'])
clientid_validate_df = pd.get_dummies(clientid_validate_df, columns=['Customer_NKey'], prefix = ['customer'])
clientid_test_df = pd.get_dummies(clientid_test_df, columns=['Customer_NKey'], prefix = ['customer'])
clientid_train_val_combined_df = pd.get_dummies(clientid_train_val_combined_df, columns=['Customer_NKey'], prefix = ['customer'])

In [None]:
# Names of the client-ID dummy columns.
# They are selected by the 'customer_' prefix used in get_dummies above instead of slicing the last
# nunique() columns: Customer_NKey is categorical, so get_dummies emits one column per category,
# which can be more than the number of customers actually present in train_df.
# NOTE(review): assumes no other column name starts with 'customer_' -- confirm against the CSV.
colnames_client_ids = [
    col for col in clientid_train_df.columns if str(col).startswith("customer_")
]

In [None]:
cols_for_cliendid_training = categorical_cols + cont_cols + colnames_client_ids

In [None]:
#Creating x and y data
### Train
#for linear models
clientid_train_df_linear_x = clientid_train_df[cols_for_cliendid_training]
clientid_train_df_linear_y = clientid_train_df["days_late"]
#for logistic models
clientid_train_df_logistic_x = clientid_train_df[cols_for_cliendid_training]
clientid_train_df_logistic_y = clientid_train_df["category_no"]

In [None]:
#Creating x and y data
### Validate
#for linear models
clientid_validate_df_linear_x = clientid_validate_df[cols_for_cliendid_training]
clientid_validate_df_linear_y = clientid_validate_df["days_late"]
#for logistic models
clientid_validate_df_logistic_x = clientid_validate_df[cols_for_cliendid_training]
clientid_validate_df_logistic_y = clientid_validate_df["category_no"]

In [None]:
#Creating x and y data
### Test
#for linear models
clientid_test_df_linear_x = clientid_test_df[cols_for_cliendid_training]
clientid_test_df_linear_y = clientid_test_df["days_late"]
#for logistic models
clientid_test_df_logistic_x = clientid_test_df[cols_for_cliendid_training]
clientid_test_df_logistic_y = clientid_test_df["category_no"]

In [None]:
#Creating x and y data
### Train_val combined (just in case for CV)
#for linear models
clientid_train_val_combined_linear_x = clientid_train_val_combined_df[cols_for_cliendid_training]
clientid_train_val_combined_linear_y = clientid_train_val_combined_df["days_late"]
#for logistic models
clientid_train_val_combined_logistic_x = clientid_train_val_combined_df[cols_for_cliendid_training]
clientid_train_val_combined_logistic_y = clientid_train_val_combined_df["category_no"]

# More pre-processing: Fixing class balance + scaling data

In [None]:
#For scaling data
def scale_datasets(dataset_train, dataset_val, continuous_cols):
    """Standardise the continuous columns of a train/validation pair.

    A StandardScaler is fitted on the training frame's `continuous_cols` only
    and then applied to both frames. The inputs are left untouched; scaled
    deep copies are returned.

    Returns
    -------
    tuple
        ``(scaled_train, scaled_val)`` frames.
    """
    scaled_train = dataset_train.copy(deep=True)
    scaled_val = dataset_val.copy(deep=True)

    standardizer = StandardScaler()
    # Fit on train only, then reuse the fitted statistics for the validation frame.
    scaled_train[continuous_cols] = standardizer.fit_transform(scaled_train[continuous_cols])
    scaled_val[continuous_cols] = standardizer.transform(scaled_val[continuous_cols])

    return scaled_train, scaled_val

In [None]:
def balance_classes(train_x_data, train_y_data):
    """Resample the training data with SMOTE combined with Tomek links.

    The Tomek-links cleaning step is configured with
    ``sampling_strategy='majority'``; the SMOTE step uses the SMOTETomek defaults.

    Returns
    -------
    tuple
        ``(x_balanced, y_balanced)`` as produced by ``fit_resample``.
    """
    tomek_cleaner = TomekLinks(sampling_strategy='majority')
    combined_sampler = SMOTETomek(tomek=tomek_cleaner)
    balanced_x, balanced_y = combined_sampler.fit_resample(train_x_data, train_y_data)
    return balanced_x, balanced_y

# Training

In [None]:
def run_model_categories(train_x, train_y, val_x, val_y, continuouscols, classifier, name_for_model,
                         scale=False, balance=False):
    """Fit a scikit-learn style classifier and report its validation performance.

    Prints accuracy and weighted F1 on the validation data and shows the
    confusion matrix (normalised by the total number of samples) as a heatmap.

    Parameters
    ----------
    train_x, train_y : training features and labels.
    val_x, val_y : validation features and labels.
    continuouscols : list of str
        Columns to standardise; only used when `scale` is True.
    classifier : estimator exposing ``fit`` and ``predict``. It is fitted in place.
    name_for_model : str
        Name used in the printed report.
    scale : bool, default False
        Standardise `continuouscols` with ``scale_datasets`` (fitted on train,
        applied to train AND validation) before fitting.
    balance : bool, default False
        Resample the training data with ``balance_classes`` before fitting.

    Returns
    -------
    tuple
        ``(train_x_used, train_y_used, fitted_model)`` -- the training data
        actually fed to the model (after optional scaling/balancing) and the model.
    """
    data_train_x, data_train_y, data_val_x = train_x, train_y, val_x

    if scale:
        print("Scaling data...")
        # The scaled validation frame must be the one used for prediction below; predicting on
        # the unscaled validation data with a model trained on scaled data ruins the scores.
        data_train_x, data_val_x = scale_datasets(train_x, val_x, continuouscols)

    if balance:
        # Only announced when it really happens.
        print("Balancing classes...")
        data_train_x, data_train_y = balance_classes(data_train_x, data_train_y)

    model = classifier
    print("Fitting model...")
    model.fit(data_train_x, data_train_y)
    predictions = model.predict(data_val_x)

    f1score = f1_score(val_y, predictions, average="weighted", zero_division=1)
    acc_score = accuracy_score(val_y, predictions)

    cf_mat = confusion_matrix(val_y, predictions)
    sns.heatmap(cf_mat/np.sum(cf_mat), cmap='Blues',
                annot=True, fmt='.2%')
    plt.xlabel('Predicted label', fontsize = 15)
    plt.ylabel('True label', fontsize = 15)
    plt.show()

    print(f"{name_for_model}: Accuracy: {acc_score*100}%")
    print(f"{name_for_model}: F1 score: {f1score*100}%")

    return data_train_x, data_train_y, model

In [None]:
def run_model_xgb(train_x, train_y, val_x, val_y, continuouscols, name_for_model, verbose_eval=False,
                  num_class=None):
    """Train a multi-class XGBoost model with early stopping and report validation performance.

    Prints accuracy and weighted F1 on the validation data and shows the
    confusion matrix (normalised by the total number of samples) as a heatmap.

    Parameters
    ----------
    train_x, train_y : training features and integer class labels.
    val_x, val_y : validation features and labels; also used for early stopping.
    continuouscols : unused, kept so the signature parallels run_model_categories.
    name_for_model : str
        Name used in the printed report.
    verbose_eval : bool or int
        Forwarded to ``xgb.train``.
    num_class : int, optional
        Number of target classes. Defaults to NUMBER_OF_CATEGORIES.

    Returns
    -------
    tuple
        ``(train_x, train_y)`` exactly as they were used for training.
        NOTE(review): the trained booster is not returned; the return shape is kept
        as it was so existing callers continue to work.
    """
    data_train_y = train_y
    data_train_x = train_x

    # The labels are lateness categories, so the class count comes from the number of
    # categories and not from the number of KMeans clusters.
    if num_class is None:
        num_class = NUMBER_OF_CATEGORIES

    dtrain = xgb.DMatrix(data_train_x, label=data_train_y)
    dtest = xgb.DMatrix(val_x, label=val_y)
    param_list = [("objective", "multi:softmax"), ("eval_metric", "merror"), ("num_class", num_class)]

    # Number of trees
    n_rounds = 600

    # if nothing seems to improve for 50 iterations - stop
    early_stopping = 50
    # The last entry of the list (validation) is the one early stopping monitors.
    eval_list = [(dtrain, "train"), (dtest, "validation")]

    print("Fitting model...")
    model = xgb.train(param_list, dtrain, n_rounds, evals=eval_list,
                      early_stopping_rounds=early_stopping, verbose_eval=verbose_eval)

    # Predict with the best iteration only. `ntree_limit` was removed in xgboost 2.0 in favour of
    # `iteration_range`; releases older than 1.4 only know `ntree_limit`, hence the fallback.
    try:
        predictions = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
    except TypeError:
        predictions = model.predict(dtest, ntree_limit=model.best_iteration + 1)
    # multi:softmax returns the class ids as floats.
    predictions = predictions.astype(int)

    f1score = f1_score(val_y, predictions, average="weighted", zero_division=1)
    acc_score = accuracy_score(val_y, predictions)

    cf_mat = confusion_matrix(val_y, predictions)
    sns.heatmap(cf_mat/np.sum(cf_mat), cmap='Blues',
                annot=True, fmt='.2%')
    plt.xlabel('Predicted label', fontsize = 15)
    plt.ylabel('True label', fontsize = 15)
    plt.show()

    print(f"{name_for_model}: Accuracy: {acc_score*100}%")
    print(f"{name_for_model}: F1 score: {f1score*100}%")

    return data_train_x, data_train_y

## Client-ID based

#### Logistic Regression

In [None]:
%%time
# CLIENTID based LogisticRegression()
# Choose model
model_init = LogisticRegression(solver = 'lbfgs', random_state=1, max_iter=500)
# Run model
model1 = run_model_categories(clientid_train_df_logistic_x, clientid_train_df_logistic_y, 
                        clientid_validate_df_logistic_x, clientid_validate_df_logistic_y,
                       cont_cols, model_init, "LogisticRegression")

#### AdaBoostClassifier

In [None]:
%%time
# CLIENTID based AdaBoostClassifier()
# Choose model
model_init = AdaBoostClassifier()

# Run model
model2 = run_model_categories(clientid_train_df_logistic_x, clientid_train_df_logistic_y, 
                        clientid_validate_df_logistic_x, clientid_validate_df_logistic_y,
                       cont_cols, model_init, "AdaBoostClassifier")

#### KNeighborsClassifier

In [None]:
%%time
# CLIENTID based KNeighborsClassifier()
# Choose model
model_init = KNeighborsClassifier()

# Run model
model3 = run_model_categories(clientid_train_df_logistic_x, clientid_train_df_logistic_y, 
                        clientid_validate_df_logistic_x, clientid_validate_df_logistic_y,
                       cont_cols, model_init, "KNeighborsClassifier")

#### BaggingClassifier

In [None]:
%%time
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# CLIENTID based BaggingClassifier()
# Choose model
model_init = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=50), n_estimators=3, max_samples=0.8)

# Run model
model4 = run_model_categories(clientid_train_df_logistic_x, clientid_train_df_logistic_y, 
                        clientid_validate_df_logistic_x, clientid_validate_df_logistic_y,
                       cont_cols, model_init, "BaggingClassifier+KNeighborsClassifier")

#### XGBoost

In [None]:
%%time
#XGBoost
# Run model
model5 = run_model_xgb(clientid_train_df_logistic_x, clientid_train_df_logistic_y, 
                        clientid_validate_df_logistic_x, clientid_validate_df_logistic_y,
                       cont_cols, "XGBoost", False)