## Importing necessary libraries

In [164]:
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway, chi2_contingency
import os
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import RFE
import statsmodels.api as sm
from scipy.stats import boxcox,t
import xgboost as xgb
from sklearn.impute import SimpleImputer

pd.options.display.float_format = '{:,.2f}'.format
# Set pandas to display all rows
pd.set_option('display.max_rows', None)

In [165]:
data = pd.read_csv(r'C:\Users\amit7\OneDrive\Documents\VS Code\newest_invoices_data.csv')

In [166]:
data.head()

Unnamed: 0,id_invoice,issuedDate,country,service,total,discount,tax,invoiceStatus,balance,dueDate,client
0,67,2023-07-10,Cuba,AI Solution,8685.46,48.85,593.26,Paid,9229.87,2023-08-03,Company B
1,57,2022-10-05,Uruguay,Mobile App Development,3196.11,437.68,131.34,Pending,2889.77,2022-10-19,Company A
2,79,2022-08-01,Liberia,UI/UX Design,294.28,48.23,7.74,Overdue,253.79,2022-08-09,Company D
3,96,2022-11-30,Mongolia,Mobile App Development,2720.38,469.24,164.15,Overdue,2415.29,2022-12-16,Company D
4,87,2023-03-16,Cyprus,E-commerce Development,2202.77,410.17,214.34,Pending,2006.94,2023-04-04,Company D


In [167]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id_invoice     10000 non-null  int64  
 1   issuedDate     10000 non-null  object 
 2   country        10000 non-null  object 
 3   service        10000 non-null  object 
 4   total          10000 non-null  float64
 5   discount       10000 non-null  float64
 6   tax            10000 non-null  float64
 7   invoiceStatus  10000 non-null  object 
 8   balance        10000 non-null  float64
 9   dueDate        10000 non-null  object 
 10  client         10000 non-null  object 
dtypes: float64(4), int64(1), object(6)
memory usage: 859.5+ KB


In [168]:
data["issuedDate"] = pd.to_datetime(data["issuedDate"])
data["dueDate"] = pd.to_datetime(data["dueDate"])

In [169]:
data.describe()

Unnamed: 0,id_invoice,issuedDate,total,discount,tax,balance,dueDate
count,10000.0,10000,10000.0,10000.0,10000.0,10000.0,10000
mean,50.48,2023-01-08 10:28:59.519999744,3337.41,332.53,166.44,3171.33,2023-01-26 23:30:02.880000
min,1.0,2022-07-10 00:00:00,200.48,0.04,0.0,169.02,2022-07-17 00:00:00
25%,25.0,2022-10-09 00:00:00,1375.18,81.07,41.17,1322.91,2022-10-28 00:00:00
50%,50.0,2023-01-09 00:00:00,2864.52,217.94,112.61,2713.91,2023-01-28 00:00:00
75%,76.0,2023-04-10 00:00:00,4696.57,479.22,240.07,4445.29,2023-04-28 00:00:00
max,100.0,2023-07-10 00:00:00,9997.62,1938.56,972.98,10691.48,2023-08-08 00:00:00
std,29.02,,2353.53,332.98,165.8,2250.6,


In [170]:
for i in list(data.columns):
    print(f"Number of Unique Values in the column {i} is {data[i].nunique()}")

Number of Unique Values in the column id_invoice is 100
Number of Unique Values in the column issuedDate is 366
Number of Unique Values in the column country is 243
Number of Unique Values in the column service is 6
Number of Unique Values in the column total is 9923
Number of Unique Values in the column discount is 9257
Number of Unique Values in the column tax is 8657
Number of Unique Values in the column invoiceStatus is 3
Number of Unique Values in the column balance is 9927
Number of Unique Values in the column dueDate is 388
Number of Unique Values in the column client is 5


So, everything is fine since we have everything sorted and the id_invoice data is **ladchat bhosdiwala**

## Exploratory Data Analysis

In [171]:
data["issuedMonth"] = data["issuedDate"].dt.month
data["issuedyear"] = data["issuedDate"].dt.year
data["dueMonth"] = data["dueDate"].dt.month
data["dueyear"] = data["dueDate"].dt.year

In [172]:
data.drop(columns=["id_invoice"], inplace=True)

In [173]:
data["c2d"] = (data["dueDate"] - data["issuedDate"]).dt.days
# data["c2d"] = pd.to_numeric(data["c2d"])

In [174]:
cat_col = []
for i in data.columns:
    if data[i].dtype == "object":
        cat_col.append(i)

In [175]:
status_mapping = {
    'Paid': 0,
    'Pending': 2,
    'Overdue': 1
}

data['invoiceStatus'] = data['invoiceStatus'].map(status_mapping)

In [176]:
data.head()

Unnamed: 0,issuedDate,country,service,total,discount,tax,invoiceStatus,balance,dueDate,client,issuedMonth,issuedyear,dueMonth,dueyear,c2d
0,2023-07-10,Cuba,AI Solution,8685.46,48.85,593.26,0,9229.87,2023-08-03,Company B,7,2023,8,2023,24
1,2022-10-05,Uruguay,Mobile App Development,3196.11,437.68,131.34,2,2889.77,2022-10-19,Company A,10,2022,10,2022,14
2,2022-08-01,Liberia,UI/UX Design,294.28,48.23,7.74,1,253.79,2022-08-09,Company D,8,2022,8,2022,8
3,2022-11-30,Mongolia,Mobile App Development,2720.38,469.24,164.15,1,2415.29,2022-12-16,Company D,11,2022,12,2022,16
4,2023-03-16,Cyprus,E-commerce Development,2202.77,410.17,214.34,2,2006.94,2023-04-04,Company D,3,2023,4,2023,19


In [177]:
cat_col.remove("invoiceStatus")
cat_col

['country', 'service', 'client']

In [178]:
data['invoiceStatus'].value_counts()

invoiceStatus
0    3364
1    3324
2    3312
Name: count, dtype: int64

In [179]:
df = data[data['invoiceStatus'] != 2 ]

In [180]:
df['invoiceStatus'].value_counts()

invoiceStatus
0    3364
1    3324
Name: count, dtype: int64

In [181]:
df['issuedDate'].describe()

count                             6688
mean     2023-01-08 17:48:35.311004928
min                2022-07-10 00:00:00
25%                2022-10-10 00:00:00
50%                2023-01-09 00:00:00
75%                2023-04-11 00:00:00
max                2023-07-10 00:00:00
Name: issuedDate, dtype: object

In [182]:
profile = df[(df["issuedDate"] >= pd.Timestamp("2022-07-10")) & (df["issuedDate"] <= pd.Timestamp("2022-10-09"))]
train = df[(df["issuedDate"] >= pd.Timestamp("2022-10-10")) & (df["issuedDate"] <= pd.Timestamp("2023-04-09"))]
test = df[(df["issuedDate"] >= pd.Timestamp("2023-04-10")) & (df["issuedDate"] <= pd.Timestamp("2023-07-10"))]

In [183]:
print(train.shape)
print(test.shape)
print(profile.shape)

(3333, 15)
(1695, 15)
(1660, 15)


In [184]:
train.drop(columns=["issuedDate", "dueDate"], inplace=True)
test.drop(columns=["issuedDate", "dueDate"], inplace=True)
profile.drop(columns=["issuedDate", "dueDate"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.drop(columns=["issuedDate", "dueDate"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(columns=["issuedDate", "dueDate"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  profile.drop(columns=["issuedDate", "dueDate"], inplace=True)


## Model Fitting

In [185]:
train.head()

Unnamed: 0,country,service,total,discount,tax,invoiceStatus,balance,client,issuedMonth,issuedyear,dueMonth,dueyear,c2d
3,Mongolia,Mobile App Development,2720.38,469.24,164.15,1,2415.29,Company D,11,2022,12,2022,16
5,New Caledonia,E-commerce Development,2700.06,153.57,31.26,0,2577.75,Company B,2,2023,2,2023,8
7,Kiribati,Web Development,845.23,35.66,34.76,0,844.33,Company C,2,2023,3,2023,14
13,Lesotho,Mobile App Development,4553.6,710.85,9.05,1,3851.8,Company D,3,2023,4,2023,29
18,Bangladesh,Mobile App Development,4828.61,410.12,71.13,0,4489.62,Company E,11,2022,12,2022,29


In [186]:
features = ['country', 'service', 'total', 'client', 'issuedMonth', 'issuedyear', 'dueMonth', 'dueyear',
       'c2d']

In [187]:
target = 'invoiceStatus'

In [188]:
def train_model(train_df, features, target, model):
    X_train = train_df[features]
    y_train = train_df[target]

    # Identify categorical and numerical columns
    categorical_features = X_train.select_dtypes(include=['object']).columns
    numerical_features = X_train.select_dtypes(exclude=['object']).columns

    # Define the numerical pipeline
    numerical_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler())
    ])

    # Define the ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_pipeline, numerical_features),
            ('cat', OneHotEncoder(), categorical_features)
        ]
    )

    # Create the pipeline with preprocessor and classifier
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Fit the model
    model_pipeline.fit(X_train, y_train)

    return model_pipeline

In [189]:
def evaluate_model(model, df,features, target, thresholds=[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.90,0.95, 0.99]):

    X = df[features]
    y = df[target]

    # Get the predicted probabilities for the positive class
    y_prob = model.predict_proba(X)[:, 1]
    
    # Initialize a list to store the results
    results = []
    
    for threshold in thresholds:
        # Convert probabilities to binary predictions based on the threshold
        y_pred = (y_prob >= np.quantile(y_prob, threshold)).astype(int)
        
        # Calculate the metrics
        accuracy = accuracy_score(y, y_pred)
        precision = precision_score(y, y_pred)
        recall = recall_score(y, y_pred)
        f1 = f1_score(y, y_pred)


        # Append the results to the list
        results.append({
            'percentile threshold': threshold,
            'accuracy': accuracy,
            'probability threshold': np.quantile(y_prob, threshold),
            'precision': precision,
            'recall': recall,
            'f1': f1,
        })
    
    # Convert the results list to a DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df.sort_values(by = 'percentile threshold', ascending=False)


## Random Forest Model 

In [197]:
# Perform hyperparameter tuning

# Define the parameter grid
param_grid = {
    'n_estimators': [5, 10, 25, 50],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [2,5,7,10],
    'min_samples_split': [20, 50, 100, 500],
    'min_samples_leaf': [10, 20, 40,75],
    'bootstrap': [True, False]
}
# Initialize the RandomizedSearchCV object
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, scoring='accuracy', n_jobs=-1)

In [199]:
# Identify categorical and numerical columns
categorical_features = train.select_dtypes(include=['object']).columns
numerical_features = train.select_dtypes(exclude=['object']).columns

X_train = train[features]
y_train = train[target]

# Identify categorical and numerical columns
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(exclude=['object']).columns

# Define the numerical pipeline
numerical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

# Define the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)
X_train = preprocessor.fit_transform(X_train)


# Fit the random search model
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

Best Parameters: {'bootstrap': True, 'max_depth': 2, 'max_features': 'log2', 'min_samples_leaf': 10, 'min_samples_split': 500, 'n_estimators': 10}
Best Score: 0.5169516951695169


In [193]:
model_rf = train_model(train, features, target, RandomForestClassifier(n_estimators=2, max_depth=10, min_samples_split=100, min_samples_leaf=100, max_features='auto'))

  warn(


In [194]:
evaluate_model(model_rf, train, features, target)

Unnamed: 0,percentile threshold,accuracy,probability threshold,precision,recall,f1
11,0.99,0.51,0.49,0.5,0.82,0.62
10,0.95,0.51,0.49,0.5,0.82,0.62
9,0.9,0.51,0.49,0.5,0.82,0.62
8,0.8,0.51,0.49,0.5,0.82,0.62
7,0.7,0.51,0.49,0.5,0.82,0.62
6,0.6,0.51,0.49,0.5,0.82,0.62
5,0.5,0.51,0.49,0.5,0.82,0.62
4,0.4,0.51,0.49,0.5,0.82,0.62
3,0.3,0.51,0.49,0.5,0.82,0.62
2,0.2,0.51,0.49,0.5,0.82,0.62


In [148]:
evaluate_model(model_rf, val, features, target)

Unnamed: 0,percentile threshold,accuracy,probability threshold,precision,recall,f1
11,0.99,0.5,0.51,0.48,0.33,0.39
10,0.95,0.5,0.51,0.48,0.33,0.39
9,0.9,0.5,0.51,0.48,0.33,0.39
8,0.8,0.5,0.51,0.48,0.33,0.39
7,0.7,0.5,0.51,0.48,0.33,0.39
6,0.6,0.49,0.47,0.49,1.0,0.66
5,0.5,0.49,0.47,0.49,1.0,0.66
4,0.4,0.49,0.47,0.49,1.0,0.66
3,0.3,0.49,0.47,0.49,1.0,0.66
2,0.2,0.49,0.47,0.49,1.0,0.66


## XGBoost

In [200]:
# Define the parameter grid
param_grid = {
    'classifier__n_estimators': [2,5,10,20],
    'classifier__learning_rate': [0.01, 0.05, 0.001, 0.005],
    'classifier__max_depth': [1,2,3,4,5],
    'classifier__subsample': [0.7, 0.8, 0.9],
    'classifier__colsample_bytree': [0.65, 0.70, 0.75]
}

xgboost_model= xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# Initialize GridSearchCV
grid_search = GridSearchCV(xgboost_model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

In [201]:
# Identify categorical and numerical columns
categorical_features = train.select_dtypes(include=['object']).columns
numerical_features = train.select_dtypes(exclude=['object']).columns

X_train = train[features]
y_train = train[target]

# Identify categorical and numerical columns
categorical_features = X_train.select_dtypes(include=['object']).columns
numerical_features = X_train.select_dtypes(exclude=['object']).columns

# Define the numerical pipeline
numerical_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

# Define the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)
X_train = preprocessor.fit_transform(X_train)

# Fit the random search model
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

Best Parameters: {'classifier__colsample_bytree': 0.65, 'classifier__learning_rate': 0.01, 'classifier__max_depth': 1, 'classifier__n_estimators': 2, 'classifier__subsample': 0.7}
Best Score: 0.49684968496849685


Parameters: { "classifier__colsample_bytree", "classifier__learning_rate", "classifier__max_depth", "classifier__n_estimators", "classifier__subsample", "use_label_encoder" } are not used.



In [202]:
xgboost_model = train_model(train, features, target, model= xgb.XGBClassifier(classifier__colsample_bytree = 0.65, classifier__learning_rate = 0.01, classifier__max_depth = 1, classifier__n_estimators = 2, classifier__subsample = 0.7))

Parameters: { "classifier__colsample_bytree", "classifier__learning_rate", "classifier__max_depth", "classifier__n_estimators", "classifier__subsample" } are not used.



In [203]:
evaluate_model(xgboost_model, train,features,target)

Unnamed: 0,percentile threshold,accuracy,probability threshold,precision,recall,f1
11,0.99,0.52,0.86,1.0,0.02,0.04
10,0.95,0.56,0.77,1.0,0.1,0.18
9,0.9,0.61,0.69,0.99,0.2,0.34
8,0.8,0.69,0.6,0.95,0.39,0.55
7,0.7,0.74,0.55,0.89,0.54,0.67
6,0.6,0.78,0.52,0.83,0.68,0.75
5,0.5,0.79,0.49,0.78,0.79,0.78
4,0.4,0.77,0.46,0.72,0.88,0.79
3,0.3,0.74,0.42,0.66,0.94,0.78
2,0.2,0.67,0.38,0.6,0.98,0.75


In [205]:
evaluate_model(xgboost_model, test,features,target)

Unnamed: 0,percentile threshold,accuracy,probability threshold,precision,recall,f1
11,0.99,0.52,0.86,0.71,0.01,0.03
10,0.95,0.51,0.73,0.42,0.04,0.08
9,0.9,0.51,0.66,0.45,0.09,0.16
8,0.8,0.52,0.59,0.51,0.21,0.3
7,0.7,0.51,0.55,0.49,0.3,0.37
6,0.6,0.5,0.52,0.49,0.4,0.44
5,0.5,0.5,0.49,0.49,0.5,0.5
4,0.4,0.5,0.47,0.49,0.61,0.54
3,0.3,0.5,0.43,0.49,0.7,0.58
2,0.2,0.49,0.39,0.48,0.8,0.6


# Feature Engineering