In [13]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [14]:
# Read the case records from the CSV file into a DataFrame.
file_path = 'cases_clean.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top of the table as a quick sanity check of the load.
print("First few rows of the dataset:", df.head(), sep="\n")

First few rows of the dataset:
             Case ID              Outcome Date of Decision  \
0   4-1001-2008/1880  PARTIALLY FULFILLED          2/21/20   
1   4-1001-2105/3246  PARTIALLY FULFILLED          3/17/21   
2  4-1001-2204/10382  PARTIALLY FULFILLED          7/29/22   
3    4-1305-2101/562              REFUSED       24.08.2021   
4   4-15 01-2102/823  PARTIALLY FULFILLED       04/02/2021   

  Bankruptcy Proceedings Initiated Bankruptcy Declared  \
0                               No                  No   
1                               No                  No   
2                               No                  No   
3                               No                  No   
4                               No                  No   

   Asked Principal Amount  Asked Penalty Amount  Asked Fine  \
0                173500.0             4815030.0         0.0   
1             167140250.0            33428050.0         0.0   
2             144537391.0            25244960.0         0.

In [15]:
# Columns removed before modelling: identifiers, dates, representation and
# bankruptcy flags, the claim category, and every "Awarded ..." amount.
# NOTE(review): the awarded amounts are presumably dropped because they are only
# known once the case is decided (target leakage) -- confirm with the data owner.
columns_to_drop = [
    'Bankruptcy Proceedings Initiated', 'Bankruptcy Declared',
    'Plaintiff Represented', 'Defendant Represented',
    'Case ID', 'Date of Decision',
    'Awarded Principal Amount', 'Awarded Penalty Amount', 'Awarded Fine',
    'Awarded Unjustly Withheld Funds',
    'Category of a Claim',
]
# Raises KeyError if any listed column is absent (e.g. when this cell is run
# twice on the same frame), which also protects the encoding step below from
# being applied to already-encoded columns.
df = df.drop(columns=columns_to_drop)

# Yes/No columns that are converted to 1/0 indicator features.
binary_cols = ['Supporting Documents Presented', 'Presence of Reconciliation Act', 'Presence of Contract',
               'Presence of Invoice', 'Penalty Conditions', 'Partial Payment Made', 'Deadline Specified',
               'Presence of Demand Letter']
yes_no_codes = {'Yes': 1, 'No': 0}
for col in binary_cols:
    # Series.map replaces every value that is not a key of the mapping with NaN
    # without any warning, so reject unexpected labels (typos, different casing,
    # stray whitespace) instead of letting them become missing features.
    unexpected = set(df[col].dropna().unique()) - set(yes_no_codes)
    if unexpected:
        raise ValueError(
            f"Column {col!r} contains values other than 'Yes'/'No': "
            f"{sorted(map(repr, unexpected))}"
        )
    df[col] = df[col].map(yes_no_codes)

In [16]:
# Target is the case outcome; every remaining column is a predictor.
y = df['Outcome']
X = df.drop('Outcome', axis=1)

# Encode the outcome labels as integers. The fitted encoder is kept in `le`
# so that integer predictions can be translated back to label names.
le = LabelEncoder()
y = le.fit_transform(y)

# Hold out 20% of the rows for testing, stratified on the encoded outcome and
# seeded so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [17]:
# Confirm that every training feature is numeric before fitting the model.
feature_dtypes = X_train.dtypes
print(feature_dtypes)

Asked Principal Amount            float64
Asked Penalty Amount              float64
Asked Fine                        float64
Asked Unjustly Withheld Funds       int64
Presence of Reconciliation Act      int64
Presence of Contract                int64
Presence of Invoice                 int64
Penalty Conditions                  int64
Total Value of Delivered Goods    float64
Partial Payment Made                int64
Deadline Specified                  int64
Presence of Demand Letter           int64
Supporting Documents Presented      int64
dtype: object


In [None]:
# Exhaustive hyper-parameter search for the random forest.
# The grid below has 3 * 4 * 3 * 3 * 2 = 216 candidates; with 5-fold
# cross-validation that is 1080 model fits, so this is the slow cell of the
# notebook. GridSearchCV is imported with the other imports at the top.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),  # fixed seed for reproducible forests
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,  # run the fits in parallel on all available cores
    scoring='accuracy',
)
# Only the training split is used here; the test split stays untouched.
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validation score: 0.8237742830712304


In [20]:
# Take the best estimator found by the grid search and score it on the
# held-out test split.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-class precision/recall/F1, labelled with the original outcome names.
per_class_report = classification_report(y_test, y_pred, target_names=le.classes_)
print(per_class_report)

Accuracy: 0.7586206896551724
                     precision    recall  f1-score   support

          FULFILLED       0.82      0.70      0.76        20
PARTIALLY FULFILLED       0.73      0.89      0.80        18
            REFUSED       0.74      0.70      0.72        20

           accuracy                           0.76        58
          macro avg       0.76      0.76      0.76        58
       weighted avg       0.76      0.76      0.76        58



In [21]:
# Pair each feature name with the importance reported by the fitted forest,
# then list them from most to least important.
importances = best_rf.feature_importances_
importance_columns = {'Feature': X.columns, 'Importance': importances}
feature_importance_df = pd.DataFrame(data=importance_columns)

ranked_importances = feature_importance_df.sort_values('Importance', ascending=False)
print(ranked_importances)

                           Feature  Importance
1             Asked Penalty Amount    0.236919
0           Asked Principal Amount    0.196192
8   Total Value of Delivered Goods    0.144205
7               Penalty Conditions    0.073852
9             Partial Payment Made    0.065762
6              Presence of Invoice    0.056414
11       Presence of Demand Letter    0.053595
10              Deadline Specified    0.049058
12  Supporting Documents Presented    0.039050
2                       Asked Fine    0.038984
4   Presence of Reconciliation Act    0.029813
3    Asked Unjustly Withheld Funds    0.010556
5             Presence of Contract    0.005600


In [22]:
# Score one hand-written example case with the tuned model and report the
# predicted outcome together with the probability of every outcome class.
feature_names = X.columns.tolist()

example_input = {
    'Asked Principal Amount': 100000000.0,
    'Asked Penalty Amount': 50000000.0,
    'Asked Fine': 0.0,
    'Asked Unjustly Withheld Funds': 0.0,
    'Presence of Reconciliation Act': 0,
    'Presence of Contract': 1,
    'Presence of Invoice': 1,
    'Penalty Conditions': 1,
    'Total Value of Delivered Goods': 100000000.0,
    'Partial Payment Made': 0,
    'Deadline Specified': 1,
    'Presence of Demand Letter': 1,
    'Supporting Documents Presented': 1
}

# Building the frame with `columns=feature_names` fills any feature that is
# absent from the dict with NaN, so a missing or misspelled key would silently
# become a missing value. Fail loudly instead.
missing_features = [name for name in feature_names if name not in example_input]
if missing_features:
    raise ValueError(f"example_input is missing features: {missing_features}")

# `columns=feature_names` puts the columns in the same order as the training data.
input_df = pd.DataFrame([example_input], columns=feature_names)

# predict_proba returns one column per entry of best_rf.classes_ (the encoded
# labels), so decode those to pair each probability with the right outcome name.
probabilities = best_rf.predict_proba(input_df)[0]
outcome_labels = le.inverse_transform(best_rf.classes_)
outcome_probabilities = dict(zip(outcome_labels, probabilities))

prediction = best_rf.predict(input_df)
outcome = le.inverse_transform(prediction)[0]

print(f"Predicted Outcome: {outcome}")
print("\nProbability for each possible outcome:")
# Use a separate loop variable so `outcome` keeps holding the predicted label
# after this cell has run.
for label, prob in outcome_probabilities.items():
    print(f"{label}: {prob:.2%}")



Predicted Outcome: PARTIALLY FULFILLED

Probability for each possible outcome:
FULFILLED: 0.00%
PARTIALLY FULFILLED: 96.83%
REFUSED: 3.17%
