In [1]:
# Mount drive
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
import time

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

In [3]:
df = pd.read_csv("/content/drive/Shareddrives/ENGS Final Project/Data/Financial Risk Profile Data/Financial Data/CLEAN_merged_frp_data_v2.csv")
print(df.shape)
df.head()

(947, 16)


Unnamed: 0.1,Unnamed: 0,company_name,sector,financial_risk_profile,circa_rating,business_description,ffo_to_debt_ltm,debt_to_ebitda_ltm,cfo_to_debt_ltm,focf_to_debt_ltm,dcf_to_debt_ltm,ffo_interest_coverage_ltm,ebitda_to_interest_ltm,ebit_margin_ltm,ebitda_margin_ltm,return_on_capital_ltm
0,0,AAR Corp.,AEROSPACE & DEFENSE,[3] Intermediate,[3] Intermediate risk,AAR Corp. provides products and services to co...,0.18,1.31,0.23,0.11,0.11,2.93,10.3,7.25,8.7,7.05
1,1,Boeing Co.,AEROSPACE & DEFENSE,[4] Significant,[3] Intermediate risk,"The Boeing Company, together with its subsidia...",0.11,10.6,0.16,0.12,0.13,2.42,1.47,1.66,4.05,2.03
2,2,BWX Technologies Inc.,AEROSPACE & DEFENSE,[4] Significant,[3] Intermediate risk,"BWX Technologies, Inc., together with its subs...",0.19,3.44,0.2,0.07,0.14,5.35,8.05,12.2,15.3,8.53
3,3,CACI International Inc.,AEROSPACE & DEFENSE,[3] Intermediate,[3] Intermediate risk,"CACI International Inc, together with its subs...",0.15,2.42,0.17,0.13,0.36,3.2,8.06,8.06,10.0,6.87
4,4,General Dynamics Corp.,AEROSPACE & DEFENSE,[3] Intermediate,[3] Intermediate risk,General Dynamics Corporation operates as an ae...,0.42,1.84,0.51,0.41,0.62,11.8,12.5,8.77,10.8,7.35


In [4]:
df = pd.get_dummies(df, columns=['sector'])
df.head()

Unnamed: 0.1,Unnamed: 0,company_name,financial_risk_profile,circa_rating,business_description,ffo_to_debt_ltm,debt_to_ebitda_ltm,cfo_to_debt_ltm,focf_to_debt_ltm,dcf_to_debt_ltm,...,sector_RAILROADS & PACKAGE EXPRESS,sector_REAL ESTATE INVESTMENT COMPANIES,sector_REGULATED UTILITIES,sector_RETAIL & RESTAURANTS,sector_SPECIALTY CHEMICALS,sector_TECHNOLOGY - HARDWARE & SEMICONDUCTORS,sector_TECHNOLOGY - SOFTWARE & SERVICES,sector_TELECOM & CABLE,sector_TRANSPORTATION CYCLICAL,sector_UNREGULATED POWER & GAS
0,0,AAR Corp.,[3] Intermediate,[3] Intermediate risk,AAR Corp. provides products and services to co...,0.18,1.31,0.23,0.11,0.11,...,0,0,0,0,0,0,0,0,0,0
1,1,Boeing Co.,[4] Significant,[3] Intermediate risk,"The Boeing Company, together with its subsidia...",0.11,10.6,0.16,0.12,0.13,...,0,0,0,0,0,0,0,0,0,0
2,2,BWX Technologies Inc.,[4] Significant,[3] Intermediate risk,"BWX Technologies, Inc., together with its subs...",0.19,3.44,0.2,0.07,0.14,...,0,0,0,0,0,0,0,0,0,0
3,3,CACI International Inc.,[3] Intermediate,[3] Intermediate risk,"CACI International Inc, together with its subs...",0.15,2.42,0.17,0.13,0.36,...,0,0,0,0,0,0,0,0,0,0
4,4,General Dynamics Corp.,[3] Intermediate,[3] Intermediate risk,General Dynamics Corporation operates as an ae...,0.42,1.84,0.51,0.41,0.62,...,0,0,0,0,0,0,0,0,0,0


In [5]:
def extract_integer_from_string(s):
    match = re.search(r'\[(\d+)\]', s)
    if match:
        return int(match.group(1))
    else:
        return None

df['circa_rating'] = df['circa_rating'].apply(extract_integer_from_string)
df['financial_risk_profile'] = df['financial_risk_profile'].apply(extract_integer_from_string)

In [6]:
df.drop(['business_description', 'company_name', 'Unnamed: 0'], axis = 1, inplace = True)
df.dropna(inplace = True)

print(df.shape)

(873, 49)


In [7]:
# Define the feature variables (X) and the target variable (y)
X = df.drop(['financial_risk_profile'], axis = 1)
y = df['financial_risk_profile']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
model = RandomForestClassifier(random_state=42)

# Train the classifier
model.fit(X_train, y_train)

# Predict the credit ratings on the test set
y_pred = model.predict(X_test)

# Evaluate the model
unique_labels = np.unique(np.concatenate((y_test, y_pred)))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1: {f1_score(y_test, y_pred, average = 'macro')}")

Accuracy: 0.49714285714285716
F1: 0.451976604405983


In [11]:
# Helper function that does cross validation and gives best model
def run_cv(model, param_grid, X_train, y_train, X_val):
    print("training " + str(model))

    # Define the cross-validation strategy
    cv = KFold(n_splits=5)

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(estimator=model(), param_grid=param_grid, cv=cv, scoring='f1_macro')

    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_val)

    # Return the best model, y_pred
    return grid_search, y_pred # return the metric and model

## Step 3: Train models
classification_models = {
    LogisticRegression : {'solver': ['saga'],
                          'penalty': ['l2', 'elasticnet']},
    GaussianNB : {},
    # SVC : {'C': [0.01, 0.1, 1, 2, 10], 'kernel': ['linear', 'poly', 'rbf']},
    RandomForestClassifier : {'n_estimators': [500, 1000],
                            'max_depth': [100, None]},
    AdaBoostClassifier : {'n_estimators': [100, 200, 500],
                        'learning_rate': [0.001, 0.1]},
    # XGBClassifier : {'max_depth': [3, 5],
    #                 'learning_rate': [0.1],
    #                 'subsample': [0.5]} ,
}

# Define the feature variables (X) and the target variable (y)
X = df.drop(['financial_risk_profile'], axis = 1)
y = df['financial_risk_profile']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


results_df = pd.DataFrame(columns = ['model', 'params', 'Accuracy', 'F1 Score', 'MSE'])

for model_class in classification_models.keys():
    # Train model
    model, y_pred = run_cv(model_class, classification_models[model_class], X_train, y_train, X_test)

    print(y_pred.shape)
    print(y_test.shape)

    # Calculate loss
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average = 'macro')
    mse = mean_squared_error(y_test, y_pred)

    # Record result
    new_row = pd.DataFrame([{
        'model': str(model.best_estimator_),
        'params': str(model.best_params_),
        'Accuracy': accuracy,
        'F1 Score': f1,
        'MSE': mse
    }])
    results_df = pd.concat([results_df, new_row], ignore_index=True)


training <class 'sklearn.linear_model._logistic.LogisticRegression'>


5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1291, in fit
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
  File "/usr/local/lib/python3.10/dist-packages/joblib/parallel.py", line 1863, in __ca

(175,)
(175,)
training <class 'sklearn.naive_bayes.GaussianNB'>
(175,)
(175,)
training <class 'sklearn.ensemble._forest.RandomForestClassifier'>
(175,)
(175,)
training <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>
(175,)
(175,)


In [12]:
results_df.sort_values(by = 'F1 Score', ascending = False).head()

Unnamed: 0,model,params,Accuracy,F1 Score,MSE
2,"RandomForestClassifier(max_depth=100, n_estima...","{'max_depth': 100, 'n_estimators': 500}",0.491429,0.445331,1.034286
1,GaussianNB(),{},0.262857,0.244336,4.537143
3,"AdaBoostClassifier(learning_rate=0.1, n_estima...","{'learning_rate': 0.1, 'n_estimators': 200}",0.342857,0.222983,1.502857
0,LogisticRegression(solver='saga'),"{'penalty': 'l2', 'solver': 'saga'}",0.28,0.095347,2.485714
