In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names, so pick whichever name this
# installation provides (falling back to the default style if neither exists).
_whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default')
plt.style.use(_whitegrid_style)
plt.rcParams['font.size'] = 14

The file I load below is a combined dataset generated from two separate files, one on credit ratings and one on financial ratios, both downloaded from the Wharton DB. I do not show how I built the combined file because this should ultimately be part of your project.

In [2]:
# Read the combined rating/ratio file; the three date columns are converted
# to datetimes while loading.
df = pd.read_csv(
    'SP500_LongTermCreditRating_Ratios_combined.csv',
    sep=',',
    parse_dates=['adate', 'qdate', 'public_date'],
)
df.head(n=10)

Unnamed: 0,permno,cusip,Ticker,adate,qdate,public_date,splticrm,bm,ps,pcf,...,dltt_be,debt_assets,debt_capital,de_ratio,cash_ratio,quick_ratio,curr_ratio,at_turn,ptb,PEG_trailing
0,10104,68389X10,ORCL,2009-05-31,2009-11-30,2010-01-31,A,0.252,4.976,13.353,...,0.451,0.485,0.338,0.949,1.875,2.409,2.409,0.466,4.145,1.045
1,10104,68389X10,ORCL,2009-05-31,2009-11-30,2010-02-28,A,0.252,5.323,14.285,...,0.451,0.485,0.338,0.949,1.875,2.409,2.409,0.466,4.434,1.117
2,10104,68389X10,ORCL,2009-05-31,2009-11-30,2010-03-31,A,0.252,5.556,14.911,...,0.451,0.485,0.338,0.949,1.875,2.409,2.409,0.466,4.628,1.165
3,10104,68389X10,ORCL,2009-05-31,2010-02-28,2010-04-30,A,0.233,5.381,15.909,...,0.443,0.493,0.346,0.982,1.739,2.237,2.245,0.453,4.515,1.545
4,10104,68389X10,ORCL,2009-05-31,2010-02-28,2010-05-31,A,0.233,4.692,13.871,...,0.443,0.493,0.346,0.982,1.739,2.237,2.245,0.453,3.937,1.348
5,10104,68389X10,ORCL,2009-05-31,2010-02-28,2010-06-30,A,0.233,4.461,13.189,...,0.443,0.493,0.346,0.982,1.739,2.237,2.245,0.453,3.743,1.282
6,10104,68389X10,ORCL,2010-05-31,2010-05-31,2010-07-31,A,0.275,4.43,13.687,...,0.369,0.493,0.334,0.986,1.257,1.821,1.838,0.492,3.805,1.311
7,10104,68389X10,ORCL,2010-05-31,2010-05-31,2010-08-31,A,0.275,4.093,12.645,...,0.369,0.493,0.334,0.986,1.257,1.821,1.838,0.492,3.516,1.212
8,10104,68389X10,ORCL,2010-05-31,2010-05-31,2010-09-30,A,0.275,5.033,15.549,...,0.369,0.493,0.334,0.986,1.257,1.821,1.838,0.492,4.323,1.489
9,10104,68389X10,ORCL,2010-05-31,2010-08-31,2010-10-31,A,0.296,5.046,16.86,...,0.428,0.498,0.352,1.006,1.568,2.061,2.077,0.488,4.551,1.632


In [3]:
# (rows, columns) of the frame as loaded
df.shape

(30119, 40)

Let's assume (for whatever random reason) I believe that debt-ebitda ratio, debt-capital ratio, and book-to-market ratio are the most relevant features to estimate a company's long-term credit rating. 

For these columns, I need to have a full set of values.

In [4]:
# Count the missing entries in each column used for modelling
selCols = ['debt_ebitda', 'debt_capital', 'bm', 'splticrm']
df.loc[:, selCols].isna().sum()  # isna() and isnull() are aliases

debt_ebitda     145
debt_capital    279
bm              671
splticrm          0
dtype: int64

In [5]:
# Data types of the selected feature columns and the rating column
df[selCols].dtypes

debt_ebitda     float64
debt_capital    float64
bm              float64
splticrm         object
dtype: object

From the output above I see that I will need to address the question of missing data (either impute them or delete the corresponding samples). See further down.

We'll drop the rows where the credit rating is 'CCC' or 'D', as these classes are too small a sample to be considered (only 2 and 4 appearances, respectively).

In [6]:
# Remove every row rated 'CCC' or 'D' in a single filtering pass
df = df[~df['splticrm'].isin(['CCC', 'D'])]

In [7]:
# (rows, columns) after the 'CCC' and 'D' ratings were filtered out
df.shape

(30113, 40)

Assign columns to X (feature matrix) and y (response vector). The latter we need to transform into numeric categories.

In [8]:
# Feature matrix: all selected columns except the last one (the rating)
X = df[selCols[:-1]]
X.head()

Unnamed: 0,debt_ebitda,debt_capital,bm
0,1.153,0.338,0.252
1,1.153,0.338,0.252
2,1.153,0.338,0.252
3,1.224,0.346,0.233
4,1.224,0.346,0.233


In [9]:
# Encode the rating strings as integer codes. factorize() numbers the labels
# in order of first appearance, so the codes do not follow the rating scale.
y, rating_labels = pd.factorize(df['splticrm'])
print(np.bincount(y))   # number of samples per encoded class
print(rating_labels)    # label belonging to each integer code

[3797 1943  990  296 5649 3322  489 3955 4831  199   50  143  410 1069
 1648 1096  226]
Index(['A', 'A+', 'AA-', 'AAA', 'BBB', 'BBB-', 'AA', 'A-', 'BBB+', 'AA+',
       'CCC+', 'B-', 'B+', 'BB-', 'BB+', 'BB', 'B'],
      dtype='object')


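To keep this example simple, I do not treat the missing values by hand; instead, an imputation step inside each pipeline below fills them in (mean or k-nearest-neighbour imputation). For your project the expectation is that you specifically address this issue and fill/impute missing values based on your economic reasoning!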

Now we use Scikit-learn's pipeline tool to get some results.

In [10]:
# Imports
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [11]:
# Hold out 20% of the samples for testing, stratified on the encoded rating
# so both parts keep the class proportions; the seed fixes the partition.
# NOTE(review): X.head() shows consecutive rows with identical feature values,
# so a row-level random split may put near-duplicates on both sides -- confirm
# whether a split grouped by company would be more appropriate.
split_options = dict(test_size=0.2, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [12]:
# Create Pipeline object: mean imputation -> standard scaler -> RandomForest.
# The step instances given here are placeholders; the grid below supplies the
# configured ones.
pipe = Pipeline([('imputer', SimpleImputer()),
                 ('scaler', StandardScaler()),
                 ('classifier', RandomForestClassifier(random_state=0))])

# Define the hyperparameter values to be tested.
# The forest is seeded (same seed as the train/test split) so that the
# selected parameters and the printed scores are reproducible across runs.
param_grid = [{'imputer': [SimpleImputer(strategy='mean')],
               'scaler': [StandardScaler()],
               'classifier': [RandomForestClassifier(criterion='gini',
                                                     random_state=0)],
               'classifier__max_depth': [1, 10, 100, None],
               'classifier__min_samples_split': [2, 10, 20]}]

# Run brute-force grid search (5-fold CV, accuracy, all available cores)
gs = GridSearchCV(estimator=pipe,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=5, n_jobs=-1)
gs = gs.fit(X_train, y_train)

# Report the mean CV accuracy of the best candidate, the accuracy of the
# refitted best pipeline on the held-out test set, and the chosen parameters
print('Best CV accuracy: {:.2f}'.format(gs.best_score_))
print('Test score:       {:.2f}'.format(gs.score(X_test, y_test)))
print('Best parameters: {}'.format(gs.best_params_))

Best CV accuracy: 0.91
Test score:       0.95
Best parameters: {'classifier': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False), 'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'imputer': SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0), 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [13]:
# Pipeline: KNN imputation -> standard scaler -> logistic regression
pipe = Pipeline(steps=[
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression()),
])

# Single grid entry: fixed preprocessing steps, three candidate values of C
param_grid = [dict(
    imputer=[KNNImputer(n_neighbors=5)],
    scaler=[StandardScaler()],
    classifier=[LogisticRegression(max_iter=1000)],
    classifier__C=[1, 100, 1000],
)]

# Exhaustive grid search: 5-fold CV, scored on accuracy, all available cores
gs = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(X_train, y_train)

# Summarise the search: best CV score, held-out test score, chosen parameters
print('Best CV accuracy: {:.2f}'.format(gs.best_score_))
print('Test score:       {:.2f}'.format(gs.score(X_test, y_test)))
print('Best parameters: {}'.format(gs.best_params_))

Best CV accuracy: 0.20
Test score:       0.20
Best parameters: {'classifier': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False), 'classifier__C': 100, 'imputer': KNNImputer(add_indicator=False, copy=True, metric='nan_euclidean',
           missing_values=nan, n_neighbors=5, weights='uniform'), 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [14]:
# Pipeline: mean imputation -> standard scaler -> logistic regression
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression()),
])

# Single grid entry: fixed preprocessing steps, three candidate values of C
param_grid = [dict(
    imputer=[SimpleImputer(strategy='mean')],
    scaler=[StandardScaler()],
    classifier=[LogisticRegression(max_iter=1000)],
    classifier__C=[1, 100, 1000],
)]

# Exhaustive grid search: 5-fold CV, scored on accuracy, all available cores
gs = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(X_train, y_train)

# Summarise the search: best CV score, held-out test score, chosen parameters
print('Best CV accuracy: {:.2f}'.format(gs.best_score_))
print('Test score:       {:.2f}'.format(gs.score(X_test, y_test)))
print('Best parameters: {}'.format(gs.best_params_))

Best CV accuracy: 0.20
Test score:       0.21
Best parameters: {'classifier': LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False), 'classifier__C': 1000, 'imputer': SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0), 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [15]:
# Pipeline: mean imputation -> standard scaler -> support vector classifier
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('classifier', SVC()),
])

# Single grid entry: RBF-kernel SVC over a 2 x 2 grid of gamma and C
param_grid = [dict(
    imputer=[SimpleImputer(strategy='mean')],
    scaler=[StandardScaler()],
    classifier=[SVC(kernel='rbf')],
    classifier__gamma=[1, 10],
    classifier__C=[10, 100],
)]

# Exhaustive grid search: 5-fold CV, scored on accuracy, all available cores
gs = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(X_train, y_train)

# Summarise the search: best CV score, held-out test score, chosen parameters
print('Best CV accuracy: {:.2f}'.format(gs.best_score_))
print('Test score:       {:.2f}'.format(gs.score(X_test, y_test)))
print('Best parameters: {}'.format(gs.best_params_))

Best CV accuracy: 0.47
Test score:       0.47
Best parameters: {'classifier': SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), 'classifier__C': 100, 'classifier__gamma': 10, 'imputer': SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0), 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


Finally, below is a code snippet in case multiple classifiers should be run within the same pipeline. However, as noted during class, this might quickly get very time consuming. 

In [16]:
# Create pipeline object: mean imputation -> standard scaler -> classifier.
# The LogisticRegression in the 'classifier' step is only a placeholder; each
# grid entry below substitutes its own estimator (LogisticRegression or SVC).
pipe = Pipeline([('imputer', SimpleImputer()),
                 ('scaler', StandardScaler()),
                 ('classifier', LogisticRegression())])

# Define parameter grid: one entry per classifier family, each with its own
# hyperparameter values
param_grid = [{'imputer': [SimpleImputer(strategy='mean')],
               'scaler': [StandardScaler()],
               'classifier': [LogisticRegression(max_iter=1000)],
               'classifier__C': [1, 100, 1000]},
              {'imputer': [SimpleImputer(strategy='mean')],
               'scaler': [StandardScaler()],
               'classifier': [SVC(kernel='rbf')],
               'classifier__gamma': [1, 10],
               'classifier__C': [10, 100]}]

# Run grid search. The scoring metric is spelled out to match the other
# searches in this notebook (accuracy is also the classifiers' default score).
grid = GridSearchCV(estimator=pipe,
                    param_grid=param_grid,
                    scoring='accuracy',
                    cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

# Print results: best CV score, held-out test score, winning configuration
print('Best CV accuracy: {:.2f}'.format(grid.best_score_))
print('Test score:       {:.2f}'.format(grid.score(X_test, y_test)))
print('Best parameters: {}'.format(grid.best_params_))

Best CV accuracy: 0.47
Test score:       0.47
Best parameters: {'classifier': SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), 'classifier__C': 100, 'classifier__gamma': 10, 'imputer': SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0), 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


**Ultimately, you will need to analyze your results using appropriate output, metrics, and plots.**