In [1]:
import pandas as pd
import numpy as np
import time

# import sklearn libraries 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, classification_report

# import ml models
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# import deep learning tools and tokenizer / pad_sequences
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preprocessing
My first step will be to pull in the data that I will be using and converting it into a format that I can try feeding into different models. To do this, I will need to convert the tags in the training data into numbers. I do this by creating a tokenizer which gives each tag a unique number. For the target, I use a simple label encoder to encode into 0 and 1. Next, I need all of the data to be the same length to feed into my ML and DL models, so I use pad_sequences.

In [2]:
# pull in training set, split into train and and target
train = pd.read_csv('financial-statement-ml-challenge-train.csv')
y = train['FINANCIAL_STATEMENT_TEMPLATE']
train = train['COMPANY_XBRL_TAGS']

# pull in test set
test = pd.read_csv('financial-statement-ml-challenge-test.csv')
test = test['COMPANY_XBRL_TAGS']

In [3]:
train[0]

'AccumulatedOtherComprehensiveIncomeLossNetOfTax AdvertisingExpense AmortizationOfDebtDiscountPremium AmortizationOfIntangibleAssets Assets AvailableForSaleSecurities AvailableForSaleSecuritiesDebtSecurities BankOwnedLifeInsurance BankOwnedLifeInsuranceIncome BankOwnedLifeInsuranceIncomeIncludingIncomeFromDeathBenefits BrokerageCommissionsRevenue BrokeredTimeDeposits BusinessCombinationAcquisitionRelatedCosts BusinessCombinationContingentConsiderationArrangementsChangeInAmountOfContingentConsiderationLiability1 CashAcquiredFromAcquisition CashAndCashEquivalentsPeriodIncreaseDecrease CashAndDueFromBanks CashCashEquivalentsAndFederalFundsSold CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect CommitmentsAndContingencies CommonStockDividendsPerShareDeclared CommonStockValue CommunicationsAndInformationTechnology ComprehensiveIncomeNetOfTax CoreProcessingCharges DebtAndEquitySecuritiesGainLoss DebtSecuritiesGainLoss Deposits Depre

In [4]:
# encode the target
le = LabelEncoder()
y = le.fit_transform(y)

In [5]:
# generate tokenizer, fit to training data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train)

# set up tag_index, which contains all of the tags and print the number of words
tag_index = tokenizer.word_index
print("Number of words in word index: " + str(len(tag_index)))

Number of words in word index: 24178


In [6]:
# find some statsistics on the number of tags
average_number_of_tags = train.str.split().apply(len).mean()
max_number_of_tags = train.str.split().apply(len).max()
median_number_of_tags = train.str.split().apply(len).median()
print("Average length of tags:  " + str(average_number_of_tags))
print("Max length of tags:  " + str(max_number_of_tags))
print("Median length of tags:  " + str(median_number_of_tags))

Average length of tags:  158.8218111002921
Max length of tags:  1486
Median length of tags:  152.0


In [7]:
# setting variables for pad_sequences
trunc_type='post'
max_length = 300
padding_type='post'

# convert the text into sequences of numbers, then pad/trim those sequences to all be the same length
sequences = tokenizer.texts_to_sequences(train)
sequences_padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print("Shape of sequences_padded: " + str(sequences_padded.shape))

Shape of sequences_padded: (1027, 300)


# Testing ML Models
Next, I will try out a few ML models and tune the hyperparameters using RandomizedSearch. Ultimately, XGBoost performed the best, but it still only achieved results of around 97% or 98%. This just isn't good enough to ensure that I will have all of the correct answers.

In [8]:
# split data into train and test set to use for testing some of the ML models after using RandomizedSearch
train_df = pd.DataFrame(sequences_padded)
y = pd.DataFrame(y)
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.33, random_state=0)

## Using RandomizedSearchCV on XGBoost
RandomizedSearchCV implements a fit and score. The paremeters of the estimator randomly searched based upon the param_grid that I specify. 

In [9]:
# creating an XGBClassifier
clf = XGBClassifier()

# specifying the parameters to randomly search
param_grid = {
        'max_depth': [6, 10, 15, 20],
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0,3],
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
        'gamma': [0, 0.25, 0.5, 1.0],
        'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
        'n_estimators': [50, 100, 150, 200, 400]}

# creating RandomizedSearchCV object
rs_clf = RandomizedSearchCV(clf, param_grid, n_iter=250,
                            n_jobs=1, 
                            verbose=1, cv=2,
                            scoring='accuracy', refit=False, random_state=42)

In [10]:
# fitting RandomizedSearchCV
print("Randomized search..")
search_time_start = time.time()
rs_clf.fit(train_df,np.ravel(y))
print("Randomized search time:", time.time() - search_time_start)

Randomized search..
Fitting 2 folds for each of 250 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Randomized search time: 114.22986054420471


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:  1.9min finished


In [11]:
# taking a look at the best score and the best parameters
best_score = rs_clf.best_score_
best_params = rs_clf.best_params_
print("Best score: {}".format(best_score))
print("--" * 50)
print("Best params: ")
for param_name in sorted(best_params.keys()):
    print('%s: %r' % (param_name, best_params[param_name]))

Best score: 0.9844187316540378
----------------------------------------------------------------------------------------------------
Best params: 
colsample_bylevel: 1.0
colsample_bytree: 0.5
gamma: 0
learning_rate: 0.2
max_depth: 10
min_child_weight: 1.0
n_estimators: 50
reg_lambda: 1.0
subsample: 1.0


In [12]:
# inputting the best parameters into an XGBClassifier, fitting on a portion of the training data, and testing on the cv set
xgb = XGBClassifier(colsample_bylevel= 1.0,
colsample_bytree= 0.5,
gamma= 0,
learning_rate= 0.2,
max_depth= 10,
min_child_weight= 1.0,
n_estimators= 50,
reg_lambda= 1.0,
subsample=1.0)
xgb.fit(X_train,np.ravel(y_train))
predictions = xgb.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       149
           1       0.98      0.99      0.99       190

    accuracy                           0.99       339
   macro avg       0.99      0.98      0.99       339
weighted avg       0.99      0.99      0.99       339



## Using RandomizedSearchCV on Random Forest
I also wanted to try out Random Forest 

In [13]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [14]:
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_random.fit(train_df, y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   47.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  6.2min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [15]:
# taking a look at the best parameters
rf_random.best_params_

{'n_estimators': 600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 60,
 'bootstrap': False}

In [16]:
# trying out the best parameters by creating a RandomForestClassifier and training it on part of the training data, then checking on the cv data
random_forest = RandomForestClassifier(n_estimators=1800, min_samples_split=2, min_samples_leaf=1, max_features='auto',max_depth=20,bootstrap=False)
random_forest.fit(X_train,np.ravel(y_train))
predictions = random_forest.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.95      0.92      0.94       149
           1       0.94      0.96      0.95       190

    accuracy                           0.94       339
   macro avg       0.94      0.94      0.94       339
weighted avg       0.94      0.94      0.94       339



# Implementing Deep Learning Model
The ML models are certainly performing well, getting 94%+ accuracy. However, this simply isn't good enough. I'm going to use a deep learning model to improve upon that even more. In order to check how my model is performing, I am going to use StratifiedShuffleSplit so that I can test out my model using 5 separate folds to see how well my model generalizes. After building my model, I found that I was achieving between 99.9% and 100% accuracy on each fold of the cv data. These results are much more promising, although the model is more complex.

In [17]:
# set up variables
embedding_dim = 22
vocab_size = 24179

In [18]:
# write a function to create the deep learning model
def make_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
        tf.keras.layers.Conv1D(128, 5, activation='relu'),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    return model

In [19]:
num_epochs = 4
n_splits=5

# create StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits = n_splits, test_size = 0.2, random_state = 0)

# train the neural network using each fold of the StratifiedShuffleSplit
for train_index, test_index in sss.split(train_df, y):
    X_train, X_test = train_df.iloc[train_index], train_df.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = make_model()
    model.fit(X_train, y_train, epochs=num_epochs, validation_data=(X_test, y_test), verbose=True)
    print('-'*100)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
----------------------------------------------------------------------------------------------------
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
----------------------------------------------------------------------------------------------------
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
----------------------------------------------------------------------------------------------------
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
----------------------------------------------------------------------------------------------------
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
----------------------------------------------------------------------------------------------------


In [20]:
# training the model on the full dataset
model = make_model()
model.fit(sequences_padded, y, epochs=4, verbose=True)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x21187a556d0>

# Using my Model to Predict the Test Data
Now that I have created my model which performs really well, I'm going to use it to make the final predictions.

In [21]:
# convert the tags to sequences of numbers
test_sequences = tokenizer.texts_to_sequences(test)

# pad/trim the sequences
test_sequences_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# make predictions
predictions = model.predict(test_sequences_padded)

In [22]:
# the predictions ended up as a sigmoid activation. I will convert this to binary
binary_predictions = []
for item in predictions:
    if item >= 0.5:
        binary_predictions.append(1)
    else:
        binary_predictions.append(0)

# convert to written labels
binary_predictions_with_words = le.inverse_transform(binary_predictions)

# create dataframe with the predictions
bin_pred_df = pd.DataFrame(binary_predictions_with_words, columns=["Predictions"] )

In [23]:
# export as a csv, removing the header and index
bin_pred_df.to_csv('final_predictions_4_8_21.csv', header=False, index=False)

# Trying Another Method to Predict the Data
Now that I figured out how to achieve perfect results with a neural network, I want to see if I can find a way to get the same results but with a simpler model. To do this, I tried using a CountVectorizer instead of a tokenizer. Using this technique I was able to achieve 100% accuracy using a Naive Bayes model, without even needing to tune my model.

In [24]:
# pull in training set, split into train and and target
train = pd.read_csv('financial-statement-ml-challenge-train.csv')
y = train['FINANCIAL_STATEMENT_TEMPLATE']
train = train['COMPANY_XBRL_TAGS']

# pull in test set
test = pd.read_csv('financial-statement-ml-challenge-test.csv')
test = test['COMPANY_XBRL_TAGS']

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

In [26]:
# create a count vectorizer
vectorizer = CountVectorizer()

# fit and transform on the training data, transform the test data
train = vectorizer.fit_transform(train)
test = vectorizer.transform(test)

In [27]:
# split into train and validation data for testing
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.5, random_state=0)

In [28]:
# create a simple model without tuning any hyperparameters
gnb = GaussianNB()

In [29]:
# fit the ML model and make predictions
y_pred = gnb.fit(X_train.toarray(), y_train).predict(X_test.toarray())

In [30]:
# view results of predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         fin       1.00      1.00      1.00       239
        indu       1.00      1.00      1.00       275

    accuracy                           1.00       514
   macro avg       1.00      1.00      1.00       514
weighted avg       1.00      1.00      1.00       514



In [31]:
# make predictions on test set
final_predictions = gnb.predict(test.toarray())

In [32]:
final_predictions

array(['fin', 'fin', 'indu', 'indu', 'indu', 'indu', 'fin', 'fin', 'fin',
       'fin', 'fin', 'fin', 'indu', 'indu', 'indu', 'indu', 'indu', 'fin',
       'indu', 'indu', 'fin', 'fin', 'fin', 'indu', 'indu', 'indu',
       'indu', 'indu', 'indu', 'indu', 'indu', 'indu', 'indu', 'indu',
       'fin', 'fin', 'fin', 'fin', 'indu', 'indu', 'fin', 'fin', 'fin',
       'indu', 'indu', 'fin', 'fin', 'fin', 'fin', 'fin', 'fin'],
      dtype='<U4')

In [33]:
# export as a csv, removing the header and index
pd.DataFrame(final_predictions).to_csv('final_predictions_nb_5_25_21.csv', header=False, index=False)