In [17]:
### Development virtual environment: py37TF2

# https://towardsdatascience.com/multi-label-multi-class-text-classification-with-bert-transformer-and-keras-c6355eccb63a
    
    

#######################################
### -------- Load libraries ------- ###
# Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
# And pandas for data import + sklearn because you always need sklearn
import pandas as pd
from sklearn.model_selection import train_test_split

In [18]:
# Data source:
# https://www.consumerfinance.gov/data-research/consumer-complaints/

#######################################
### --------- Import data --------- ###
# Read the complaints CSV export into a DataFrame
complaints_csv = './data/complaints.csv'
data = pd.read_csv(complaints_csv)

In [19]:
# Peek at the first five rows of the raw frame
data.head(5)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2019-09-24,Debt collection,I do not know,Attempts to collect debt not owed,Debt is not yours,transworld systems inc. \nis trying to collect...,,TRANSWORLD SYSTEMS INC,FL,335XX,,Consent provided,Web,2019-09-24,Closed with explanation,Yes,,3384392
1,2019-09-19,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,PA,15206,,Consent not provided,Web,2019-09-20,Closed with non-monetary relief,Yes,,3379500
2,2019-11-08,Debt collection,I do not know,Communication tactics,Frequent or repeated calls,"Over the past 2 weeks, I have been receiving e...",,"Diversified Consultants, Inc.",NC,275XX,,Consent provided,Web,2019-11-08,Closed with explanation,Yes,,3433198
3,2019-05-23,Checking or savings account,Checking account,Managing an account,Deposits and withdrawals,,Company has responded to the consumer and the ...,MIDFIRST BANK,AZ,85254,,,Referral,2019-05-28,Closed with explanation,Yes,,3255455
4,2021-05-05,"Credit reporting, credit repair services, or o...",Credit reporting,Problem with a credit reporting company's inve...,Was not notified of investigation status or re...,,,"EQUIFAX, INC.",TX,76006,,,Web,2021-05-05,Closed with explanation,Yes,,4352688


In [20]:
# Keep only the complaint text and the two target columns
required_columns = ['Consumer complaint narrative', 'Product', 'Issue']
data = data[required_columns]

In [21]:
# Peek at the first three rows after the column selection
data.head(3)

Unnamed: 0,Consumer complaint narrative,Product,Issue
0,transworld systems inc. \nis trying to collect...,Debt collection,Attempts to collect debt not owed
1,,"Credit reporting, credit repair services, or o...",Incorrect information on your report
2,"Over the past 2 weeks, I have been receiving e...",Debt collection,Communication tactics


In [22]:
# Discard every row in which at least one of the remaining columns is missing
data = data.dropna(axis=0, how='any')

In [23]:
# Remove rows whose label is present only once (can't be split).
# Dropping single-row Products can shrink an Issue group back down to one
# row (and vice versa), which would break the stratified split later on.
# The two filters are therefore repeated until a full pass removes nothing.
while True:
    rows_before = len(data)
    data = data.groupby('Issue').filter(lambda x : len(x) > 1)
    data = data.groupby('Product').filter(lambda x : len(x) > 1)
    if len(data) == rows_before:
        break

In [24]:
# Store a categorical version of each target in a new *_label column
data['Issue_label'] = data['Issue'].astype('category')
data['Product_label'] = data['Product'].astype('category')

In [25]:
# Overwrite each target column with the integer code of its category
for target in ('Issue', 'Product'):
    data[target] = data[target + '_label'].cat.codes

In [26]:
# Split into train (80%) and test (20%) - stratify over Issue.
# A fixed random_state makes the split reproducible across kernel restarts.
# NOTE: `data` is rebound to the training portion here, so re-running this
# cell on its own would split the training portion a second time.
data, data_test = train_test_split(
    data,
    test_size = 0.2,
    stratify = data[['Issue']],
    random_state = 42)

In [27]:
# Training portion: `data` was rebound to it by the train/test split
data

Unnamed: 0,Consumer complaint narrative,Product,Issue,Issue_label,Product_label
163119,Consumer complaint narrative I have not receiv...,7,13,Attempts to collect debt not owed,Debt collection
1459314,My name is XXXX XXXX XXXX not XXXX XX...,6,72,Incorrect information on your report,"Credit reporting, credit repair services, or o..."
17552,I have an issue with my Barclay credit card wh...,4,57,Fees or interest,Credit card or prepaid card
570497,There are multiple errors for the accounts lis...,6,110,Problem with a credit reporting company's inve...,"Credit reporting, credit repair services, or o..."
1834045,I have paid this. In person in full. XXXX made...,7,36,Cont'd attempts collect debt not owed,Debt collection
...,...,...,...,...,...
1841873,Issue 1 : I am currently enrolled in an income...,15,48,Dealing with my lender or servicer,Student loan
584432,Equifax ; XXXX and XXXX are reporting incorr...,6,110,Problem with a credit reporting company's inve...,"Credit reporting, credit repair services, or o..."
80338,Nelnet capitalized my student loans in the amo...,15,49,Dealing with your lender or servicer,Student loan
1808295,Equifax is reporting a large number of inquiri...,5,71,Incorrect information on credit report,Credit reporting


In [28]:
# Held-out test portion: 20% of the rows (test_size = 0.2 in the split)
data_test 

Unnamed: 0,Consumer complaint narrative,Product,Issue,Issue_label,Product_label
678987,I send a letter to discover ( the credit card ...,4,93,"Other features, terms, or problems",Credit card or prepaid card
718678,I have a private student loan with XXXX XXXX s...,15,21,Can't repay my loan,Student loan
807587,( a ) XXXX. Except as otherwise provided in th...,6,72,Incorrect information on your report,"Credit reporting, credit repair services, or o..."
271977,I have an account on my file from the XXXX XXX...,6,110,Problem with a credit reporting company's inve...,"Credit reporting, credit repair services, or o..."
888050,"Im a victim of identity theft, and this compan...",6,72,Incorrect information on your report,"Credit reporting, credit repair services, or o..."
...,...,...,...,...,...
940925,My home has been forclosed. I went into an off...,10,30,Closing on a mortgage,Mortgage
212589,The credit bureaus XXXX and Transunion are rep...,6,72,Incorrect information on your report,"Credit reporting, credit repair services, or o..."
1826220,"Got divorced XXXX of XXXX, first payment fell ...",10,78,"Loan modification,collection,foreclosure",Mortgage
912469,On XX/XX/2018 my account was compromised with ...,1,111,Problem with a lender or other company chargin...,Checking or savings account


In [29]:
#######################################
### --------- Setup BERT ---------- ###
# Checkpoint name of the pretrained BERT to use
model_name = 'bert-base-uncased'
# Token budget per narrative
max_length = 100
# Fetch the checkpoint's config with output_hidden_states switched off
config = BertConfig.from_pretrained(model_name, output_hidden_states=False)

In [30]:
# Instantiate the fast BERT tokenizer for the chosen checkpoint
tokenizer = BertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=model_name,
    config=config,
)
# Instantiate the TensorFlow BERT model from the same checkpoint and config
transformer_model = TFBertModel.from_pretrained(
    model_name,
    config=config,
)


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [31]:
#######################################
### ------- Build the model ------- ###
# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model


def _label_head(head_name, n_classes):
    """Create a Dense layer named `head_name` with `n_classes` units.

    No activation is set on the layer. Each call builds its own
    TruncatedNormal initializer from config.initializer_range.
    """
    return Dense(
        units=n_classes,
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name=head_name)


# The first layer of the loaded transformer is taken as the BERT main layer
bert = transformer_model.layers[0]
# Single model input: int32 token ids of length max_length
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}
# Element [1] of the BERT output feeds both heads
bert_model = bert(inputs)[1]
# The dropout layer is always invoked with training=False
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)
# One head per target, sized by the number of entries in the label's value counts
issue = _label_head('issue', data['Issue_label'].value_counts().shape[0])(pooled_output)
product = _label_head('product', data['Product_label'].value_counts().shape[0])(pooled_output)
outputs = {'issue': issue, 'product': product}
# Wire input and outputs together into the Keras model
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')
# Print the layer overview
model.summary()

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   input_ids[0][0]                  
__________________________________________________________________________________________________
pooled_output (Dropout)         (None, 768)          0           bert[0][1]                       
__________________________________________________________________________________________________
issue (Dense)                   (None, 159)          122271      pooled_output[0][0]              
_________________________________________________________________________

In [None]:
#######################################
### ------- Train the model ------- ###
# Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# Set loss and metrics: one categorical cross-entropy and one accuracy per
# output head, both computed from logits (from_logits = True)
loss = {'issue': CategoricalCrossentropy(from_logits = True), 'product': CategoricalCrossentropy(from_logits = True)}
metric = {'issue': CategoricalAccuracy('accuracy'), 'product': CategoricalAccuracy('accuracy')}
# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)
# Ready output data for the model.
# num_classes is passed explicitly: without it to_categorical sizes the
# one-hot matrix from the largest code present in this split, which is
# narrower than the output head whenever the top-coded class is absent.
n_issue_classes = len(data['Issue_label'].cat.categories)
n_product_classes = len(data['Product_label'].cat.categories)
y_issue = to_categorical(data['Issue'], num_classes=n_issue_classes)
y_product = to_categorical(data['Product'], num_classes=n_product_classes)
# Tokenize the input (takes some time).
# padding='max_length' always yields exactly max_length ids per row, which
# is the shape the model's 'input_ids' Input declares; padding=True would
# only pad up to the longest narrative in the call.
# NOTE(review): no attention mask is requested or fed to the model, so
# padding positions are presumably treated like real tokens - confirm.
x = tokenizer(
    text=data['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length', 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)
# Fit the model, holding out 20% of the training rows for validation
history = model.fit(
    x={'input_ids': x['input_ids']},
    y={'issue': y_issue, 'product': y_product},
    validation_split=0.2,
    batch_size=64,
    epochs=2)

Epoch 1/2
 133/7189 [..............................] - ETA: 34:06:29 - loss: 5.4543 - issue_loss: 3.6902 - product_loss: 1.7641 - issue_accuracy: 0.2212 - product_accuracy: 0.4873

In [None]:
#######################################
### ----- Evaluate the model ------ ###
# Ready test data.
# num_classes is passed explicitly: the test split is stratified on Issue
# only and is the smaller split, so the top-coded Issue/Product may be
# missing from it; to_categorical would then infer too few columns and the
# targets would no longer match the width of the model's output heads.
n_issue_classes = len(data_test['Issue_label'].cat.categories)
n_product_classes = len(data_test['Product_label'].cat.categories)
test_y_issue = to_categorical(data_test['Issue'], num_classes=n_issue_classes)
test_y_product = to_categorical(data_test['Product'], num_classes=n_product_classes)
# padding='max_length' always yields exactly max_length ids per row, which
# is the shape the model's 'input_ids' Input declares; padding=True would
# only pad up to the longest narrative in the call.
test_x = tokenizer(
    text=data_test['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length', 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)
# Run evaluation
model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids']},
    y={'issue': test_y_issue, 'product': test_y_product}
)