In [1]:
# import libraries

import pandas as pd
import numpy as np
import re
import os
import glob

import nltk
from nltk.stem import WordNetLemmatizer 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

import torch
from transformers import AutoTokenizer, AutoModel

In [2]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\f8210\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
df_rate = pd.read_excel("標準普爾最新信用評級.xls", header=7)
dict_rate = dict(zip(df_rate['Exchange:Ticker'], df_rate['S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating)']))
feature_names = [col.replace('[', '').replace(']', '').replace('<', '') for col in pd.read_excel(f"NEW財務數據/財務數據/2019財務數據.xls", header=7).columns]

In [4]:
def clean_text(text):
    
    # 去除 HTML 標籤
    text = re.sub(r'<[^>]+>', '', text)
    
    # 去除數字
    text = re.sub(r'\d+', '', text)
    
    # 去除標點符號
    text = re.sub(r'[^\w\s]', '', text)
    
    # 去除非英文單字
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # 去除換行符號
    text = re.sub(r'\n', ' ', text)
    
    # 統一為小寫
    text = text.lower()
    
    # 詞性還原
    lemmatized_text = ' '.join([WordNetLemmatizer().lemmatize(w) for w in nltk.word_tokenize(text)])
    
    return lemmatized_text

In [5]:
def data_processing(year):
    
    df = pd.read_excel(f"NEW財務數據/財務數據/{year}財務數據.xls", header=7)
    
    # replace columns as 2019 columns
    df.columns = feature_names
    
    # map Exchange:Ticker to credit rating
    df["rate"] = df["Exchange:Ticker"].map(dict_rate)
    
    df['Exchange:Ticker'] = df['Exchange:Ticker'].str.split(':').str[-1]
    
    # For each ticker in the 'Exchange:Ticker' column, search for a matching text file
    for ticker in df['Exchange:Ticker']:
        txt_files = glob.glob(os.path.join(f'NEW文字檔/10-K文字檔/{year}txt/', f"{ticker}_*.txt"))
        if txt_files:
            with open(txt_files[0], 'r') as f:
                content = clean_text(f.read())
            df.loc[df['Exchange:Ticker'] == ticker, 'text'] = content
        else:
            df.loc[df['Exchange:Ticker'] == ticker, 'text'] = np.nan    

    # Drop the following columns
    df = df.drop(columns=["Company Name", "Security Tickers","Exchange:Ticker"])    

    # Create a rating map dictionary
    rating_map = {
        'AAA': 1, 'AA+': 1, 'AA': 1, 'AA-': 1, 'A+': 1, 'A': 1, 'A-': 1,
        'BBB+': 2, 'BBB': 2, 'BBB-': 2,
        'BB+': 3, 'BB': 3, 'BB-': 3,
        'B+': 4, 'B': 4, 'B-': 4, 'CCC+': 4, 'CCC': 4, 'CCC-': 4, 'D': 4,
        'NR': np.nan
    }

    # Map the credit rating values to numerical values 
    df['rate'] = df['rate'].map(lambda x: rating_map.get(x, x))
    
    # drop the nan in rate column
    df = df.dropna(subset=['rate'])

    # Replace '-' with NaN values in all columns
    for col in df.columns:
        df[col] = df[col].replace('-', np.nan)

    # Replace 'NM' with NaN values
    df = df.replace('NM', np.nan)

    # Fill NaN values with the mean
    for col in df.columns:
        if df[col].dtype != 'object':
            df[col] = df[col].fillna(df[col].mean())
    
    df = df.dropna(axis=1, how='all')
    df = df.dropna()

    return df

In [6]:
df = pd.DataFrame()

for year in [2019,2020,2021,2022]:
    # Concatenate the DataFrames
    df = pd.concat([df, data_processing(year)], ignore_index=True)

In [7]:
df

Unnamed: 0,"Total Assets - Capital IQ Latest Annual - 5 ($USDmm, Historical rate)","Total Capital - Capital IQ Latest Annual - 5 ($USDmm, Historical rate)","Sales/Sq. Ft., All (Net) - Capital IQ Latest Annual - 5 ($USD, Historical rate)","Total Equity - Capital IQ Latest Annual - 5 ($USDmm, Historical rate)","Total Enterprise Value My Setting Latest - 3 Year(s) ($USDmm, Historical rate)","Capital Expenditures - Compustat LTM - 3 ($USDmm, Historical rate)","Market Capitalization My Setting Latest - 3 Year(s) ($USDmm, Historical rate)",Shares Outstanding My Setting Latest - 3 Year(s) (mm),"Earnings from Cont. Ops., 1 Yr Growth % - Compustat LTM - 3 (%)","Net Income - Capital IQ LTM - 3 ($USDmm, Historical rate)",...,Effective Tax Rate - Capital IQ LTM - 3 (%),"Total Current Assets - Capital IQ Latest Annual - 5 ($USDmm, Historical rate)","Book Value/Share - Capital IQ Latest Annual - 5 ($USD, Historical rate)","Total Debt - Capital IQ Latest Annual - 5 ($USDmm, Historical rate)","Long-Term Debt - Capital IQ Latest Annual - 5 ($USDmm, Historical rate)","Common Dividends Paid - Capital IQ LTM - 3 ($USDmm, Historical rate)","Basic EPS - Capital IQ LTM - 3 ($USD, Historical rate)",Payout Ratio - Capital IQ LTM - 3 (%),rate,text
0,36500.0,24593.0,315.180,9848.0,131552.9,-1793.000,117563.9,579.7,48.107144,-1465.0,...,35.442712,13709.0,17.00,14745.0,13428.000000,-3324.000000,-2.630,62.669630,2.0,item management discussion and analysis of fin...
1,1524.7,1113.5,315.180,936.3,1602.4,-29.500,1434.8,34.9,14.400000,90.2,...,25.900000,942.7,27.50,177.2,177.200000,-813.538593,2.570,62.669630,3.0,item management discussion and analysis of fin...
2,67173.0,50288.0,315.180,30722.0,205588.3,-1964.000,195290.3,1776.8,48.107144,5161.0,...,17.800000,14632.0,17.40,19566.0,19359.000000,-3429.000000,2.950,66.400000,1.0,item management discussion and analysis of fin...
3,59352.0,31864.0,315.180,-8446.0,279650.4,-743.000,203857.4,1766.2,48.107144,8685.0,...,16.800000,16945.0,-5.71,40310.0,35002.000000,-10296.000000,4.880,118.500000,1.0,item management discussion and analysis of fin...
4,2385.6,1515.4,315.180,1218.6,3172.5,-194.800,2652.3,61.5,33.500000,109.6,...,45.100000,1336.0,18.30,296.8,296.800000,-813.538593,2.210,62.669630,3.0,item management discussion and analysis of fin...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3127,3770.3,3139.2,419.175,1967.7,3374.4,-108.700,3135.3,47.3,51.930497,41.5,...,36.800000,1300.7,41.50,1171.5,1036.000000,-829.574811,0.894,89.866451,3.0,item management discussion and analysis of fin...
3128,23456.4,19980.7,419.175,12666.4,33928.7,-602.800,28271.6,208.6,252.000000,1024.0,...,3.950000,4885.0,60.60,7314.3,5477.500000,-200.900000,4.910,19.600000,2.0,item management discussion and analysis of fin...
3129,398.6,260.3,419.175,235.0,1727.5,-0.918,1687.1,100.9,51.930497,49.1,...,30.400000,310.6,1.99,25.3,7961.290928,-829.574811,0.487,89.866451,4.0,item management discussion and analysis of fin...
3130,13900.0,11333.0,419.175,4544.0,81158.7,-732.000,76502.7,462.1,10.800000,2344.0,...,20.300000,6930.0,9.61,6789.0,6597.000000,-692.000000,5.080,29.500000,2.0,item management discussion and analysis of fin...


In [8]:
# Sample 500 rows with a fixed random state
df = df.sample(n=500, random_state=42)
df = df.reset_index(drop=True)
df

Unnamed: 0,"Total Assets - Capital IQ Latest Annual - 5 ($USDmm, Historical rate)","Total Capital - Capital IQ Latest Annual - 5 ($USDmm, Historical rate)","Sales/Sq. Ft., All (Net) - Capital IQ Latest Annual - 5 ($USD, Historical rate)","Total Equity - Capital IQ Latest Annual - 5 ($USDmm, Historical rate)","Total Enterprise Value My Setting Latest - 3 Year(s) ($USDmm, Historical rate)","Capital Expenditures - Compustat LTM - 3 ($USDmm, Historical rate)","Market Capitalization My Setting Latest - 3 Year(s) ($USDmm, Historical rate)",Shares Outstanding My Setting Latest - 3 Year(s) (mm),"Earnings from Cont. Ops., 1 Yr Growth % - Compustat LTM - 3 (%)","Net Income - Capital IQ LTM - 3 ($USDmm, Historical rate)",...,Effective Tax Rate - Capital IQ LTM - 3 (%),"Total Current Assets - Capital IQ Latest Annual - 5 ($USDmm, Historical rate)","Book Value/Share - Capital IQ Latest Annual - 5 ($USD, Historical rate)","Total Debt - Capital IQ Latest Annual - 5 ($USDmm, Historical rate)","Long-Term Debt - Capital IQ Latest Annual - 5 ($USDmm, Historical rate)","Common Dividends Paid - Capital IQ LTM - 3 ($USDmm, Historical rate)","Basic EPS - Capital IQ LTM - 3 ($USD, Historical rate)",Payout Ratio - Capital IQ LTM - 3 (%),rate,text
0,1872.9,1679.8,372.425,603.6,20575.800000,-110.7,19799.9,69.7,49.220310,120.0,...,7.050000,1248.7,9.14,1076.2,1043.7,-816.291839,1.720,77.965304,4.0,item management discussion and analysis of fin...
1,244718.0,176410.0,419.175,65815.0,148353.800000,-24610.0,50975.8,1390.1,1.360000,10127.0,...,5.410000,82103.0,39.80,110595.0,16006.0,-477.000000,7.350,4.710000,2.0,item management discussion and analysis of fin...
2,412.7,273.4,315.180,109.6,524.400000,-20.3,376.8,32.9,48.107144,-9.6,...,164.100000,309.3,3.59,163.8,154.7,-813.538593,-0.293,62.669630,4.0,item management discussion and analysis of fin...
3,10628.0,5306.0,315.180,-1215.0,18523.300000,-541.0,9129.3,244.8,48.107144,-2055.0,...,35.442712,3600.0,-4.35,6521.0,6448.0,-338.000000,-8.990,62.669630,3.0,item management discussion and analysis of fin...
4,897.6,674.7,344.200,279.4,1155.300000,-138.6,951.2,65.6,49.220310,-38.8,...,28.068174,300.7,4.35,395.3,322.8,-816.291839,-0.585,77.965304,4.0,item management discussion and analysis of fin...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,33975.7,23954.7,344.200,7582.2,226057.600000,-378.5,216722.0,443.4,49.220310,4525.4,...,11.200000,6178.5,17.30,16372.5,14759.3,-816.291839,10.200,77.965304,2.0,item management discussion and analysis of fin...
496,10677.0,6989.0,344.200,5709.0,13998.400000,-132.0,13640.4,289.1,49.220310,423.0,...,24.000000,3030.0,19.50,1280.0,838.0,-490.000000,1.570,115.800000,2.0,item management discussion and analysis of fin...
497,83850.4,23683.8,419.175,6648.8,43636.081179,0.0,6023.3,312.1,51.930497,339.2,...,37.509051,18900.0,19.90,17035.0,17021.6,-601.200000,1.070,177.200000,3.0,item management discussion and analysis of fin...
498,7764.1,7028.2,372.425,4251.3,16238.600000,-706.9,14062.0,152.2,49.220310,515.2,...,17.700000,1929.5,26.10,2776.9,1906.7,-816.291839,3.350,77.965304,2.0,item management discussion and analysis of fin...


In [9]:
# 分成文本和數值型
df_numeric = df.drop("text", axis=1)
df_text = df[["text","rate"]]

# 純數值型

In [10]:
le = LabelEncoder()

X = df_numeric.drop('rate', axis=1)
y = le.fit_transform(df_numeric['rate'])

# 對 X 資料標準化
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# 標準化後的數值
X_train

array([[-0.27905154, -0.25554753, -0.40128053, ...,  0.36127193,
        -0.42134266, -0.068453  ],
       [-0.37532667, -0.37851598,  0.32846992, ...,  0.35502765,
         0.01439546,  0.02229356],
       [-0.44889959, -0.50300362, -1.1515855 , ..., -0.05392202,
        -0.45246681, -0.07072834],
       ...,
       [-0.21795899, -0.17587146,  1.53717969, ...,  0.3639011 ,
         1.31624357, -0.12924076],
       [-0.39000712, -0.42894313, -1.1515855 , ...,  0.52838849,
        -0.42026941, -0.09381231],
       [-0.46491738, -0.52280282,  1.53717969, ...,  0.6102789 ,
        -0.72217368, -0.03691987]])

In [12]:
# Create the XGBoost model
model = xgb.XGBClassifier(
    objective='binary:logistic',
    max_depth=3,
    learning_rate=0.1,
    n_estimators=1000,
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the validation set
y_test_pred = model.predict(X_test)

# Calculate evaluation metrics
acc = accuracy_score(y_test, y_test_pred)                             # Calculate the accuracy score
f1 = f1_score(y_test, y_test_pred, average='weighted')                # Calculate the weighted F1-score
precision = precision_score(y_test, y_test_pred, average='weighted')  # Calculate the weighted precision
recall = recall_score(y_test, y_test_pred, average='weighted')        # Calculate the weighted recall
cm = confusion_matrix(y_test, y_test_pred)                            # Calculate the confusion matrix

# Print the evaluation metrics
print(f"Accuracy: {acc:.2f}")
print(f"F1-score: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print("Confusion Matrix:")
print(cm)

Accuracy: 0.69
F1-score: 0.69
Precision: 0.72
Recall: 0.69
Confusion Matrix:
[[ 5  5  2  0]
 [ 6 30  8  0]
 [ 0  2 26  1]
 [ 0  0  7  8]]


# 純文本

In [13]:
# Split the DataFrame into training and test sets
df_train, df_test = train_test_split(df_text, test_size=0.2, random_state=42)

In [14]:
# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the DistilBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

In [15]:
# Tokenize the text data for the training and validation sets
tokenized_train = tokenizer(df_train["text"].values.tolist(), padding=True, truncation=True, return_tensors="pt")
tokenized_test = tokenizer(df_test["text"].values.tolist(), padding=True, truncation=True, return_tensors="pt")

# Pass the tokenized text through the DistilBERT model to get the hidden states
with torch.no_grad():
    hidden_train = model(**tokenized_train)
    hidden_test = model(**tokenized_test)

# Get only the [CLS] token hidden states
cls_train = hidden_train.last_hidden_state[:,0,:]
cls_test = hidden_test.last_hidden_state[:,0,:]

In [16]:
# Encode the target variable
le = LabelEncoder()

# Concatenate the [CLS] token hidden states and the general features for the training set
x_train = cls_train
y_train = df_train["rate"]
y_train = le.fit_transform(y_train)

# Concatenate the [CLS] token hidden states and the general features for the validation set
x_test = cls_test
y_test = df_test["rate"]
y_test = le.fit_transform(y_test)

# Print the shapes of the input and target tensors
print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")

x_train shape: torch.Size([400, 768])
y_train shape: (400,)
x_test shape: torch.Size([100, 768])
y_test shape: (100,)


In [17]:
# Create the XGBoost model
model = xgb.XGBClassifier(objective='multi:softprob',    # Set the objective function for multi-class classification
                            n_estimators=1000,           # Set the number of boosting iterations
                            max_depth=7,                 # Set the maximum depth of the decision trees
                            learning_rate=0.1,           # Set the learning rate for the boosting algorithm
                            random_state=42)             # Set the random state for reproducibility

# Train the model
model.fit(x_train, y_train)

# Make predictions on the validation set
y_test_pred = model.predict(x_test)

# Calculate evaluation metrics
acc = accuracy_score(y_test, y_test_pred)                             # Calculate the accuracy score
f1 = f1_score(y_test, y_test_pred, average='weighted')                # Calculate the weighted F1-score
precision = precision_score(y_test, y_test_pred, average='weighted')  # Calculate the weighted precision
recall = recall_score(y_test, y_test_pred, average='weighted')        # Calculate the weighted recall
cm = confusion_matrix(y_test, y_test_pred)                            # Calculate the confusion matrix

# Print the evaluation metrics
print(f"Accuracy: {acc:.2f}")
print(f"F1-score: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print("Confusion Matrix:")
print(cm)

Accuracy: 0.66
F1-score: 0.66
Precision: 0.67
Recall: 0.66
Confusion Matrix:
[[ 5  4  1  2]
 [ 0 33 10  1]
 [ 0  4 21  4]
 [ 1  3  4  7]]


# 數值 + 文本

In [18]:
# Encode the target variable
le = LabelEncoder()

# Concatenate the [CLS] token hidden states and the general features for the training set
x_train = torch.cat((cls_train, torch.from_numpy(X_train)), 1)
y_train = df_train["rate"]
y_train = le.fit_transform(y_train)

# Concatenate the [CLS] token hidden states and the general features for the validation set
x_test = torch.cat((cls_test, torch.from_numpy(X_test)), 1)
y_test = df_test["rate"]
y_test = le.fit_transform(y_test)

# Print the shapes of the input and target tensors
print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")

x_train shape: torch.Size([400, 791])
y_train shape: (400,)
x_test shape: torch.Size([100, 791])
y_test shape: (100,)


In [19]:
# Create the XGBoost model
model = xgb.XGBClassifier(objective='multi:softprob',    # Set the objective function for multi-class classification
                            n_estimators=1000,           # Set the number of boosting iterations
                            max_depth=7,                 # Set the maximum depth of the decision trees
                            learning_rate=0.1,           # Set the learning rate for the boosting algorithm
                            random_state=42)             # Set the random state for reproducibility

# Train the model
model.fit(x_train, y_train)

# Make predictions on the validation set
y_test_pred = model.predict(x_test)

# Calculate evaluation metrics
acc = accuracy_score(y_test, y_test_pred)                             # Calculate the accuracy score
f1 = f1_score(y_test, y_test_pred, average='weighted')                # Calculate the weighted F1-score
precision = precision_score(y_test, y_test_pred, average='weighted')  # Calculate the weighted precision
recall = recall_score(y_test, y_test_pred, average='weighted')        # Calculate the weighted recall
cm = confusion_matrix(y_test, y_test_pred)                            # Calculate the confusion matrix

# Print the evaluation metrics
print(f"Accuracy: {acc:.2f}")
print(f"F1-score: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print("Confusion Matrix:")
print(cm)

Accuracy: 0.70
F1-score: 0.69
Precision: 0.69
Recall: 0.70
Confusion Matrix:
[[ 3  8  1  0]
 [ 5 32  7  0]
 [ 0  2 25  2]
 [ 0  1  4 10]]
