In [1]:
# import libraries

import pandas as pd
import numpy as np
import re
import os
import glob
import random

import nltk
from nltk.stem import WordNetLemmatizer 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import random

# Run 100,000 trials
num_trials = 100000

# Tallies for the three events
count_a = count_b = count_c = 0

for _ in range(num_trials):
    # Draw a uniform random number in [0, 1)
    draw = random.random()

    # Classify the draw: A below 0.2, B in [0.2, 0.4), C otherwise
    if draw >= 0.4:
        count_c += 1
    elif draw >= 0.2:
        count_b += 1
    else:
        count_a += 1

# Report the tallies
print(f"事件 A 出現了 {count_a} 次")
print(f"事件 B 出現了 {count_b} 次")
print(f"事件 C 出現了 {count_c} 次")

事件 A 出現了 19854 次
事件 B 出現了 19759 次
事件 C 出現了 60387 次


In [3]:
# Fetch every NLTK resource the notebook relies on: WordNet for the lemmatizer
# and the Punkt tokenizer models that nltk.word_tokenize loads. Newer NLTK
# releases ship the tokenizer data as "punkt_tab", older ones as "punkt", so
# both are requested.
# NOTE(review): a name missing from the installed NLTK index is expected to be
# reported and skipped rather than raise - confirm on the pinned NLTK version.
for resource in ("wordnet", "punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\f8210\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Ordinal encoding of the rating grades, 0 = strongest (AAA) through
# 19 = weakest (D). Adjacent integers must be adjacent notches on the rating
# scale because data_processing shifts a grade by +/-1 to move it one notch:
# the BBB family therefore sits between A and BB, and BB between BBB and B.
grade_dict = {
    'AAA': 0,
    'AA+': 1,
    'AA': 2,
    'AA-': 3,
    'A+': 4,
    'A': 5,
    'A-': 6,
    'BBB+': 7,
    'BBB': 8,
    'BBB-': 9,
    'BB+': 10,
    'BB': 11,
    'BB-': 12,
    'B+': 13,
    'B': 14,
    'B-': 15,
    'CCC+': 16,
    'CCC': 17,
    'CCC-': 18,
    'D': 19,
}

# Reverse lookup: grade number -> rating label
inv_grade_dict = {v: k for k, v in grade_dict.items()}

In [5]:
def clean_text(text):
    """Normalize raw document text into a lowercase, lemmatized token string.

    The text is stripped of HTML tags, digits, punctuation and anything that is
    not an ASCII letter or whitespace; newlines become spaces; the result is
    lowercased, tokenized with NLTK and each token is lemmatized with WordNet.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Lemmatized tokens joined by single spaces.
    """
    # Deletion passes, applied in order: HTML tags, digits, punctuation,
    # then every remaining character that is not an English letter or whitespace
    removal_patterns = (r'<[^>]+>', r'\d+', r'[^\w\s]', r'[^a-zA-Z\s]')
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    # Newlines become spaces, then everything is lowercased
    normalized = re.sub(r'\n', ' ', text).lower()

    # Lemmatize token by token with one shared lemmatizer instance
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(normalized)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [6]:
def data_processing(year):
    """Build the modelling table for one year of financial data.

    Reads the S&P rating sheet and the financial sheet for ``year``, attaches
    each company's rating bucket (after a random one-notch perturbation of
    ratings dated after 2019) and its cleaned 10-K text, and returns only the
    rows that are complete.

    Parameters
    ----------
    year : int
        Selects ``{year}財務數據.xls`` and the ``{year}txt`` text folder.

    Returns
    -------
    pandas.DataFrame
        The financial columns (named after the 2019 sheet) plus ``rate``
        (bucket 1-4) and ``text``; rows with any missing value are dropped.

    Notes
    -----
    The perturbation draws from the module-level ``random`` generator, so the
    labels vary between calls unless the caller seeds it first.
    """
    rating_col = "S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating)"
    rating_date_col = "S&P Entity Credit Rating Date - Issuer Credit Rating - Local Currency LT [Latest] (Rating Date)"
    max_grade = max(grade_dict.values())

    def judge(row):
        """Return the row's grade number, randomly moved one notch for ratings dated after 2019."""
        # Drawn for every row, whether or not it is used, so the random stream
        # consumed per row does not depend on the rating date.
        rand_num = random.random()
        grade_num = row["grade_num"]

        if row["year"] > 2019:
            if rand_num < 0.2:
                grade_num += 1
            elif rand_num < 0.4:
                grade_num -= 1

        # Clamp to the valid grade range. A NaN grade (unmapped rating) fails
        # both comparisons and is returned unchanged.
        if grade_num < 0:
            grade_num = 0
        elif grade_num > max_grade:
            grade_num = max_grade

        return grade_num

    def load_text(ticker):
        """Return the cleaned text of the first file matching ``{ticker}_*.txt``, or NaN if none."""
        if pd.isna(ticker):
            return np.nan
        txt_files = glob.glob(os.path.join(f'NEW文字檔/10-K文字檔/{year}txt/', f"{ticker}_*.txt"))
        if not txt_files:
            return np.nan
        # Explicit encoding keeps the read independent of the OS locale;
        # undecodable bytes are dropped (clean_text discards non-ASCII anyway).
        with open(txt_files[0], 'r', encoding='utf-8', errors='ignore') as f:
            return clean_text(f.read())

    # --- Ratings: encode, perturb, decode back to labels, index by ticker ---
    df_rate = pd.read_excel("標準普爾最新信用評級.xls", header=7)
    df_rate["year"] = df_rate[rating_date_col].dt.year
    df_rate["grade_num"] = df_rate[rating_col].map(grade_dict)
    df_rate["grade_num"] = df_rate.apply(judge, axis=1)
    df_rate[rating_col] = df_rate["grade_num"].map(inv_grade_dict)
    dict_rate = dict(zip(df_rate['Exchange:Ticker'], df_rate[rating_col]))

    # --- Financials: every year reuses the 2019 sheet's column names ---
    # '[', ']' and '<' are stripped from the names
    # (presumably because XGBoost rejects them in feature names - confirm).
    reference_columns = pd.read_excel("NEW財務數據/財務數據/2019財務數據.xls", header=7).columns
    feature_names = [col.replace('[', '').replace(']', '').replace('<', '') for col in reference_columns]

    df = pd.read_excel(f"NEW財務數據/財務數據/{year}財務數據.xls", header=7)
    df.columns = feature_names

    # Map Exchange:Ticker to the credit rating, then keep only the ticker part
    df["rate"] = df["Exchange:Ticker"].map(dict_rate)
    df['Exchange:Ticker'] = df['Exchange:Ticker'].str.split(':').str[-1]

    # Attach the cleaned 10-K text for each ticker (NaN when no file matches)
    df['text'] = df['Exchange:Ticker'].map(load_text)

    # Identifier columns are not model features
    df = df.drop(columns=["Company Name", "Security Tickers", "Exchange:Ticker"])

    # Collapse the rating labels into four buckets
    rating_map = {
        'AAA': 1, 'AA+': 1, 'AA': 1, 'AA-': 1, 'A+': 1, 'A': 1, 'A-': 1,
        'BBB+': 2, 'BBB': 2, 'BBB-': 2,
        'BB+': 3, 'BB': 3, 'BB-': 3,
        'B+': 4, 'B': 4, 'B-': 4, 'CCC+': 4, 'CCC': 4, 'CCC-': 4, 'D': 4,
        'NR': np.nan
    }
    df['rate'] = df['rate'].map(lambda x: rating_map.get(x, x))

    # Keep rated rows only and turn the sheet's '-' / 'NM' placeholders into NaN.
    # replace() returns a new frame, so the column assignments below do not
    # write into a slice of the original.
    # NOTE(review): whether columns that held placeholders end up numeric or
    # stay object dtype may depend on the pandas version; object columns are
    # skipped by the mean fill below - confirm.
    df = df.dropna(subset=['rate']).replace(['-', 'NM'], np.nan)

    # Fill NaN values in non-object columns with the column mean
    for col in df.columns:
        if df[col].dtype != 'object':
            df[col] = df[col].fillna(df[col].mean())

    # Drop columns that are entirely empty, then any row still missing a value
    df = df.dropna(axis=1, how='all')
    df = df.dropna()

    return df

In [7]:
# Seed the generator that data_processing uses for its random rating
# perturbation, so a fresh Run All reproduces the same labels.
random.seed(42)

# Build each year's table, then concatenate once instead of growing a frame
# inside the loop.
df = pd.concat(
    [data_processing(year) for year in [2019, 2020, 2021, 2022]],
    ignore_index=True,
)

In [None]:
# Display the combined multi-year table
df

In [None]:
# Draw a reproducible random subset of 100 rows and renumber the index from 0
df = df.sample(n=100, random_state=42).reset_index(drop=True)
df

In [None]:
# Split into a numeric-feature frame and a text frame; both keep the "rate" target
df_numeric = df.drop(columns="text")
df_text = df.loc[:, ["text", "rate"]]

# 純數值型

In [None]:
# Encode the rating buckets as consecutive integer class ids
le = LabelEncoder()

X = df_numeric.drop('rate', axis=1)
y = le.fit_transform(df_numeric['rate'])

# Split first so the held-out rows never influence the scaling statistics
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize with the training rows' mean/std only, then apply to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Standardized feature values of the training split
X_train

In [None]:
# Create the XGBoost model. The target is the rating bucket (more than two
# classes), so use the multi-class objective, consistent with the text-only
# and combined-feature models later in the notebook.
model = xgb.XGBClassifier(
    objective='multi:softprob',
    max_depth=3,
    learning_rate=0.1,
    n_estimators=1000,
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the held-out test set
y_test_pred = model.predict(X_test)

# Calculate evaluation metrics (weighted averages account for class imbalance)
acc = accuracy_score(y_test, y_test_pred)                             # Accuracy
f1 = f1_score(y_test, y_test_pred, average='weighted')                # Weighted F1-score
precision = precision_score(y_test, y_test_pred, average='weighted')  # Weighted precision
recall = recall_score(y_test, y_test_pred, average='weighted')        # Weighted recall
cm = confusion_matrix(y_test, y_test_pred)                            # Confusion matrix

# Print the evaluation metrics
print(f"Accuracy: {acc:.2f}")
print(f"F1-score: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print("Confusion Matrix:")
print(cm)

# 純文本

In [None]:
# Hold out 20% of the text rows for testing, with a fixed seed for repeatability
df_train, df_test = train_test_split(
    df_text,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Fetch the pretrained DistilBERT tokenizer and encoder; the encoder is placed on the chosen device
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")
model = model.to(device)

In [None]:
# Tokenize the text of the training and test sets (padded, truncated) and move
# the resulting tensors to the same device as the model; without this the
# forward pass fails whenever the model was placed on the GPU.
tokenized_train = tokenizer(df_train["text"].values.tolist(), padding=True, truncation=True, return_tensors="pt").to(device)
tokenized_test = tokenizer(df_test["text"].values.tolist(), padding=True, truncation=True, return_tensors="pt").to(device)

# Pass the tokenized text through the DistilBERT model to get the hidden states
# (no gradients are needed for feature extraction)
with torch.no_grad():
    hidden_train = model(**tokenized_train)
    hidden_test = model(**tokenized_test)

# Keep only the first-token ([CLS]) hidden state of each document and bring it
# back to the CPU so it can be combined with the NumPy features later on
cls_train = hidden_train.last_hidden_state[:, 0, :].cpu()
cls_test = hidden_test.last_hidden_state[:, 0, :].cpu()

In [None]:
# Encode the target variable
le = LabelEncoder()

# Text-only features for the training set: the [CLS] hidden states
x_train = cls_train
y_train = df_train["rate"]
y_train = le.fit_transform(y_train)

# Test set: reuse the mapping learned from the training labels. Refitting on
# the test labels could assign different integers to the same rating bucket
# when a class is missing from one split. transform() raises ValueError if the
# test split contains a label that never occurred in training.
x_test = cls_test
y_test = df_test["rate"]
y_test = le.transform(y_test)

# Print the shapes of the input and target tensors
print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Hyper-parameters of the text-only XGBoost classifier
xgb_params = {
    'objective': 'multi:softprob',   # multi-class objective
    'n_estimators': 1000,            # number of boosting iterations
    'max_depth': 7,                  # maximum depth of each tree
    'learning_rate': 0.1,            # boosting learning rate
    'random_state': 42,              # fixed seed for reproducibility
}
model = xgb.XGBClassifier(**xgb_params)

# Fit on the training split, then predict the held-out split
model.fit(x_train, y_train)
y_test_pred = model.predict(x_test)

# Evaluation metrics; per-class scores are combined with the 'weighted' average
acc = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred, average='weighted')
precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')
cm = confusion_matrix(y_test, y_test_pred)

# Report the results
print(f"Accuracy: {acc:.2f}")
print(f"F1-score: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print("Confusion Matrix:")
print(cm)

# 數值 + 文本

In [None]:
# Encode the target variable
le = LabelEncoder()

# Concatenate the [CLS] token hidden states and the numeric features for the
# training set. Row alignment relies on both splits having been produced with
# the same test_size and random_state over the same rows.
x_train = torch.cat((cls_train, torch.from_numpy(X_train)), 1)
y_train = df_train["rate"]
y_train = le.fit_transform(y_train)

# Same concatenation for the test set. The labels reuse the mapping learned
# from the training labels; refitting on the test labels could assign
# different integers to the same rating bucket when a class is missing from
# one split. transform() raises ValueError on a label unseen in training.
x_test = torch.cat((cls_test, torch.from_numpy(X_test)), 1)
y_test = df_test["rate"]
y_test = le.transform(y_test)

# Print the shapes of the input and target tensors
print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# XGBoost classifier for the combined (text embedding + numeric) features
model = xgb.XGBClassifier(
    objective='multi:softprob',  # multi-class objective
    n_estimators=1000,           # number of boosting iterations
    max_depth=7,                 # maximum depth of each tree
    learning_rate=0.1,           # boosting learning rate
    random_state=42,             # fixed seed for reproducibility
)

# Fit on the training split
model.fit(x_train, y_train)

# Predict the held-out split
y_test_pred = model.predict(x_test)

# Evaluation metrics; per-class scores are combined with the 'weighted' average
acc = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred, average='weighted')
precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')
cm = confusion_matrix(y_test, y_test_pred)

# Report the results
print(f"Accuracy: {acc:.2f}")
print(f"F1-score: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print("Confusion Matrix:")
print(cm)