In [1]:
import pandas as pd
import numpy as np
import os
import glob
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [2]:
# 2021

# Column names that are referenced several times in this cell.
TICKER_COL = 'Exchange:Ticker'
RATING_COL = 'S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating)'

# Load the 2021 financial export; row index 7 (0-based) of the sheet supplies the column names.
df = pd.read_excel("財務數據excel/2021data.xls", header=7)

# Keep only the text after the last ':' in the ticker column.
df[TICKER_COL] = df[TICKER_COL].str.split(':').str[-1]

# Columns removed before any further processing.
unused_columns = [
    "Company Name",
    "SIC Codes",
    "SIC Codes (Primary Code Only)",
    "SIC Codes (Primary)",
    "Company Type",
    "Total Trading Assets, Dom. - Capital IQ [CY 2021] ($USDmm, Historical rate)",
    "Retained Earnings - Capital IQ [CY 2021] ($USDmm, Historical rate)",
]
df = df.drop(columns=unused_columns)

# Letter grades, listed for reference (not used again in this cell).
rating_list = ['A', 'A+', 'A-', 'AA', 'AA+', 'AA-', 'AAA',
               'B', 'B+', 'B-', 'BB', 'BB+', 'BB-', 'BBB', 'BBB+', 'BBB-',
               'CCC', 'CCC+', 'CCC-', 'D']

# Letter grade -> bucket 1..4; 'NR' is turned into a missing value.
rating_map = {
    **dict.fromkeys(['AAA', 'AA+', 'AA', 'AA-', 'A+', 'A', 'A-'], 1),
    **dict.fromkeys(['BBB+', 'BBB', 'BBB-'], 2),
    **dict.fromkeys(['BB+', 'BB', 'BB-'], 3),
    **dict.fromkeys(['B+', 'B', 'B-', 'CCC+', 'CCC', 'CCC-', 'D'], 4),
    'NR': np.nan,
}

# Map ratings to their numeric bucket; values without an entry in rating_map pass through unchanged.
df[RATING_COL] = df[RATING_COL].map(lambda rating: rating_map.get(rating, rating))

# The cleaning below can be removed once the source data has been completed.
# '-' is treated as a missing value in every column.
for name in df.columns:
    df[name] = df[name].replace('-', np.nan)

# 'NM' in the payout ratio is treated as missing as well.
payout_col = "Payout Ratio - Capital IQ [CY 2021] (%)"
df[payout_col] = df[payout_col].replace('NM', np.nan)

# Ticker and rating are filled with their most frequent value, every other column with its mean.
mode_filled = (TICKER_COL, RATING_COL)
for name in df.columns:
    column = df[name]
    fill_value = column.mode().iloc[0] if name in mode_filled else column.mean()
    df[name] = column.fillna(fill_value)

# Attach the 10-K text of each ticker in a new 'text' column.
text_dir = '10-K文字檔/2021data/'
for symbol in df[TICKER_COL]:
    matches = glob.glob(os.path.join(text_dir, f"{symbol}_*.txt"))
    rows = df[TICKER_COL] == symbol
    if not matches:
        # No file for this ticker: leave the text missing.
        df.loc[rows, 'text'] = np.nan
        continue
    # Read the first matching file and store its content.
    with open(matches[0], 'r') as handle:
        df.loc[rows, 'text'] = handle.read()

# dropna() removes every row that still has a missing value,
# which drops the rows whose ticker had no matching text file.
df = df.dropna()

df

Unnamed: 0,Exchange:Ticker,S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating),"Total Assets - Capital IQ [CY 2021] ($USDmm, Historical rate)","Total Capital - Capital IQ [CY 2021] ($USDmm, Historical rate)","Sales/Sq. Ft., All (Net) - Capital IQ [CY 2021] ($USD, Historical rate)","Total Equity - Capital IQ [LTM] ($USDmm, Historical rate)","Total Enterprise Value [My Setting] [Latest - 3 Year(s)] ($USDmm, Historical rate)","Market Capitalization [My Setting] [Latest - 3 Year(s)] ($USDmm, Historical rate)",Shares Outstanding [My Setting] [Latest - 3 Year(s)] (mm),"EBITDA - Capital IQ [CY 2021] ($USDmm, Historical rate)","Net Income - Capital IQ [CY 2021] ($USDmm, Historical rate)","Operating Income - Capital IQ [CY 2021] ($USDmm, Historical rate)",Cash Dividends to Net Income - Capital IQ [CY 2021] (%),"All Other Identifiable Intangible Assets - Capital IQ [CY 2021] ($USDmm, Historical rate)","Net Working Capital - Capital IQ [CY 2021] ($USD, Historical rate)",Net Working Capital/ Total Assets - Capital IQ [LTM],"Book Value/Share - Capital IQ [CY 2021] ($USD, Historical rate)","Cash Dividends - Compustat [CY 2021] ($USDmm, Historical rate)",Payout Ratio - Capital IQ [CY 2021] (%),text
1,FLWS,2.0,1321.3,877.80000,327.54,485.3,2085.500000,2067.70,65.100,167.100000,90.00,121.0,29.500595,242.347939,-86.800000,0.027000,8.66,0.000,53.66197,Item 7.\nMANAGEMENT’S DISCUSSION AND ANALYSIS ...
3,TXG,2.0,1018.8,899.50000,327.54,718.8,16822.300000,17367.00,109.800,-31.800000,-58.20,-52.9,29.500595,242.347939,54.900000,0.106000,7.27,0.000,53.66197,Item 7. Management’s Discussion and Analysis o...
5,EFSH,2.0,47.0,33.90000,327.54,-28.3,18.100000,9.47,0.012,-0.195000,-3.31,-1.1,29.500595,242.347939,-1.550000,-0.197000,-85.00,-1.030,53.66197,ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
7,SRCE,2.0,8096.3,7111.43281,327.54,1081.5,14811.279084,1234.90,25.300,878.501129,118.50,156.8,25.710000,0.060000,19.099566,-0.045218,37.00,-31.300,26.40000,Item 7. Management’s Discussion and Analysis o...
9,XXII,2.0,76.0,68.40000,327.54,-11.6,663.200000,693.60,0.635,-27.100000,-32.60,-28.3,29.500595,242.347939,-1.870000,-0.254000,97.40,0.000,53.66197,Item 7.Management’s Discussion and Analysis of...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4287,ZI,3.0,6852.9,3306.00000,327.54,2005.9,8681.400000,7855.80,185.500,187.400000,116.80,153.4,29.500595,242.347939,-278.400000,-0.055000,4.95,0.000,17.00000,ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
4288,ZS,2.0,2422.0,1529.00000,327.54,958.9,22617.100000,23120.70,135.900,-231.100000,-330.70,-268.7,29.500595,242.347939,-448.400000,-0.217000,3.82,0.000,53.66197,Item 7. Management’s Discussion and Analysis o...
4289,ZUMZ,2.0,862.0,736.20000,327.54,353.2,1002.500000,1065.00,25.700,180.800000,119.30,157.9,29.500595,242.347939,32.300000,0.108000,22.10,0.000,53.66197,Item 7.\nMANAGEMENT’S DISCUSSION AND ANALYSIS ...
4290,ZUO,2.0,441.3,229.40000,327.54,133.7,1713.200000,1830.50,122.200,-56.900000,-99.40,-73.7,29.500595,242.347939,-93.500000,-0.133000,1.33,0.000,53.66197,Item 7. Management’s Discussion and Analysis o...


In [None]:
# 2022

# Load the 2022 financial export; row index 7 (0-based) of the sheet supplies the column names.
df = pd.read_excel("財務數據excel/2022data.xls", header=7)
# Keep only the text after the last ':' in the ticker column.
df['Exchange:Ticker'] = df['Exchange:Ticker'].str.split(':').str[-1]
# Remove the columns that are not carried into the rest of the notebook.
df = df.drop(columns=["Company Name", 
                        "SIC Codes", 
                        "SIC Codes (Primary Code Only)",
                        "SIC Codes (Primary)", 
                        "Company Type",
                        "Total Trading Assets, Dom. - Capital IQ [FY 2022] ($USDmm, Historical rate)",
                        "Retained Earnings - Capital IQ [FY 2022] ($USDmm, Historical rate)"])

# Letter grades, listed for reference (not used again in this cell).
rating_list = ['A', 
                'A+',
                'A-', 
                'AA',
                'AA+',
                'AA-',
                'AAA',
                'B',
                'B+',
                'B-',
                'BB',
                'BB+',
                'BB-',
                'BBB',
                'BBB+',
                'BBB-',
                'CCC',
                'CCC+',
                'CCC-',
                'D']

# Letter grade -> bucket: 1 = A- and above, 2 = BBB range, 3 = BB range,
# 4 = B+ and below (including CCC and D); 'NR' becomes a missing value.
rating_map = {
    'AAA': 1, 'AA+': 1, 'AA': 1, 'AA-': 1, 'A+': 1, 'A': 1, 'A-': 1,
    'BBB+': 2, 'BBB': 2, 'BBB-': 2,
    'BB+': 3, 'BB': 3, 'BB-': 3,
    'B+': 4, 'B': 4, 'B-': 4, 'CCC+': 4, 'CCC': 4, 'CCC-': 4, 'D': 4,
    'NR': np.nan
}

# Map ratings to their numeric bucket; values without an entry in rating_map pass through unchanged.
df['S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating)'] = df['S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating)'].map(lambda x: rating_map.get(x, x))

# The cleaning below can be removed once the source data has been completed.
# '-' is treated as a missing value in every column, 'NM' additionally in the payout ratio.
for col in df.columns:
    df[col] = df[col].replace('-', np.nan)
df["Payout Ratio - Capital IQ [FY 2022] (%)"] = df["Payout Ratio - Capital IQ [FY 2022] (%)"].replace('NM', np.nan)

# Ticker and rating are filled with their most frequent value, every other column with its mean.
# NOTE(review): this also assigns the most frequent rating bucket to rows whose rating is
# missing or 'NR', and the means are computed before any train/validation split.
for col in df.columns:
    if col in ["Exchange:Ticker","S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating)"]:
        df[col] = df[col].fillna(df[col].mode().iloc[0])  
    else:
        df[col] = df[col].fillna(df[col].mean())  

# Attach the 10-K text of each ticker in a new 'text' column.
# Fix: this cell previously searched the 2021 text folder, pairing 2022 financials with
# 2021 filings. NOTE(review): the folder name is assumed to follow the '2021data' naming
# used for the 2021 cell and the Excel files — confirm it exists on disk.
for ticker in df['Exchange:Ticker']:
    txt_files = glob.glob(os.path.join('10-K文字檔/2022data/', f"{ticker}_*.txt"))
    if txt_files:
        # Read the content of the first matching file.
        # NOTE(review): opened with the platform default encoding — confirm it matches the files.
        with open(txt_files[0], 'r') as f:
            content = f.read()
        # Store the content in the new column.
        df.loc[df['Exchange:Ticker'] == ticker, 'text'] = content
    else:
        # No matching file: leave the text missing.
        df.loc[df['Exchange:Ticker'] == ticker, 'text'] = np.nan

# dropna() removes every row that still has a missing value,
# which drops the rows whose ticker had no matching text file.
df = df.dropna() 

df

In [3]:
# Down-sample the frame to a fixed number of rows; the seed makes the draw repeatable.
SAMPLE_SIZE = 300
SAMPLE_SEED = 42
df = df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)
model = model.to(device)

# 80/20 train/validation split with a fixed seed.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

# Tabular features: everything except the first two columns and the last column
# (ticker, rating and text in the frame displayed by the data-preparation cell).
feature_columns = slice(2, -1)
general_features_train = df_train.iloc[:, feature_columns]
general_features_val = df_val.iloc[:, feature_columns]

In [5]:
# Tokenize the 10-K texts (padded to a common length, truncated to the model limit) and
# move the resulting tensors to the same device as the encoder. Without the move the
# forward pass fails with a device mismatch whenever `device` is CUDA.
tokenized_train = tokenizer(df_train["text"].values.tolist(), padding = True, truncation = True, return_tensors="pt").to(device)
tokenized_val = tokenizer(df_val["text"].values.tolist() , padding = True, truncation = True,  return_tensors="pt").to(device)

# Inference only: no gradients are needed for feature extraction.
with torch.no_grad():
    hidden_train = model(**tokenized_train)
    hidden_val = model(**tokenized_val)

# Keep only the hidden state at position 0 (the [CLS] token) of every sequence and bring
# it back to the CPU, so it can be concatenated with CPU tensors built from numpy arrays.
cls_train = hidden_train.last_hidden_state[:,0,:].cpu()
cls_val = hidden_val.last_hidden_state[:,0,:].cpu()

In [6]:
le = LabelEncoder()

# Training inputs: [CLS] vectors concatenated along dim 1 with the tabular features.
x_train = torch.cat((cls_train, torch.from_numpy(general_features_train.values)), 1)
y_train = df_train["S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating)"]
# The label encoding is learned from the training labels only.
y_train = le.fit_transform(y_train)

# Validation inputs are assembled the same way.
x_val = torch.cat((cls_val, torch.from_numpy(general_features_val.values)), 1)
y_val = df_val["S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating)"]
# Reuse the encoding fitted on the training labels. Re-fitting on the validation labels
# would build a second mapping from whichever classes occur in the validation split, so
# the same integer could stand for different ratings in y_train and y_val.
# transform() raises ValueError if the validation split contains a rating unseen in training.
y_val = le.transform(y_val)

print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

torch.Size([240, 785]) (240,) torch.Size([60, 785]) (60,)


In [7]:
# Build the XGBoost classifier.
# Note: this rebinds `model`, which held the DistilBERT encoder in the earlier cells.
xgb_params = dict(
    objective='multi:softprob',
    n_estimators=1000,
    max_depth=7,
    learning_rate=0.1,
    random_state=42,
)
model = xgb.XGBClassifier(**xgb_params)

# Train the model.
model.fit(x_train, y_train)

# Predict on the validation set.
y_val_pred = model.predict(x_val)

# Compute the evaluation metrics (F1, precision and recall are support-weighted averages).
acc = accuracy_score(y_val, y_val_pred)
f1, precision, recall = (
    metric(y_val, y_val_pred, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)
cm = confusion_matrix(y_val, y_val_pred)

# Report: one metric per line, followed by the confusion matrix.
print(
    f"Accuracy: {acc:.2f}",
    f"F1-score: {f1:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    "Confusion Matrix:",
    sep="\n",
)
print(cm)

Accuracy: 0.67
F1-score: 0.59
Precision: 0.52
Recall: 0.67
Confusion Matrix:
[[ 0  3  0  0]
 [ 1 40  1  2]
 [ 0  8  0  0]
 [ 0  5  0  0]]


In [8]:
# Display the validation split produced by train_test_split for inspection.
df_val

Unnamed: 0,Exchange:Ticker,S&P Entity Credit Rating - Issuer Credit Rating - Local Currency LT [Latest] (Rating),"Total Assets - Capital IQ [CY 2021] ($USDmm, Historical rate)","Total Capital - Capital IQ [CY 2021] ($USDmm, Historical rate)","Sales/Sq. Ft., All (Net) - Capital IQ [CY 2021] ($USD, Historical rate)","Total Equity - Capital IQ [LTM] ($USDmm, Historical rate)","Total Enterprise Value [My Setting] [Latest - 3 Year(s)] ($USDmm, Historical rate)","Market Capitalization [My Setting] [Latest - 3 Year(s)] ($USDmm, Historical rate)",Shares Outstanding [My Setting] [Latest - 3 Year(s)] (mm),"EBITDA - Capital IQ [CY 2021] ($USDmm, Historical rate)","Net Income - Capital IQ [CY 2021] ($USDmm, Historical rate)","Operating Income - Capital IQ [CY 2021] ($USDmm, Historical rate)",Cash Dividends to Net Income - Capital IQ [CY 2021] (%),"All Other Identifiable Intangible Assets - Capital IQ [CY 2021] ($USDmm, Historical rate)","Net Working Capital - Capital IQ [CY 2021] ($USD, Historical rate)",Net Working Capital/ Total Assets - Capital IQ [LTM],"Book Value/Share - Capital IQ [CY 2021] ($USD, Historical rate)","Cash Dividends - Compustat [CY 2021] ($USDmm, Historical rate)",Payout Ratio - Capital IQ [CY 2021] (%),text
1336,ENG,2.0,42.1,34.6,327.54,-2.85,63.7,69.1,3.5,-12.9,-5.69,-13.5,29.500595,242.347939,8.45,-0.07,6.4,0.0,53.66197,ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
415,ATNI,2.0,1608.6,1249.7,327.54,713.3,907.8,753.3,15.7,130.2,-22.1,19.7,29.500595,242.347939,-42.7,0.002,38.8,-10.8,53.66197,ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
2823,OPEN,2.0,9506.0,9357.0,327.54,899.0,7418.6,8734.7,577.7,-517.0,-662.0,-564.0,29.500595,242.347939,6976.0,0.653,3.65,0.0,53.66197,Item 7. Management’s Discussion and Analysis o...
3288,RTX,2.0,161404.0,108252.0,327.54,62137.0,155427.2,129284.2,1506.6,11193.0,3864.0,6636.0,29.500595,242.347939,-662.0,-0.013,49.2,-2957.0,76.5,ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
1847,HXL,3.0,2819.4,2359.5,327.54,1632.1,5278.6,4448.3,83.8,208.6,16.1,70.6,29.500595,242.347939,252.3,0.145,17.7,0.0,53.66197,ITEM 7. Management’s Discussion and Analysis o...
4070,VRCA,2.0,80.1,76.0,327.54,1.51,265.9,311.1,27.4,-30.7,-35.1,-30.9,29.500595,242.347939,-0.137,-0.089,1.19,0.0,53.66197,ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS O...
3958,UCTT,4.0,2025.4,1527.9,327.54,889.1,2180.8,2112.0,41.1,253.2,119.5,185.7,29.500595,242.347939,240.9,0.17,18.9,0.0,53.66197,Item 7.\nManagement’s Discussion and Analysis ...
3829,THR,2.0,626.5,534.3,327.54,467.4,779.1,646.4,33.2,42.7,10.4,23.0,29.500595,242.347939,135.7,0.207,11.7,0.0,53.66197,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...
449,RNA,2.0,427.6,393.2,327.54,830.9,520.6,827.5,37.6,-117.4,-118.0,-118.1,29.500595,242.347939,-22.3,-0.055,7.99,0.0,53.66197,ITEM 7. Management’s Discussion and Analysis o...
1543,FFIC,2.0,8045.9,7111.43281,327.54,669.8,14811.279084,729.6,31.0,878.501129,81.8,112.6,32.43,2.67,19.099566,-0.045218,22.3,-26.5,32.4,Item 7. Management’s Discussion and Analysis o...


In [9]:
# Display the class indices the XGBoost model predicted for the validation set.
y_val_pred

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1], dtype=int64)