### 알파벳 빈도수 기반 언어 식별 모델
* 데이터셋: lang.zip
* 피쳐/속성: 알파벳 26개
* 타겟/라벨: class 칼럼 (클래스 4개)
		* -> 0: en, 1:fr, 2:id, 3:tl
* 학습방법:	지도학습 >> 분류 >> 다중분류 (클래스: 4개)
* 알고리즘: 딥러닝 층: 3개 (입력층, 은닉층:1개, 출력층)

In [23]:
# 딥러닝 관련 모듈
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim 
from torchmetrics.classification import MulticlassF1Score
from torchinfo import summary

# 데이터 전처리 & 시각화 관련 모듈
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#### [1] 데이터 로드 & 전처리 & 타겟/피쳐 분리

In [24]:
path_train = "../language/train_feature.csv"
path_test  = "../language/test_feature.csv"

trainDF = pd.read_csv(path_train)
testDF = pd.read_csv(path_test)

print(trainDF.head(3))
print()
print(testDF.head(3))

          a         b         c         d         e         f         g  \
0  0.075952  0.012840  0.045702  0.046137  0.105332  0.015669  0.019151   
1  0.084178  0.019912  0.030404  0.038870  0.136998  0.017408  0.031239   
2  0.071646  0.012172  0.045643  0.032642  0.120055  0.014661  0.025173   

          h         i         j  ...         r         s         t         u  \
0  0.043743  0.073993  0.001741  ...  0.077693  0.061371  0.080522  0.025898   
1  0.027423  0.075355  0.002623  ...  0.090140  0.071659  0.077739  0.030643   
2  0.023513  0.094606  0.002490  ...  0.053942  0.087967  0.081051  0.029046   

          v         w         x         y         z  class  
0  0.009793  0.014146  0.000653  0.020022  0.000435     en  
1  0.013712  0.013950  0.002027  0.010731  0.000596     en  
2  0.018811  0.011895  0.000553  0.017981  0.000553     en  

[3 rows x 27 columns]

          a         b         c         d         e         f         g  \
0  0.067823  0.013459  0.034328  0.

In [25]:
# 'class' 칼럼 인코딩
# 0:en, 1:fr, 2:id, 3:tl

lang_class = trainDF['class'].unique().tolist()
print(lang_class)

labels = dict(zip(lang_class, range(len(lang_class))))
print(labels)

# add encoded class column to trainDF/testDF
trainDF['class_encd'] = trainDF['class'].replace(labels)
print(trainDF.head(3))
testDF['class_encd'] = testDF['class'].replace(labels)
print()
print(testDF.head(3))

['en', 'fr', 'id', 'tl']
{'en': 0, 'fr': 1, 'id': 2, 'tl': 3}
          a         b         c         d         e         f         g  \
0  0.075952  0.012840  0.045702  0.046137  0.105332  0.015669  0.019151   
1  0.084178  0.019912  0.030404  0.038870  0.136998  0.017408  0.031239   
2  0.071646  0.012172  0.045643  0.032642  0.120055  0.014661  0.025173   

          h         i         j  ...         s         t         u         v  \
0  0.043743  0.073993  0.001741  ...  0.061371  0.080522  0.025898  0.009793   
1  0.027423  0.075355  0.002623  ...  0.071659  0.077739  0.030643  0.013712   
2  0.023513  0.094606  0.002490  ...  0.087967  0.081051  0.029046  0.018811   

          w         x         y         z  class  class_encd  
0  0.014146  0.000653  0.020022  0.000435     en           0  
1  0.013950  0.002027  0.010731  0.000596     en           0  
2  0.011895  0.000553  0.017981  0.000553     en           0  

[3 rows x 28 columns]

          a         b         c         

In [26]:
# 타겟 & 피쳐 분리
targetDF = trainDF[[trainDF.columns[-1]]]
featureDF = trainDF[trainDF.columns[:-2]]

print(featureDF.shape, targetDF.shape)

X_train, X_val, y_train, y_val = train_test_split(featureDF, targetDF,
                                                  stratify=targetDF,
                                                  random_state=10)
print()
print(X_train.shape)
print(y_train.shape)
print()
print(X_val.shape)
print(y_val.shape)

(20, 26) (20, 1)

(15, 26)
(15, 1)

(5, 26)
(5, 1)


In [27]:
print(y_train.value_counts())
print(y_val.value_counts())

class_encd
0             4
2             4
3             4
1             3
Name: count, dtype: int64
class_encd
1             2
0             1
2             1
3             1
Name: count, dtype: int64


#### [2] 모델 클래스 설계 및 정의
- 클래스목적 : lang data 학습 및 언어 클래스 추론 목적
- 클래스이름 : LangMCFModel
- 부모클래스 : nn.Module
- 매개__변수 : 층별 입출력 개수 고정 -> 필요 X
- 속성__필드 : 
- 기능__역할 : __init__() : 모델 구조 설정 ,  forward() : 순방향 학습 <= 오바라이딩(overriding)
- 클래스구조 
    * 입력층 : 입력  26개(피쳐)      출력    50개 AF: ReLU
    * 은닉층 : 입력 50개            출력    10개  AF: ReLU
    * 출력층 : 입력 10개            출력    4개  AF: X


In [28]:
class LangMCFModel(nn.Module):
    
	def __init__(self):
		super().__init__()

		self.in_layer = nn.Linear(26, 50)
		self.hd_layer = nn.Linear(50, 10)
		self.ot_layer = nn.Linear(10, 4)

	def forward(self, x):

		y = F.relu(self.in_layer(x))
		y = F.relu(self.hd_layer(y))
		return self.ot_layer(y)
		

In [29]:
# [테스트] 사용자 정의 모델 확인
model = LangMCFModel()

print(model)
print()
summary(model, input_size=(20000, 26))



LangMCFModel(
  (in_layer): Linear(in_features=26, out_features=50, bias=True)
  (hd_layer): Linear(in_features=50, out_features=10, bias=True)
  (ot_layer): Linear(in_features=10, out_features=4, bias=True)
)



Layer (type:depth-idx)                   Output Shape              Param #
LangMCFModel                             [20000, 4]                --
├─Linear: 1-1                            [20000, 50]               1,350
├─Linear: 1-2                            [20000, 10]               510
├─Linear: 1-3                            [20000, 4]                44
Total params: 1,904
Trainable params: 1,904
Non-trainable params: 0
Total mult-adds (M): 38.08
Input size (MB): 2.08
Forward/backward pass size (MB): 10.24
Params size (MB): 0.01
Estimated Total Size (MB): 12.33

#### [3] 데이터셋 클래스 설계 및 정의

In [30]:
class LangDataset(Dataset):
    
	def __init__(self, featureDF, targetDF):

		self.featureDF = featureDF
		self.targetDF  = targetDF
		self.n_rows = featureDF.shape[0]
		self.n_features = featureDF.shape[1]

	def __len__(self):
		return self.n_rows
	
	def __getitem__(self, idx):

		featureTS = torch.FloatTensor(self.featureDF.iloc[idx].values)
		targetTS = torch.FloatTensor(self.targetDF.iloc[idx].values)

		return featureTS, targetTS


In [31]:
# [테스트] 데이터셋 인스턴스 생성

langDS = LangDataset(featureDF, targetDF)

langDL = DataLoader(langDS)

for feature, label in langDL:
    print(feature.shape, label.shape)
    break

torch.Size([1, 26]) torch.Size([1, 1])


#### [4] 학습준비

In [32]:
EPOCH = 1000
BACTCH_SIZE = 5
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LR = 0.001

In [33]:
# 학습용, 검증용, 테스트용 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(featureDF, targetDF,
                                                  stratify=targetDF,
                                                  random_state=10)
print(f'{X_train.shape}  {X_val.shape}')
print(f'{y_train.shape}  {y_val.shape}')
print(f'{y_train.value_counts()} {y_val.value_counts()}')
print()

# 데이터셋 분리
trainDS = LangDataset(X_train, y_train)
validDS = LangDataset(X_val, y_val)
testDS  = LangDataset(testDF[testDF.columns[:-2]], testDF[[testDF.columns[-1]]])

# 학습용 데이터로더 인스턴스
trainDL = DataLoader(trainDS, batch_size = BACTCH_SIZE)

(15, 26)  (5, 26)
(15, 1)  (5, 1)
class_encd
0             4
2             4
3             4
1             3
Name: count, dtype: int64 class_encd
1             2
0             1
2             1
3             1
Name: count, dtype: int64



In [34]:
# 최적화 인스턴스
optimizer = optim.Adam(model.parameters(), lr=LR)

# 손실함수 인스턴스
crossLoss = nn.CrossEntropyLoss()

#### [5] 학습 진행

- > 모델 저장 준비

In [35]:
SAVE_PATH = r'C:/Users/KDP-43/Desktop/딥러닝/0919/models/lang/'

SAVE_MODEL = 'model_all.pth'

if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

- > 학습 진행

In [36]:
## 학습의 효과 확인 손실값과 성능평가값 저장 필요
LOSS_HISTORY, SCORE_HISTORY=[[],[]], [[],[]]
CNT=len(trainDL)
print(f'BATCH_CNT => {CNT}')

## 학습 모니터링 설정 -------------------------------------
BREAK_CNT = 0
LIMIT_VALUE = 30

for epoch in range(EPOCH):
    # 학습 모드로 모델 설정
    model.train()
    
    # 배치 크기 만큼 데이터 로딩해서 학습 진행
    loss_total, score_total=0,0
    for featureTS, targetTS in trainDL:
        # 학습 진행
        pre_y=model(featureTS)
        
        # 손실 계산 : nn.CrossEntropyLoss 요구사항 : 정답/타겟은 0D 또는 1D,  타입은 long
        loss=crossLoss(pre_y, targetTS.reshape(-1).long())
        loss_total += loss.item()
        
        # 성능평가 계산
        score=MulticlassF1Score(num_classes=4)(pre_y, targetTS.reshape(-1))
        score_total += score.item()
        
        # 최적화 진행
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    # 에포크 당 검증기능
    # 모델 검증 모드 설정
    model.eval()
    with torch.no_grad():
        # 검증 데이터셋
        val_featrueTS=torch.FloatTensor(validDS.featureDF.values)
        val_targetTS=torch.FloatTensor(validDS.targetDF.values)
        
        # 추론/평가
        pre_val=model(val_featrueTS)
        print(pre_val.shape, val_targetTS.reshape(-1).shape)
        
        # 손실
        loss_val=crossLoss(pre_val, val_targetTS.reshape(-1).long())
        score_val=MulticlassF1Score(num_classes=3)(pre_val, val_targetTS.reshape(-1))

    # 에포크 당 손실값과 성능평가값 저장    
    LOSS_HISTORY[0].append(loss_total/CNT)
    SCORE_HISTORY[0].append(score_total/CNT)
    
    LOSS_HISTORY[1].append(loss_val)
    SCORE_HISTORY[1].append(score_val)
    
    print(f'[{epoch}/{EPOCH}]\n- [TRAIN] LOSS : {LOSS_HISTORY[0][-1]} SCORE : {SCORE_HISTORY[0][-1]}')
    print(f'- [VALID] LOSS : {LOSS_HISTORY[1][-1]} SCORE : {SCORE_HISTORY[1][-1]}')

	# 모델 모니터링---------------------------------------------------
    # 검증 DS 기준
    
    if len(SCORE_HISTORY[1]) > 1:
        if SCORE_HISTORY[1][-1] <= SCORE_HISTORY[1][-2]:
            BREAK_CNT += 1
            
	# 좋은 성능 학습 모델 & 파라미터 저장
    SAVE_FILE = f'Model_train_wb_{epoch}_{score_val:.4f}.pth'
    
    if len(SCORE_HISTORY[1]) ==1:
        torch.save(model.state_dict(), SAVE_PATH+SAVE_FILE)
        torch.save(model, SAVE_PATH+SAVE_MODEL)
        

    else:
        torch.save(model.state_dict(), SAVE_PATH+SAVE_FILE)
        torch.save(model, SAVE_PATH+SAVE_MODEL)
        

	# 학습 중단 여부 결정
    if BREAK_CNT > LIMIT_VALUE:
        print(f"score 개선 변화가 없어 {epoch} EPOCH에서 학습을 중단.")
        break
    


BATCH_CNT => 3
torch.Size([5, 4]) torch.Size([5])


ValueError: If `preds` have one dimension more than `target`, `preds.shape[1]` should be equal to number of classes.