## Import

In [1]:
from datetime import datetime
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader,random_split
from tqdm import tqdm

SEED =37

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(SEED) # Seed 고정
def norm(df):
    df_normalized = (df - df.mean()) / (df.max() - df.min())
    return df_normalized
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [3]:
class ConfusionMatrix:
  def __init__(self,classes = None):
    self.cm = None
   
    if classes is not None:
      self.classes = classes
      self.n_classes = len(classes)

  def get_conf_matrix(self,actual,pred):
    if self.classes is None:
      self.classes = torch.unique(actual)
      self.n_classes = len(self.classes)
    conf_matrix = torch.zeros((self.n_classes, self.n_classes), dtype=torch.int32)
    for i in range(self.n_classes):
        for j in range(self.n_classes):
            conf_matrix[i, j] = torch.sum((actual == self.classes[i]) & (pred == self.classes[j]))

    return conf_matrix

  def update(self,actual,pred):

    curr_cm = self.get_conf_matrix(actual,pred)
    if self.cm is not None:
      self.cm += curr_cm
    else:
      self.cm = curr_cm

  def get_result(self):
    return self.cm

## Model Define

In [41]:
class FeatureExtractor(nn.Module):
    def __init__(self,in_features= 2884, num_classes=3):
        super().__init__()
        self.in_features = in_features
        self.embeding = nn.Conv1d(in_features,in_features,1,1,bias=False)
        self.bn = nn.BatchNorm1d(in_features)
        self.feature = nn.Sequential(
                nn.Linear(in_features,in_features,False),
                nn.BatchNorm1d(in_features),
                nn.ReLU(),
                nn.Linear(in_features,in_features//2,False),
                nn.BatchNorm1d(in_features//2),
                nn.ReLU(),
                # nn.Dropout1d(0.5),
                nn.Linear(in_features//2,in_features//8,False),
                nn.BatchNorm1d(in_features//8),
                nn.ReLU(),
        )

        self.reg = nn.Sequential(
            nn.Linear(in_features//8,in_features//32,False),
            nn.BatchNorm1d(in_features//32),
            nn.ReLU(),
            nn.Linear(in_features//32,1,False),
        )

        self.clf = nn.Sequential(
            nn.Linear(in_features//8,in_features//32,False),
            nn.BatchNorm1d(in_features//32),
            nn.ReLU(),
            nn.Linear(in_features//32,num_classes,False),
        )

    def forward(self,x):
        # feature
        x = x.view(-1,self.in_features,1)
        x = self.embeding(x)
        x = x.view(-1,self.in_features)
        x = torch.sigmoid(self.bn(x))
        x = self.feature(x)

        out1 = self.reg(x)
        out2 = self.clf(x)

        return out1,out2


## Data Load

In [42]:
# train_df = pd.read_csv('./train.csv')
# test_df = pd.read_csv('./test.csv')

In [43]:
class Data(Dataset):
    """
        df : 
        train_df.PRODUCT_CODE.unique(),test_df.PRODUCT_CODE.unique()
        >>> (array(['A_31', 'T_31', 'O_31'], dtype=object),
            array(['T_31', 'A_31', 'O_31'], dtype=object))

        a,b = train_df.LINE.unique() , test_df.LINE.unique()
        a.sort()
        b.sort()
        a,b, a == b
        >>> (array(['T010305', 'T010306', 'T050304', 'T050307', 'T100304', 'T100306'],
            dtype=object),
            array(['T010305', 'T010306', 'T050304', 'T050307', 'T100304', 'T100306'],
            dtype=object),
            array([ True,  True,  True,  True,  True,  True]))
        df['LINE', 'PRODUCT_CODE']  values => onehot vector

        onehot: y ['sparse','categorical']
    """
    def __init__(self,train=True,onehot=True,y_qual=False):
        super().__init__()
        
        self. train = train
        self.y_qual = y_qual
        if train:
            self.df = pd.read_csv('./train.csv')
        else:
            self.df = pd.read_csv('./test.csv')
        self.df = self.df.fillna(0)
        qual_col = ['LINE', 'PRODUCT_CODE']

        for i in qual_col:
            col_names = [f'{i}_{j}' for j in range(len(self.df[i].unique()))]
            onehot_matrix = self._onehot_encoder(self.df[i])
            for k,col in enumerate(col_names):
                self.df[col] = onehot_matrix[:,k]

        if train:        
            self.train_x = self.df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality','LINE', 'PRODUCT_CODE']).to_numpy()
            if onehot:
                self.train_y = self._onehot_encoder(self.df['Y_Class'])
                if y_qual:
                    self.train_y = pd.concat((pd.DataFrame(self.train_y),self.df['Y_Quality']),axis=1).to_numpy()
            else:
                self.train_y = (self.df['Y_Class'])
        else:
            self.test_x = self.df.drop(columns=['PRODUCT_ID', 'TIMESTAMP','LINE', 'PRODUCT_CODE']).to_numpy()

    def _onehot_encoder(self,scalar):
        dic = {key:i for i,key in enumerate(np.sort(np.unique(scalar)))}
        zeros = np.zeros((len(scalar),len(dic.keys())))
        for i,s in enumerate(scalar.to_numpy()):
            zeros[i,dic[s]] = 1
        return zeros
    

    def __getitem__(self,index):
        return (self.train_x[index],self.train_y[index]) if self.train else self.test_x[index]


    def __len__(self):
        return len(self.df)

def onehot_encoder(scalar:np.ndarray):
    dic = {key:i for i,key in enumerate(np.sort(np.unique(scalar)))}
    zeros = np.zeros((len(scalar),len(dic.keys())))
    for i,s in enumerate(scalar.tolist()):
        zeros[i][dic[s]] = 1
    return zeros

## Classification Model Fit

In [44]:
EPOCHS = 100
LR = 3e-1
BATCH = 256

dataset = Data(True,True,True)
train_len = int(dataset.__len__() * 0.8)
val_len = dataset.__len__() - train_len
print(train_len,val_len)
train_set,val_set = random_split(dataset,[train_len,val_len])
train_loader = DataLoader(train_set,BATCH,True)
val_loader = DataLoader(val_set,BATCH,False)


478 120


In [45]:
# for data in train_loader:
#     x,y = data
#     print(y)

In [46]:
model = FeatureExtractor()
cirterion0 = nn.CrossEntropyLoss()
cirterion1 = nn.MSELoss()
# loss func에 y_quality 추가 가능성 있음
optim = torch.optim.SGD(model.parameters(),LR)

In [47]:
def eval(model,data_loader,device):
    model.eval()
    cm = ConfusionMatrix([0,1,2])
    for data in (data_loader):
        x,y = data
        x = x.to(device).float()
        y = y.to(device).float()
        model.to(device)

        with torch.no_grad():
            pred = torch.softmax(model(x),1)
        pred = torch.argmax(pred,1)
        cm.update(torch.argmax(y,1),pred)
    
    pos = torch.sum(torch.Tensor([cm.cm[j,j] for j in range(3)]))
    acc = pos/torch.sum(cm.cm)  
    return acc  

In [60]:
def train(model,data_loader,device,y_qual):
    model.train()
    train_loss = 0.
    i=0
    for data in (data_loader):
        x,y = data
        x = x.to(device).float()
        y = y.to(device).float()
        model.to(device)

        if y_qual:    
            y_clf,y_reg = y[:,:-1],y[:,-1]
            print(y_clf.shape, y_reg.shape)
            pred_reg,pred_clf = model(x)
            print('re',pred_reg.shape,pred_clf.shape)
            loss = cirterion0(pred_clf,y)+cirterion1(pred_reg.unsqueeze(0),y)
        else:
            pred = model(x)
            loss = cirterion0(pred,y)

        optim.zero_grad()
        loss.backward()
        optim.step()
        train_loss += loss.item()
        i+=1
    return model,train_loss/i

In [61]:
def learning(model,train_load,val_loader,device,y_qual):
    train_losses=[]
    val_scores = []
    history = {}
    for ep in range(EPOCHS):
        model,train_loss = train(model,train_loader,device,y_qual)
        print(f'{ep} epochs train loss : {train_loss:.5f}')
        val_acc = eval(model,val_loader,device)
        print(f'{ep} epochs val acc : {val_acc:.5f}')
        train_losses.append(train_loss)
        val_scores.append(val_acc)
    history['train_loss'] = train_losses
    history['val_acc'] = val_scores
    
    return model,history

In [62]:

model, history = learning(model,train_loader,val_loader,device,y_qual=True)

torch.Size([256, 3]) torch.Size([256])
re torch.Size([256, 1]) torch.Size([256, 3])


RuntimeError: 0D or 1D target tensor expected, multi-target not supported

## Inference

In [247]:
def infer(model,data_loader,device):
    model.eval()
    re = np.array([])
    for data in (data_loader):
        y = data
        y = y.to(device).float()
        model.to(device)
        
        with torch.no_grad():
            pred = torch.softmax(model(y),1)
        pred = torch.argmax(pred,1).cpu().numpy()
        re = np.append(re,pred)
    return re.reshape((-1,))

In [248]:
test_data = Data(False)
test_loader = DataLoader(test_data,BATCH,False)
preds = infer(model,test_loader,device)
print('Done.')

Done.


In [287]:
for i in range(3):
    print(len(preds[preds ==i]))

0
284
26


## Submit

In [143]:
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y_%m_%d_%H_%M")
os.makedirs('./result',exist_ok=True)
submit = pd.read_csv('./sample_submission.csv')
submit['Y_Class'] = preds
submit.to_csv(f'./result/baseline_submission_{formatted_datetime}.csv', index=False)