In [1]:
import os,sys
import time
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn

In [2]:
import sys
sys.path.append('/Users/khj/MyPython/lib-python/torch/')

In [3]:
from build_model import EarlyStopping
from torch_seed import seed_everything

In [4]:
SEED = 42
seed_everything(SEED)

In [5]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')

In [6]:
# ID : 샘플 별 고유 ID
# 가입일 : 서비스에 가입한 일수
# 음성사서함이용 : 음성사서함 이용 건수
# 주간통화시간 : 8시 ~ 16시까지의 통화 시간
# 주간통화횟수 : 8시 ~ 16시까지의 통화 횟수
# 주간통화요금 : 8시 ~ 16시까지의 통화 요금
# 저녁통화시간 : 16시 ~ 0시까지의 통화 시간
# 저녁통화횟수 : 16시 ~ 0시까지의 통화 횟수
# 저녁통화요금 : 16시 ~ 0시까지의 통화 요금
# 밤통화시간 : 0시 ~ 8시까지의 통화 시간
# 밤통화횟수 : 0시 ~ 8시까지의 통화 횟수
# 밤통화요금 : 0시 ~ 8시까지의 통화 요금
# 상담전화건수 : 고객센터에 전화를 건 횟수
# 전화해지여부 : 0(서비스 유지)/ 1(서비스 해지)

In [7]:
tr_rename_dict = {
    'ID':'id',
    '가입일':'days_subscription',
    '음성사서함이용':'n_voicemail',
    '주간통화시간':'time_call_morning',
    '주간통화횟수':'n_call_morning',
    '주간통화요금':'pay_call_morning',
    '저녁통화시간':'time_call_evening',
    '저녁통화횟수':'n_call_evening',
    '저녁통화요금':'pay_call_evening',
    '밤통화시간':'time_call_night',
    '밤통화횟수':'n_call_night',
    '밤통화요금':'pay_call_night',
    '상담전화건수':'n_call',
    '전화해지여부':'target',
}
te_rename_dict = tr_rename_dict.copy()
del te_rename_dict['전화해지여부']

train_df = train_df.rename(columns=tr_rename_dict)
test_df  = test_df .rename(columns=te_rename_dict)

In [8]:
# train_df.describe()
# train_df.isnull().sum()

In [9]:
train_df.shape, test_df.shape

((30200, 14), (12943, 13))

In [10]:
train_df.head()

Unnamed: 0,id,days_subscription,n_voicemail,time_call_morning,n_call_morning,pay_call_morning,time_call_evening,n_call_evening,pay_call_evening,time_call_night,n_call_night,pay_call_night,n_call,target
0,TRAIN_00000,329,0,99.2,93,27.3,268.8,68,28.92,262.9,328,32.89,2,0
1,TRAIN_00001,2,80,323.9,323,83.7,269.4,326,32.09,322.8,209,32.32,2,0
2,TRAIN_00002,93,28,282.4,323,34.2,207.0,322,32.82,280.8,328,8.28,0,0
3,TRAIN_00003,223,1,221.4,223,25.1,233.0,61,23.9,203.8,234,9.36,0,0
4,TRAIN_00004,222,0,96.3,222,28.7,223.9,69,28.08,263.1,223,2.8,8,0


In [11]:
target_feature = 'target'
unuse_features = ['id']
cat_features = [] #['n_call']
num_features = [col for col in train_df.columns if col not in [target_feature]+unuse_features+cat_features]

In [12]:
train_df.describe().round(1).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
days_subscription,30200.0,159.7,123.8,1.0,63.0,118.0,228.0,2212.0
n_voicemail,30200.0,12.0,24.9,0.0,0.0,1.0,22.0,1112.0
time_call_morning,30200.0,250.6,84.4,0.0,222.8,234.0,289.6,481.9
n_call_morning,30200.0,159.2,101.6,0.0,88.0,117.5,223.0,489.0
pay_call_morning,30200.0,41.1,23.2,0.0,24.4,30.8,45.9,118.6
time_call_evening,30200.0,263.2,74.1,0.1,223.7,243.0,290.3,481.6
n_call_evening,30200.0,158.5,102.2,0.0,87.0,112.0,223.0,489.0
pay_call_evening,30200.0,25.4,8.1,0.0,22.2,23.3,28.8,50.0
time_call_night,30200.0,263.4,73.8,20.8,223.7,242.8,290.5,481.8
n_call_night,30200.0,157.0,101.7,20.0,87.0,108.0,222.0,490.0


In [13]:
# i=0
# for col in num_features:
#     i+=1
#     print('\n({}/{}) {}'.format(i,len(num_features),col))
#     plt.figure(figsize=(15,7))
#     sns.boxplot(x=train_df[target_feature],y=train_df[col])
#     plt.grid()
#     plt.show()

In [14]:
X = train_df[num_features+cat_features]
y = train_df[target_feature]

X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2,random_state=SEED)

In [15]:
display(y_train.value_counts() / len(y_train))
print('')
display(y_valid.value_counts() / len(y_valid))

0    0.888535
1    0.111465
Name: target, dtype: float64




0    0.896523
1    0.103477
Name: target, dtype: float64

In [16]:
scalers = {}
for col in tqdm(X_train.columns):
    # scaler
    scaler = MinMaxScaler()
    
    # Train
    scaled = scaler.fit_transform(np.array(X_train[col]).reshape(-1,1))
    X_train[col] = scaled
    scalers[col] = scaler
    
    # Valid
    X_valid[col] = scaler.transform(np.array(X_valid[col]).reshape(-1,1))

100%|██████████████████████████████████████████| 12/12 [00:00<00:00, 938.43it/s]


In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, TensorDataset

print('> torch version  :',torch.__version__)
print('> cuda available :',torch.cuda.is_available())

> torch version  : 1.14.0.dev20221202
> cuda available : False


In [18]:
## f1 loss
# https://gist.github.com/SuperShinyEyes/dcc68a08ff8b615442e3bc6a9b55a354

In [19]:
(torch.tensor([0.5,0.7])>0.6).float()

tensor([0., 1.])

In [20]:
def train(
    model, optimizer, train_loader, valid_loader, epochs,
    early_stopping, early_stopping_patience, loss_function,
    early_stopping_verbose=False, device='cpu', scheduler=None, metric_period=1, 
    verbose=True, print_shape=False, save_model_path = './mc/best_model.pt',
    mode=['min','max'], threshold=0.5,
    #transform_y='identity'
):
    assert isinstance(early_stopping,bool), \
        "early_stopping must by type bool"
    #assert transform_y in ['identity','log','sqrt'], \
    #    "transform_y must be one of ['identity','log','sqrt']"
    
    es = EarlyStopping(
        patience=early_stopping_patience,
        verbose=early_stopping_verbose,
        path=save_model_path
    )
    if mode=='min':
        mode_offset = 1
    elif mode=='max':
        mode_offset = -1
    
    model.to(device)
    criterion = loss_function.to(device)

    best_loss  = 999999999 * mode_offset
    best_epoch = 1
    best_model = None
    is_best    = np.nan
    
    start_time = time.time()
    epoch_s = time.time()
    for epoch in range(1, epochs+1):
        
        model.train()
        train_loss = []
        for X, Y in iter(train_loader):

            X = X.float().to(device)
            Y = Y.float().to(device)

            output = model(X).float()
            if print_shape:
                    if epoch==1:
                        print(output.shape,Y.shape) # torch.Size([16, 1]) torch.Size([16, 1])
                        print(output[:2],Y[:2])
            output = (output>=threshold).float()
            output.requires_grad = True
            
            if print_shape:
                    if epoch==1:
                        print(output.shape,Y.shape) # torch.Size([16, 1]) torch.Size([16, 1])
                        print(output[:2],Y[:2])
            
            #Y = seq2list_cuda(Y,device)
            #output = seq2list_cuda(output,device)
            
            #if transform_y=='log':
            #    output = torch.exp(output)
            #    Y      = torch.exp(Y)
            #elif transform_y=='sqrt':
            #    output = output**2
            #    Y      = Y**2
            
            loss = criterion(output, Y)
            #loss = torch.sqrt(loss) # MSE -> RMSE
            
            optimizer.zero_grad()
            loss.backward()  # Getting gradients
            optimizer.step() # Updating parameters

            train_loss.append(loss.item())

        valid_loss = validation(model, valid_loader, criterion, threshold, device) #transform_y

        epoch_e = time.time()
            
        if scheduler is not None:
            scheduler.step(valid_loss)

        # update the best epoch & best loss
        is_first_epoch   = (epoch == 1)
        is_best_mode_min = (mode=='min') & (best_loss > valid_loss)
        is_best_mode_max = (mode=='max') & (best_loss < valid_loss)
        if is_first_epoch | is_best_mode_min | is_best_mode_max:
            best_epoch = epoch
            best_loss = valid_loss
            best_model = model
            is_best = 1
            torch.save(best_model.state_dict(), save_model_path)
        else:
            is_best = 0
            
        # 결과물 printing
        if (verbose) & (epoch % metric_period == 0):
            mark = '*' if is_best else ' '
            epoch_str = str(epoch).zfill(len(str(epochs)))
            progress = '{}[{}/{}] tr_loss: {:.5f}, val_loss: {:.5f}, best_epoch: {}, elapsed: {:.2f}s, total: {:.2f}s, remaining: {:.2f}s'\
                .format(
                    mark,
                    epoch_str,
                    epochs,
                    np.mean(train_loss),
                    valid_loss,
                    best_epoch,
                    epoch_e-epoch_s,
                    epoch_e-start_time,
                    (epoch_e-epoch_s)*(epochs-epoch)/metric_period,
                )
            epoch_s = time.time()
            print(progress)

        # early stopping 여부를 체크. 현재 과적합 상황 추적
        if early_stopping:
            es(valid_loss, model)
            if es.early_stop:
                break

    return best_model

def validation(model, valid_loader, criterion, threshold, device): #transform_y
    model.eval()
    valid_loss = []
    valid_loss_custom = []
    with torch.no_grad():
        for X, Y in iter(valid_loader):
            X = X.float().to(device)
            Y = Y.float().to(device)
            
            output = model(X).float()
            output = (output>=threshold).float()
            
            loss = criterion(output, Y)
            #loss = torch.sqrt(loss) # MSE -> RMSE

            valid_loss.append(loss.item())

    return np.mean(valid_loss)

def predict(best_model,loader,device,threshold): #transform_y
    best_model.to(device)
    best_model.eval()
    
    true_list = []
    pred_list = []
    with torch.no_grad():
        for data,label in iter(loader):
            data = data.float().to(device)

            output = best_model(data)
            output = (output>=threshold).float()
            output = output.cpu().numpy().tolist()
            label  = label.cpu().numpy().tolist()

            #if transform_y=='log':
            #    output = np.exp(output).tolist()
            #    label  = np.exp(label).tolist()
            #elif transform_y=='sqrt':
            #    output = np.square(output).tolist()
            #    label  = np.square(label).tolist()

            true_list += label
            pred_list += output

    return true_list, pred_list

In [21]:
class CustomDataset(Dataset):
    def __init__(self,X,y,infer_mode):
        self.infer_mode = infer_mode

        self.X_list = []
        self.y_list = []
        self.X_list = torch.Tensor(X.values)
        self.y_list = torch.Tensor(y.values)
        # for i in range(len(X)):
        #     seq_x = X.iloc[i].values
        #     seq_y = y.iloc[i]
        #     self.X_list.append(torch.Tensor(seq_x))
        #     self.y_list.append(seq_y)

    def __getitem__(self, index):
        data  = self.X_list[index]
        label = self.y_list[index]
        if self.infer_mode == False:
            return data, label
        else:
            return data

    def __len__(self):
        return len(self.X_list)

In [22]:
batch_size  = 32
num_workers = 0

train_dataset = CustomDataset(X=X_train, y=y_train, infer_mode=False)
train_loader  = DataLoader(train_dataset, batch_size = batch_size, shuffle=True, num_workers=num_workers)

valid_dataset = CustomDataset(X=X_valid, y=y_valid, infer_mode=False)
valid_loader  = DataLoader(valid_dataset, batch_size = batch_size, shuffle=True, num_workers=num_workers)

In [23]:
[y for x,y in train_loader][0]

tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0.])

In [24]:
[x for x,y in train_loader][0].size()

torch.Size([32, 12])

In [25]:
class Network(nn.Module):
    def __init__(self,n_input,n_hidden,n_output):
        super(Network, self).__init__()
        # self.activation = nn.ReLU()
        # self.layers = nn.Sequential(
        #     # 1 stack
        #     nn.Linear(n_input, n_hidden),
        #     #nn.BatchNorm1d(n_hidden),
        #     self.activation,
        #     # fully connect
        #     nn.Linear(n_hidden, n_output),
        #     #nn.BatchNorm1d(n_output),
        #     self.activation,
        #     nn.Sigmoid()
        #ReLU() )
        self.layers = nn.Sequential(
            # 1 stack
            nn.Linear(n_input, n_output),
            #nn.Dropout(0.2),
            nn.Sigmoid()
        )
    def forward(self, x):
        x = self.layers(x)
        x = x.view(-1)
        return x

In [26]:
# model = CustomModel()
model = Network(n_input=12,n_hidden=50,n_output=1)
print(model.parameters)

<bound method Module.parameters of Network(
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=1, bias=True)
    (1): Sigmoid()
  )
)>


In [27]:
class CustomLoss(nn.Module):
    def __init__(self,device):
        super(CustomLoss, self).__init__()

    def forward(self, output, target):
        threshold = 0.5
        print(output)
        output = torch.where(output>=threshold,1,0).float()
        loss_function = nn.BCELoss().to(device)
        loss = loss_function(output,target).float()
        loss.requires_grad = True
        return loss

In [31]:
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = 0.01) #, weight_decay=1e-5)
# optimizer = torch.optim.SGD(params = model.parameters(), lr = 1e-2, momentum=0.9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=2, threshold_mode='abs',min_lr=1e-7, verbose=False)

device = 'cpu'
threshold = 0.5

best_model = train(
    model,
    optimizer=optimizer,
    train_loader=train_loader,
    valid_loader=valid_loader,
    scheduler=None,#scheduler,
    loss_function=nn.BCELoss(),
    threshold=threshold,
    mode='min',
    device=device,
    early_stopping=False,
    early_stopping_patience=10,
    early_stopping_verbose=False,
    metric_period=1,
    epochs=30,
    verbose=True,
    print_shape=False,
    save_model_path = './mc/best_model.pt',
)

*[01/30] tr_loss: 88.79967, val_loss: 89.58884, best_epoch: 1, elapsed: 0.30s, total: 0.30s, remaining: 8.56s
 [02/30] tr_loss: 88.79967, val_loss: 89.58884, best_epoch: 1, elapsed: 0.26s, total: 0.56s, remaining: 7.41s
*[03/30] tr_loss: 88.79967, val_loss: 89.58333, best_epoch: 3, elapsed: 0.26s, total: 0.83s, remaining: 7.14s
 [04/30] tr_loss: 88.79967, val_loss: 89.58333, best_epoch: 3, elapsed: 0.26s, total: 1.09s, remaining: 6.88s
 [05/30] tr_loss: 88.79967, val_loss: 89.58884, best_epoch: 3, elapsed: 0.27s, total: 1.36s, remaining: 6.64s
 [06/30] tr_loss: 88.79967, val_loss: 89.58333, best_epoch: 3, elapsed: 0.26s, total: 1.62s, remaining: 6.35s
 [07/30] tr_loss: 88.79967, val_loss: 89.58884, best_epoch: 3, elapsed: 0.26s, total: 1.88s, remaining: 6.02s
 [08/30] tr_loss: 88.79967, val_loss: 89.59436, best_epoch: 3, elapsed: 0.26s, total: 2.15s, remaining: 5.81s
 [09/30] tr_loss: 88.79967, val_loss: 89.59987, best_epoch: 3, elapsed: 0.27s, total: 2.41s, remaining: 5.57s
 [10/30] t

In [None]:
y_true, y_pred = predict(best_model,train_loader,threshold=threshold,device='cpu')
y_true, y_pred = pd.Series(y_true).astype(int), pd.Series(y_pred).astype(int)

print('> Macro F1 Score: {:.4f}'.format(f1_score(y_pred,y_true,average='macro')))
print('> Cross Table:')
pd.crosstab(y_true,y_pred)

In [None]:
pred = prediction.float().detach().numpy().flatten()
true = y_train.values.copy()

pd.crosstab(pred,true)

In [None]:
def f1_loss(y_pred, y_true):
    epsilon = 1e-7
    assert y_pred.ndim == 2
    assert y_true.ndim == 1
    y_true = F.one_hot(y_true, 2).to(torch.float32)
    y_pred = F.softmax(y_pred, dim=1)

    tp = (y_true * y_pred).sum(dim=0).to(torch.float32)
    tn = ((1 - y_true) * (1 - y_pred)).sum(dim=0).to(torch.float32)
    fp = ((1 - y_true) * y_pred).sum(dim=0).to(torch.float32)
    fn = (y_true * (1 - y_pred)).sum(dim=0).to(torch.float32)

    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)

    f1 = 2* (precision*recall) / (precision + recall + epsilon)
    f1 = f1.clamp(min=epsilon, max=1-epsilon)
    return 1 - f1.mean()

In [None]:
# 도구 임포트
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# 데이터 로딩
_x = torch.FloatTensor(X_train.values).float()
_y = torch.FloatTensor(y_train.values.reshape(-1,1))

# 클래스로 모델 생성
class BinaryClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(12, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        return self.sigmoid(self.linear(x))

# 모델 객체 생성
model = BinaryClassifier()

# optimizer 설정
optimizer = optim.SGD(model.parameters(), lr=1)

hypothesis_list = []
nb_epochs = 1000
for epoch in range(nb_epochs + 1):

    # H(x) 계산
    hypothesis = model(_x)

    # cost 계산
    # cost = F.binary_cross_entropy(hypothesis, _y)
    cost = f1_loss(hypothesis,_y.view(-1).type(torch.int64))

    # cost로 H(x) 개선
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

    # 20번마다 로그 출력
    if epoch % 100 == 0:
        prediction = hypothesis >= torch.FloatTensor([0.7]) # 예측값이 0.5를 넘으면 True로 간주
        correct_prediction = prediction.float() == _y # 실제값과 일치하는 경우만 True로 간주
        accuracy = correct_prediction.sum().item() / len(correct_prediction) # 정확도를 계산
        print('Epoch {:4d}/{} Cost: {:.6f} Accuracy {:2.2f}%'.format( # 각 에포크마다 정확도를 출력
            epoch, nb_epochs, cost.item(), accuracy * 100,
        ))
        hypothesis_list.append(hypothesis)

In [None]:
pred = prediction.float().detach().numpy().flatten()
true = _y.detach().numpy().flatten()

pd.crosstab(pred,true)