# <span style = "color:#00994c">**import**

In [35]:
import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import random
import os
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
import torch
test_path = "./test.csv"
train_path = "./train.csv"

In [36]:
# %pip install plotly (jupyter notebook)
from plotly.offline import iplot
import plotly.graph_objs as go
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
#pio.renderers.default = 'iframe_connected'
#pio.renderers.default = "vscode"
pio.renderers.default = "plotly_mimetype+notebook"

# <span style = "color:#00994c">**Data**

In [37]:
train = pd.read_csv(train_path).drop(columns = ["id"])
train_len = len(train)
test = pd.read_csv(test_path)
id_test = test["id"]
test = pd.read_csv(test_path).drop(columns = ["id"])

In [38]:
dataset = pd.concat([train,test],axis=0)
dataset = dataset.drop(columns = ["father","mother","gender"])

In [39]:
dataset.head(5)

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15,class
0,2,G G,A G,A A,G A,C A,A A,A A,G G,A A,G G,A G,A A,A A,A A,A A,B
1,2,A G,A G,C A,A A,A A,A G,A A,G A,A A,A G,A A,G A,G G,A A,A A,C
2,2,G G,G G,A A,G A,C C,G G,A A,G A,G A,A G,A A,A A,A A,A A,A A,B
3,1,A A,G G,A A,G A,A A,G G,G G,A A,G G,A G,G G,G G,G G,A A,G G,A
4,2,G G,G G,C C,A A,C C,A A,A A,A A,A A,G G,A A,A A,A G,A A,G A,C


# <span style = "color:#00994c">**Preprocessing**

In [40]:
_t = []
for val in dataset.SNP_01 == "G G":
    if val == True:
        _t.append(1)
    else:
        _t.append(0)
dataset["has01GG"] = _t

_t = []
for val in dataset.SNP_02 == "A A":
    if val == True:
        _t.append(1)
    else:
        _t.append(0)
dataset["has02AA"] = _t

In [41]:
def create_col(dataset,col,value):
    _t = []
    for val in dataset[col] == value:
        if val == True:
            _t.append(1)
        else:
            _t.append(0)
    
    col_name_base = "has"+col[-2:]
    value_name = ""
    for chr in value:
        if chr != " ":
            value_name+=chr
    col_name = col_name_base+value_name
    #print(col_name)
    dataset[col_name] = _t

    return dataset

dataset = create_col(dataset,"SNP_03","A A")
dataset = create_col(dataset,"SNP_04","G G")
dataset = create_col(dataset,"SNP_05","C C")
dataset = create_col(dataset,"SNP_06","A A")
dataset = create_col(dataset,"SNP_07","A A")
dataset = create_col(dataset,"SNP_07","G G")
dataset = create_col(dataset,"SNP_08","G G")

dataset = create_col(dataset,"SNP_09","A A")
dataset = create_col(dataset,"SNP_09","G G")
dataset = create_col(dataset,"SNP_11","A A")

dataset = create_col(dataset,"SNP_12","A A")
dataset = create_col(dataset,"SNP_12","G G")

dataset = create_col(dataset,"SNP_13","A A")
dataset = create_col(dataset,"SNP_14","A A")

In [42]:
#one-hot encoding for distance base algorithm
dataset_ohe = pd.get_dummies(dataset,columns = dataset.columns.drop("class"),drop_first=True) #multicollinearity를 막기위한 drop_first 옵션
train_ohe = dataset_ohe[:train_len].copy()
test_ohe = dataset_ohe[train_len:].copy().drop(columns="class")

class_map = {"A":0,"B":1,"C":2}
train_ohe["class"]=train_ohe["class"].map(class_map).astype(int)
X_train_ohe = train_ohe.drop(columns = "class")
Y_train_ohe = train_ohe["class"] #X와 통일성을 위해 ohe로 일단 이름지음

In [43]:
Y_train_ohe[:5]

0    1
1    2
2    1
3    0
4    2
Name: class, dtype: int32

In [44]:
import torch
import torch.nn as nn


In [45]:
X_train_ohe = torch.from_numpy(X_train_ohe.values).float()
#y_train_ohe = torch.from_numpy(pd.get_dummies(Y_train_ohe).values).float()
Y_train_ohe = torch.from_numpy(Y_train_ohe.values).long()

## 탐지한 이상치 제외

In [46]:
outliers = pd.read_csv("./outlierdetect.csv").drop(columns = "Unnamed: 0")
normal_index = ~outliers.isoutlier

In [47]:
X_train_ohe = X_train_ohe[normal_index];Y_train_ohe = Y_train_ohe[normal_index]

In [48]:
X_train_ohe.shape

torch.Size([243, 47])

# <span style = "color:#00994c">**Modeling**


In [49]:
class mynet(nn.Module):
    def __init__(self,in_features,dropout_p,l1_out=64):
        super().__init__()
        self.linr1 = torch.nn.Linear(in_features,l1_out)
        self.relu1 = torch.nn.ReLU()
        self.d1 = torch.nn.Dropout(p=dropout_p)
        self.b1 = torch.nn.BatchNorm1d(l1_out)
        # 256
        
        self.linr2 = torch.nn.Linear(l1_out,l1_out//2)
        self.relu2 = torch.nn.ReLU()
        self.d2 = torch.nn.Dropout(p=dropout_p)
        self.b2 = torch.nn.BatchNorm1d(l1_out//2)
        # 128

        self.linr3 = torch.nn.Linear(l1_out//2,l1_out//4)
        self.relu3 = torch.nn.ReLU()
        self.d3 = torch.nn.Dropout(p=dropout_p)
        self.b3 = torch.nn.BatchNorm1d(l1_out//4)
        # 64
        self.linr4 = torch.nn.Linear(l1_out//4,l1_out//8)
        self.relu4 = torch.nn.ReLU()
        self.d4 = torch.nn.Dropout(p=dropout_p)
        self.b4 = torch.nn.BatchNorm1d(l1_out//8)
        # 32
        self.linr5 = torch.nn.Linear(l1_out//8,l1_out//16)
        self.relu5 = torch.nn.ReLU()
        self.d5 = torch.nn.Dropout(p=dropout_p)
        self.b5 = torch.nn.BatchNorm1d(l1_out//16)
        #16
        self.linr6 = torch.nn.Linear(l1_out//16,3)

    def forward(self,x):
        out = self.b1(self.d1(self.relu1(self.linr1(x))))
        out = self.b2(self.d2(self.relu2(self.linr2(out))))
        out = self.b3(self.d3(self.relu3(self.linr3(out))))
        out = self.b4(self.d4(self.relu4(self.linr4(out))))
        out = self.b5(self.d5(self.relu5(self.linr5(out))))
        out = self.linr6(out)
        #out = self.b3(self.d3(self.relu3(self.linr3(x))))
        return out

In [50]:
from sklearn.model_selection import StratifiedKFold
import random
epochs_list = [i for i in range(700,1500,20)]
weight_decay = np.linspace(0.1,0.01,500).tolist()
lr = np.linspace(1e-2,1e-5,500).tolist()
hidden_nodes = [i for i in range(100,300)]
dropout_p = np.linspace(0.1,0.3,1000).tolist()
rs = [i for i in range(0,500)]

In [51]:
def dl_cv(epochs,weight_decay,learning_rate,hidden1_nodes,dropout_p,rs):

    try_number=0
    skf = StratifiedKFold(n_splits=5,shuffle=True)
    skf.get_n_splits(X_train_ohe,Y_train_ohe)
    
    while True:
        try_number+=1
        print(f'try:{try_number}...')
        train_accs = []
        val_accs = []
        hdly1 = random.sample(hidden1_nodes,1)[0]
        lr = random.sample(learning_rate,1)[0]
        wght_decay = random.sample(weight_decay,1)[0]
        epoch = random.sample(epochs,1)[0]
        drop_p = random.sample(dropout_p,1)[0]
        r_seed = random.sample(rs,1)[0]
        print(f'lr:{lr} wght_decay:{wght_decay} epochs:{epoch} hidden_l1nodes:{hdly1} dropout_prob:{drop_p}')
        for train_index,valid_index in skf.split(X_train_ohe,Y_train_ohe): 
            torch.manual_seed(r_seed)
            net = mynet(47,drop_p,l1_out=hdly1)
            optimizer = torch.optim.Adam(net.parameters(),lr=lr,weight_decay = wght_decay)        
            loss_fn = torch.nn.CrossEntropyLoss()
            #KFold
            train_index = train_index.tolist();valid_index = valid_index.tolist()
            X_tr = X_train_ohe[train_index,:];y_tr = Y_train_ohe[train_index]
            X_tst = X_train_ohe[valid_index,:];y_valid = Y_train_ohe[valid_index]

            #fitting
            for ep in range(epoch):
                net.train()
                #1 yhat
                yhat = net(X_tr)
                #2 loss
                loss = loss_fn(yhat,y_tr)
                if ep % 50 == 0:
                    pass
                    #print(loss)
                #3 derivative
                loss.backward()
                #4 update
                optimizer.step()
                optimizer.zero_grad()
            train_yhat = torch.argmax(net(X_tr),dim=1)
            train_acc = torch.mean((train_yhat==y_tr).float())
            print("trainacc : ",train_acc)
            train_accs.append(train_acc)
            net.eval()
            with torch.no_grad():
                val_yhat = torch.argmax(net(X_tst),dim=1)
                val_acc = torch.mean((val_yhat == y_valid).float()).tolist()
                print("validacc : ",val_acc)
                val_accs.append(val_acc)
            if val_acc < 0.95:
                break

        valid_accuracy = torch.mean(torch.tensor(val_accs))
        print(f"K-Fold train accuracy {torch.mean(torch.tensor(train_accs))}")
        print(f"K-Fold valid accuracy {torch.mean(torch.tensor(val_accs))}")
        print("==========================================================")
        if valid_accuracy > 0.98:
            path = "./model{}.pth".format(try_number)
            torch.save(net,path)
t = dl_cv(epochs_list,weight_decay,lr,hidden_nodes,dropout_p,rs)

try:1...
lr:0.0016116032064128257 wght_decay:0.03182364729458918 epochs:1020 hidden_l1nodes:204 dropout_prob:0.15445445445445447
trainacc :  tensor(0.9948)
validacc :  0.9387755393981934
K-Fold train accuracy 0.9948453903198242
K-Fold valid accuracy 0.9387755393981934
try:2...
lr:0.006436432865731463 wght_decay:0.019919839679358717 epochs:1020 hidden_l1nodes:289 dropout_prob:0.1998998998998999
trainacc :  tensor(0.9948)
validacc :  0.9795918464660645
trainacc :  tensor(0.9948)
validacc :  0.9591836929321289
trainacc :  tensor(0.9897)
validacc :  0.9387755393981934
K-Fold train accuracy 0.993127167224884
K-Fold valid accuracy 0.9591836929321289
try:3...
lr:0.0023523446893787573 wght_decay:0.08647294589178357 epochs:920 hidden_l1nodes:254 dropout_prob:0.10900900900900902
trainacc :  tensor(1.)
validacc :  0.9591836929321289
trainacc :  tensor(1.)
validacc :  0.9591836929321289
trainacc :  tensor(0.9948)
validacc :  0.9591836929321289
trainacc :  tensor(1.)
validacc :  0.9583333134651184


KeyboardInterrupt: 

In [52]:
test_ohe

Unnamed: 0,trait_2,SNP_01_A G,SNP_01_G G,SNP_02_A G,SNP_02_G G,SNP_03_C A,SNP_03_C C,SNP_04_G A,SNP_04_G G,SNP_05_C A,...,has07AA_1,has07GG_1,has08GG_1,has09AA_1,has09GG_1,has11AA_1,has12AA_1,has12GG_1,has13AA_1,has14AA_1
0,0,1,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,0,1,1,0,0,1,0,1,0,...,1,0,0,1,0,0,1,0,1,1
2,1,0,1,1,0,0,0,0,0,1,...,1,0,0,1,0,1,0,0,0,1
3,1,0,1,1,0,1,0,0,0,0,...,1,0,0,1,0,1,0,0,0,1
4,0,0,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,1,1,0,0,1,0,1,0,0,1,...,1,0,1,1,0,0,1,0,1,1
171,1,0,1,0,0,0,0,0,0,1,...,1,0,0,1,0,1,1,0,0,1
172,1,0,1,0,0,0,0,0,0,1,...,1,0,0,1,0,0,1,0,0,1
173,1,1,0,0,1,1,0,1,0,0,...,1,0,0,1,0,0,1,0,1,1


# Submission

In [60]:
mymodel = torch.load("./model268.pth")

In [61]:
test_predict = torch.argmax(mymodel(torch.from_numpy(test_ohe.values).float()),dim=1)

In [63]:
result = pd.concat([pd.Series(id_test),pd.Series(test_predict)],axis=1)
result.columns = ["id","class"]
class_map_inv = {0:"A",1:"B",2:"C"}
result["class"] = result["class"].map(class_map_inv)

In [64]:
result

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,B
2,TEST_002,C
3,TEST_003,B
4,TEST_004,A
...,...,...
170,TEST_170,B
171,TEST_171,C
172,TEST_172,C
173,TEST_173,B


In [None]:
result.to_csv("./submission_dl2.csv",index=False)

# <span style = "color:#00994c">**참고링크**


[링크1](https://dodonam.tistory.com/301)<br>
[링크2](https://yamalab.tistory.com/116)<br>