In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import gc
import math
import random
import pickle
import pandas as pd
import numpy as np
import multiprocessing
from tqdm import tqdm

from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, sampler
import torch.optim.lr_scheduler as lr_scheduler

from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule

import shap
import catboost
from catboost import Pool, cv

pd.set_option('display.max_columns', 500)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
train = pd.read_csv("./data/train.csv").drop(columns=['father', 'mother', 'gender'])
train.drop_duplicates(subset=train.columns.tolist()[5:20], inplace=True, ignore_index=True)
test = pd.read_csv("./data/test.csv").drop(columns=['father', 'mother', 'gender'])

train.iloc[:, 1:-1] = train.iloc[:, 1:-1].astype('category')
test.iloc[:, 1:] = test.iloc[:, 1:].astype('category')

answer = np.zeros(len(test)) - 1

train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      248 non-null    object  
 1   trait   248 non-null    category
 2   SNP_01  248 non-null    category
 3   SNP_02  248 non-null    category
 4   SNP_03  248 non-null    category
 5   SNP_04  248 non-null    category
 6   SNP_05  248 non-null    category
 7   SNP_06  248 non-null    category
 8   SNP_07  248 non-null    category
 9   SNP_08  248 non-null    category
 10  SNP_09  248 non-null    category
 11  SNP_10  248 non-null    category
 12  SNP_11  248 non-null    category
 13  SNP_12  248 non-null    category
 14  SNP_13  248 non-null    category
 15  SNP_14  248 non-null    category
 16  SNP_15  248 non-null    category
 17  class   248 non-null    object  
dtypes: category(16), object(2)
memory usage: 9.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data colu

(None, None)

In [3]:
# text 형태의 categorical 변수들을 숫자형태로 변경

for i in tqdm(range(1, 15+1)) :
    target = str(i) if i >= 10 else "0"+str(i)
    try :   
        cols = sorted(train[f"SNP_{target}"].unique().tolist())  
        train[f"SNP_{target}"] = train[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))
        test[f"SNP_{target}"] = test[f"SNP_{target}"].map(lambda x : 0 if x==cols[0] else (1 if x==cols[1] else 2))
    except :
        continue

train.info(), test.info()

100%|██████████| 15/15 [00:00<00:00, 1879.17it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      248 non-null    object  
 1   trait   248 non-null    category
 2   SNP_01  248 non-null    category
 3   SNP_02  248 non-null    category
 4   SNP_03  248 non-null    category
 5   SNP_04  248 non-null    category
 6   SNP_05  248 non-null    category
 7   SNP_06  248 non-null    category
 8   SNP_07  248 non-null    category
 9   SNP_08  248 non-null    category
 10  SNP_09  248 non-null    category
 11  SNP_10  248 non-null    category
 12  SNP_11  248 non-null    category
 13  SNP_12  248 non-null    category
 14  SNP_13  248 non-null    category
 15  SNP_14  248 non-null    category
 16  SNP_15  248 non-null    category
 17  class   248 non-null    object  
dtypes: category(16), object(2)
memory usage: 9.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data colu




(None, None)

In [4]:
X, y = train.iloc[:, 1:-1].to_numpy(), train['class'].map(lambda x : 0 if x=='A' else(1 if x=='B' else 2)).values
X_test = test.iloc[:,1:].to_numpy()

# Load the data
train_data = torch.from_numpy(X)
test_data = torch.from_numpy(X_test)

# Preprocess the data by one-hot encoding the categories
one_hot_data_01 = F.one_hot(train_data[:,:1]-1, num_classes=3).view(len(X), 3).float()
one_hot_data_02 = F.one_hot(train_data[:,1:], num_classes=3).view(len(X), 3*train_data[:,1:].size(1)).float()

one_hot_train = torch.concat([one_hot_data_01, one_hot_data_02], axis=1)

one_hot_data_01 = F.one_hot(test_data[:,:1]-1, num_classes=3).view(len(X_test), 3).float()
one_hot_data_02 = F.one_hot(test_data[:,1:], num_classes=3).view(len(X_test), 3*test_data[:,1:].size(1)).float()

one_hot_test = torch.concat([one_hot_data_01, one_hot_data_02], axis=1)

one_hot_train.shape, one_hot_test.shape

(torch.Size([248, 48]), torch.Size([175, 48]))

In [5]:
random_seed=133
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = one_hot_train, train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

strategy2 = {0 : 100, 1 : 120, 2 : 110}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

data = np.concatenate([one_hot_train, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10])
label = np.concatenate([y, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10])

data.shape, label.shape

((2372, 48), (2372,))

In [6]:
random_seed = 158
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = one_hot_train, train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

strategy2 = {0 : 100, 1 : 120, 2 : 110}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

data = np.concatenate([data, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10])
label = np.concatenate([label, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10])

data.shape, label.shape

((4495, 48), (4495,))

In [7]:
random_seed = 333
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = one_hot_train, train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

strategy2 = {0 : 100, 1 : 120, 2 : 110}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

data = np.concatenate([data, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10])
label = np.concatenate([label, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10])

data.shape, label.shape

((6597, 48), (6597,))

In [8]:
random_seed = 3558
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = one_hot_train, train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

strategy2 = {0 : 100, 1 : 120, 2 : 110}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

data = np.concatenate([data, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10])
label = np.concatenate([label, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10])

data.shape, label.shape

((8711, 48), (8711,))

In [9]:
random_seed = 3885
strategy1 = {0 : 40, 1 : 70, 2 : 50}

under1 = RandomUnderSampler(sampling_strategy=strategy1, random_state=random_seed)
under2 = EditedNearestNeighbours()
under3 = RepeatedEditedNearestNeighbours()
under4 = AllKNN()
under5 = CondensedNearestNeighbour(random_state=random_seed)
under6 = OneSidedSelection(random_state=random_seed)
under7 = NeighbourhoodCleaningRule()

X, y = one_hot_train, train['class'].map(lambda x : 0 if x=='A' else (1 if x=='B' else 2)).values

X1, y1 = under1.fit_resample(X, y)
X2, y2 = under2.fit_resample(X, y)
X3, y3 = under3.fit_resample(X, y)
X4, y4 = under4.fit_resample(X, y)
X5, y5 = under5.fit_resample(X, y)
X6, y6 = under6.fit_resample(X, y)
X7, y7 = under7.fit_resample(X, y)

strategy2 = {0 : 100, 1 : 120, 2 : 110}

over1 = SMOTEN(sampling_strategy=strategy2, random_state=random_seed)
over2 = SMOTE(sampling_strategy=strategy2, random_state=random_seed)
over3 = RandomOverSampler(sampling_strategy=strategy2, random_state=random_seed)

X8, y8 = over1.fit_resample(X, y)
X9, y9 = over2.fit_resample(X, y)
X10, y10 = over3.fit_resample(X, y)

data = np.concatenate([data, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10])
label = np.concatenate([label, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10])

data.shape, label.shape

((10834, 48), (10834,))

In [10]:
df = pd.DataFrame(data=data, columns=[f"var_{x}" for x in range(data.shape[1])])
df[df >= 0.5] = 1
df[df < 0.5] = 0
df = df.astype('int')
df['class'] = label
df.drop_duplicates(inplace=True, ignore_index=True)
df

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,class
0,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1
1,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,1,1,0,0,1,0,0,2
2,0,1,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1
3,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0
4,0,1,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,2
396,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,2
397,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,2
398,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,2


In [11]:
class CategoricalTransformer(nn.Module):
    def __init__(self, num_categories, embedding_dim, hidden_dim, num_layers, num_classes, dropout_prob=0.5):
        super(CategoricalTransformer, self).__init__()
        
        # Embedding layer to map categorical features to dense embeddings
        self.embedding = nn.Embedding(num_categories, embedding_dim)
        
        # Transformer layers
        self.transformer = nn.Transformer(d_model=embedding_dim, nhead=hidden_dim, num_encoder_layers=num_layers, dropout=dropout_prob)
        
        # Fully connected layer to map transformer output to the output classes
        self.fc = nn.Linear(embedding_dim*num_categories, num_classes)

    def forward(self, x, tgt):
        # Map categorical inputs to dense embeddings using the embedding layer
        x = self.embedding(x)
        
        # Pass the embeddings through the transformer
        x = self.transformer(x, tgt)
        
        # Shape the transformer output for the fully connected layer
        x = x.view(x.size(0), -1)
        
        # Pass the transformer output through the fully connected layer
        x = self.fc(x)
        
        return x


In [34]:
num_categories = 48
embedding_dim = 32
hidden_dim = 8
num_layers = 4
num_classes = 3
num_epochs = 1000

# Instantiate the model
model = CategoricalTransformer(num_categories=num_categories, embedding_dim=embedding_dim, hidden_dim=hidden_dim, num_layers=num_layers, num_classes=num_classes)

# Move the model to the GPU if available
if torch.cuda.is_available():
    model = model.to(device)

# Define a loss function and optimizer
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters())

X, y = torch.LongTensor(df.iloc[:,:-1].values).to(device), torch.Tensor(df['class']).to(device)
y = F.one_hot((y).to(torch.int64), num_classes=3).float()
dummy_tgt = torch.zeros((len(X), num_categories, embedding_dim)).float().to(device)

# Train the model
for epoch in range(num_epochs):
    # Forward pass
    output = torch.softmax(model(X, dummy_tgt), axis=1)
    loss = loss_fn(output, y)
    
    acc = torch.mean((torch.argmax(output, axis=1) == torch.argmax(y, axis=1)).to(torch.float))
    
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 10 == 0 :
        print(f"[{epoch+1}/{num_epochs}] loss:{loss:.4f}, acc:{acc:.4f}")

[1/1000] loss:1.1087, acc:0.3475
[11/1000] loss:1.0944, acc:0.3775
[21/1000] loss:1.0970, acc:0.3625
[31/1000] loss:1.0945, acc:0.3600
[41/1000] loss:1.0867, acc:0.3875
[51/1000] loss:1.0985, acc:0.3500
[61/1000] loss:1.0930, acc:0.3825
[71/1000] loss:1.0954, acc:0.3525
[81/1000] loss:1.0958, acc:0.3525
[91/1000] loss:1.1019, acc:0.3300
[101/1000] loss:1.0978, acc:0.3500
[111/1000] loss:1.0981, acc:0.3425
[121/1000] loss:1.0988, acc:0.3250
[131/1000] loss:1.0936, acc:0.3675
[141/1000] loss:1.0948, acc:0.3500
[151/1000] loss:1.0921, acc:0.3800
[161/1000] loss:1.0855, acc:0.3950
[171/1000] loss:1.0945, acc:0.3625
[181/1000] loss:1.0952, acc:0.3925
[191/1000] loss:1.0964, acc:0.3725
[201/1000] loss:1.0919, acc:0.3525
[211/1000] loss:1.0995, acc:0.3500
[221/1000] loss:1.0893, acc:0.3700
[231/1000] loss:1.0897, acc:0.3850
[241/1000] loss:1.0904, acc:0.3825
[251/1000] loss:1.0911, acc:0.3825
[261/1000] loss:1.0957, acc:0.3500
[271/1000] loss:1.0926, acc:0.3725
[281/1000] loss:1.0898, acc:0.3

KeyboardInterrupt: 

In [37]:
class CategoricalFeedforward(nn.Module):
    def __init__(self, num_categories, embedding_dim, hidden_dim, num_classes, dropout_prob=0.5, l2_reg=0.01):
        super(CategoricalFeedforward, self).__init__()
        
        # Embedding layer to map categorical features to dense embeddings
        self.embedding = nn.Embedding(num_categories, embedding_dim)
        
        # Fully connected layers to map the embeddings to the output classes
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
        
        # Dropout layer to regularize the model
        self.dropout = nn.Dropout(dropout_prob)
        
        # L2 regularization
        self.l2_reg = l2_reg

    def forward(self, x):
        # Map categorical inputs to dense embeddings using the embedding layer
        x = self.embedding(x)
        
        # Pass the embeddings through the fully connected layers and apply dropout
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        
        return x

l2_reg=0.03

# Instantiate the model
model = CategoricalFeedforward(num_categories=num_categories, embedding_dim=embedding_dim, hidden_dim=hidden_dim, num_classes=num_classes, l2_reg=l2_reg)

# Move the model to the GPU if available
if torch.cuda.is_available():
    model = model.cuda()

# Define a loss function and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())


X, y = torch.LongTensor(df.iloc[:,:-1].values).to(device), torch.Tensor(df['class']).to(device)
y = F.one_hot((y).to(torch.int64), num_classes=3)


# Train the model
for epoch in range(num_epochs):
    # Forward pass
    output = model(X)
    loss = loss_fn(output, y)
    
    # Add L2 regularization term to the loss
    l2_reg_loss = sum(model.l2_reg * torch.sum(param ** 2) for param in model.parameters())
    loss += l2_reg_loss
    
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch % 10 == 0 :
        acc = torch.mean((torch.argmax(output, axis=1) == torch.argmax(y, axis=1)).to(torch.float))
        print(f"[{epoch+1}/{num_epochs}] loss:{loss:.4f}, acc:{acc:.4f}")


RuntimeError: The size of tensor a (3) must match the size of tensor b (400) at non-singleton dimension 1

In [39]:
output.shape

torch.Size([400, 48, 3])

In [40]:
torch.argmax(output, axis=2).shape

torch.Size([400, 48])

In [None]:
submit = pd.read_csv("submit_high1.csv")
submit['A_prob'] = np.round(pred_A, 4)
submit['B_prob'] = np.round(pred_B, 4)
submit['C_prob'] = np.round(pred_C, 4)
submit['BC_prob'] = np.round(pred_BC, 4)
# submit['class'] = submit['class'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit['total'] = np.argmax(submit[['A_prob', 'B_prob', 'C_prob']].values, axis=1)
submit['total'] = submit['total'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit

In [None]:
submit[submit['class'] != submit.total]

In [None]:
submit['total2'] = 1
submit.loc[submit.BC_prob >= 0.6, 'total2'] = 2
submit.loc[submit.A_prob >= 0.5, 'total2'] = 0
submit['total2'] = submit['total2'].map(lambda x : 'A' if x==0 else ('B' if x==1 else 'C'))
submit[submit['class'] != submit.total2]

In [None]:
submit['total'].value_counts()

In [None]:
submit['last1'] = pd.read_csv("submit_last1.csv")['class']
submit['last2'] = pd.read_csv("submit_last2.csv")['class']

submit[submit.total != submit.last2]

In [None]:
submit[submit.total2 != submit.last3]

In [None]:
submit.info()

In [None]:
df = pd.read_csv("submit_high1.csv")
df['class'] = submit['total']
df.to_csv("submit_last01.csv", index=False)
df

In [None]:
df['class'].value_counts()

In [None]:
df = pd.read_csv("submit_high1.csv")
df['class'] = submit['total2']
df.to_csv("submit_last02.csv", index=False)
df

In [None]:
df['class'].value_counts()