In [None]:
'''import os
import random
import warnings
from sklearn.model_selection import train_test_split
from custom_dataset import CustomDataSet
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor

warnings.filterwarnings(action='ignore')
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
'''
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from custom_dataset import CustomDataSet
'''
                            speech_file_to_array_fn as sfaf,
                            collate_fn,
                            create_data_loader,
                            validation,
                            train'''
from transformers import (Wav2Vec2FeatureExtractor,
                          Wav2Vec2Model,
                          Wav2Vec2Config,
                          Wav2Vec2ConformerForSequenceClassification,
                          AutoModelForAudioClassification)
import evaluate
import librosa
import random
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence 
from torch.utils.data import Dataset, DataLoader
from sklearn.tree import DecisionTreeClassifier
# from huggingface_hub import notebook_login
from datasets import load_dataset, Audio
import warnings
warnings.filterwarnings(action='ignore')
os.environ['CUDA_LAUNCH_BLOCKING']='1'

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import evaluate
import librosa
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from datasets import load_dataset, Audio
from sklearn.metrics import accuracy_score
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import StandardScaler
# from lightgbm import LGBMClassifier

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
CFG = {
    'SR':16000,
    'N_MFCC':32, # Melspectrogram 벡터를 추출할 개수
    'SEED':42
}

In [None]:
CFG = {
    'SR':16_000,
    'SEED':42,
    'BATCH_SIZE':8, # out of Memory가 발생하면 줄여주세요
    'TOTAL_BATCH_SIZE':32, # 원하는 batch size
    'EPOCHS':1,
    'LR':1e-4,
}

In [None]:
MODEL_NAME = "facebook/wav2vec2-base"

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

In [None]:
train_df = pd.read_csv('./train.csv')

In [None]:
train_df, valid_df = train_test_split(train_df, test_size=0.2, random_state=CFG['SEED'])

In [None]:
train_df.reset_index(drop=True, inplace=True)
valid_df.reset_index(drop=True, inplace=True)

In [None]:
def speech_file_to_array_fn(df):
    feature = []
    for path in tqdm(df['path']):
        speech_array, _ = librosa.load(path, sr=CFG['SR'])
        feature.append(speech_array)
    return feature

In [None]:
train_x = speech_file_to_array_fn(train_df)
valid_x = speech_file_to_array_fn(valid_df)

In [None]:
processor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)

class CustomDataSet(torch.utils.data.Dataset):
    def __init__(self, x, y, processor):
        self.x = x
        self.y = y
        self.processor = processor

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        input_values = self.processor(self.x[idx], sampling_rate=CFG['SR'], return_tensors="pt", padding=True).input_values
        if self.y is not None:
            return input_values.squeeze(), self.y[idx]
        else:
            return input_values.squeeze()

In [None]:
def collate_fn(batch):
    x, y = zip(*batch)
    x = pad_sequence([torch.tensor(xi) for xi in x], batch_first=True)
    y = pad_sequence([torch.tensor([yi]) for yi in y], batch_first=True)  # Convert scalar targets to 1D tensors
    return x, y


In [None]:
def create_data_loader(dataset, batch_size, shuffle, collate_fn, num_workers=0):
    return DataLoader(dataset,
                      batch_size=batch_size,
                      shuffle=shuffle,
                      collate_fn=collate_fn,
                      num_workers=num_workers
                      )

train_dataset = CustomDataSet(train_x, train_df['label'], processor)
valid_dataset = CustomDataSet(valid_x, valid_df['label'], processor)

train_loader = create_data_loader(train_dataset, CFG['BATCH_SIZE'], False, collate_fn, 16)
valid_loader = create_data_loader(valid_dataset, CFG['BATCH_SIZE'], False, collate_fn, 16)

In [None]:
audio_model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME)

In [None]:
class BaseModel(torch.nn.Module):
    def __init__(self):
        super(BaseModel, self).__init__()
        self.model = audio_model
        self.model.classifier = nn.Identity()
        self.classifier = nn.Linear(256, 8)

    def forward(self, x):
        output = self.model(x)
        output = self.classifier(output.logits)
        return output

In [None]:
def validation(model, valid_loader, creterion):
    model.eval()
    val_loss = []

    total, correct = 0, 0
    test_loss = 0

    with torch.no_grad():
        for x, y in tqdm(iter(valid_loader)):
            x = x.to(device)
            y = y.flatten().to(device)

            output = model(x)
            loss = creterion(output, y)

            val_loss.append(loss.item())

            test_loss += loss.item()
            _, predicted = torch.max(output, 1)
            total += y.size(0)
            correct += predicted.eq(y).cpu().sum()

    accuracy = correct / total

    avg_loss = np.mean(val_loss)

    return avg_loss, accuracy

In [None]:
def train(model, train_loader, valid_loader, optimizer, scheduler):
    accumulation_step = int(CFG['TOTAL_BATCH_SIZE'] / CFG['BATCH_SIZE'])
    model.to(device)
    creterion = nn.CrossEntropyLoss().to(device)

    best_model = None
    best_acc = 0

    for epoch in range(1, CFG['EPOCHS']+1):
        train_loss = []
        model.train()
        for i, (x, y) in enumerate(tqdm(train_loader)):
            x = x.to(device)
            y = y.flatten().to(device)

            optimizer.zero_grad()
            
            output = model(x)
            loss = creterion(output, y)
            loss.backward()

            if (i+1) % accumulation_step == 0:
                optimizer.step()
                optimizer.zero_grad()

            train_loss.append(loss.item())

        avg_loss = np.mean(train_loss)
        valid_loss, valid_acc = validation(model, valid_loader, creterion)

        if scheduler is not None:
            scheduler.step(valid_acc)

        if valid_acc > best_acc:
            best_acc = valid_acc
            best_model = model

        print(f'epoch:[{epoch}] train loss:[{avg_loss:.5f}] valid_loss:[{valid_loss:.5f}] valid_acc:[{valid_acc:.5f}]')
    
    print(f'best_acc:{best_acc:.5f}')

    return best_model

In [None]:
model = BaseModel()

optimizer = torch.optim.Adam(model.parameters(), lr=CFG['LR'])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3, verbose=True)

infer_model = train(model, train_loader, valid_loader, optimizer, scheduler)

In [None]:
test_df = pd.read_csv('./test.csv')

In [None]:
def collate_fn_test(batch):
    x = pad_sequence([torch.tensor(xi) for xi in batch], batch_first=True)
    return x

In [None]:
test_x = speech_file_to_array_fn(test_df)

In [None]:
test_dataset = CustomDataSet(test_x, y=None, processor=processor)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, collate_fn=collate_fn_test)

In [None]:
def inference(model, test_loader):
    model.eval()
    preds = []

    with torch.no_grad():
        for x in tqdm(iter(test_loader)):
            x = x.to(device)

            output = model(x)

            preds += output.argmax(-1).detach().cpu().numpy().tolist()

    return preds

In [None]:
preds = inference(infer_model, test_loader)

In [None]:
submission = pd.read_csv('./sample_submission.csv')
submission['label'] = preds
submission.to_csv('./baseline_submission.csv', index=False)

In [3]:
train_df = pd.read_csv('./train.csv')
print(train_df.info())
print()
test_df = pd.read_csv('./test.csv')
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5001 non-null   object
 1   path    5001 non-null   object
 2   label   5001 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 117.3+ KB
None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      1881 non-null   object
 1   path    1881 non-null   object
dtypes: object(2)
memory usage: 29.5+ KB
None


In [4]:
# Folder Locations
dataset = "./"

In [5]:
def get_mfcc_feature(df):
    features = []
    for path in tqdm(df['path']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=CFG['SR'])
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])
        y_feature = []
        # 추출된 MFCC들의 평균을 Feature로 사용
        for e in mfcc:
            y_feature.append(np.mean(e))
        features.append(y_feature)

    mfcc_df = pd.DataFrame(features, columns=['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)])
    return mfcc_df


In [6]:
train_x = get_mfcc_feature(train_df)
print(train_x)
test_x = get_mfcc_feature(test_df)
print(test_x)

  0%|          | 0/5001 [00:00<?, ?it/s]

          mfcc_1      mfcc_2     mfcc_3     mfcc_4     mfcc_5    mfcc_6  \
0    -414.755737  110.100639  46.699074  23.939814  14.766221  4.820827   
1    -399.769531   83.051300  55.473316  31.782587  22.040754  0.985082   
2    -341.145081   97.399071  38.274349  19.811539   0.731027  0.838704   
3    -376.963715  118.961670  34.490349  24.178417  -1.065604 -1.613391   
4    -352.863220  117.553337  29.948687  31.094315   5.406391 -5.591998   
...          ...         ...        ...        ...        ...       ...   
4996 -416.181305  112.938484  47.294231  23.111433  16.048231  6.795750   
4997 -237.811432   72.207787   3.371584  14.646128 -10.546066  7.671333   
4998 -368.228119  103.887871  40.081417  21.763754  14.120455  5.778781   
4999 -407.114288  103.880676  44.980690  23.265039  13.279220  6.904113   
5000 -359.754608   89.241714  20.517361  24.043827  16.926319 -7.636394   

        mfcc_7     mfcc_8    mfcc_9    mfcc_10  ...   mfcc_23   mfcc_24  \
0     1.445079  -0.92615

  0%|          | 0/1881 [00:00<?, ?it/s]

          mfcc_1      mfcc_2     mfcc_3     mfcc_4     mfcc_5    mfcc_6  \
0    -335.757324  125.215431  22.145767  14.351713  -1.045251  0.567860   
1    -295.973053   92.839684  24.976181  22.831310 -10.278670  7.804742   
2    -444.395996  109.385201  55.236771  26.486050  12.487712  9.069915   
3    -384.600220  112.463974  47.454556  17.336460  13.556947  4.677102   
4    -273.304077   97.817047  12.370095  24.591480   3.248469 -9.987856   
...          ...         ...        ...        ...        ...       ...   
1876 -250.254913   69.372955  17.328987  12.005389  -2.463175 -7.652928   
1877 -348.593842  107.067047  27.683287  17.207047   2.634121  1.812742   
1878 -295.658112   99.606911  -9.669126   4.676853   4.752311 -5.689676   
1879 -450.677094  122.865677  46.413559  21.616255  13.785479  5.433998   
1880 -366.914154  113.387276  43.263287  15.806440   8.695131  6.917760   

        mfcc_7     mfcc_8    mfcc_9    mfcc_10  ...   mfcc_23   mfcc_24  \
0    -0.666742  -8.11483

In [7]:
print(train_x.info())
print('\n \n \n')
print(test_x.info())
train_y = train_df['label']
print(train_y)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 32 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   mfcc_1   5001 non-null   float32
 1   mfcc_2   5001 non-null   float32
 2   mfcc_3   5001 non-null   float32
 3   mfcc_4   5001 non-null   float32
 4   mfcc_5   5001 non-null   float32
 5   mfcc_6   5001 non-null   float32
 6   mfcc_7   5001 non-null   float32
 7   mfcc_8   5001 non-null   float32
 8   mfcc_9   5001 non-null   float32
 9   mfcc_10  5001 non-null   float32
 10  mfcc_11  5001 non-null   float32
 11  mfcc_12  5001 non-null   float32
 12  mfcc_13  5001 non-null   float32
 13  mfcc_14  5001 non-null   float32
 14  mfcc_15  5001 non-null   float32
 15  mfcc_16  5001 non-null   float32
 16  mfcc_17  5001 non-null   float32
 17  mfcc_18  5001 non-null   float32
 18  mfcc_19  5001 non-null   float32
 19  mfcc_20  5001 non-null   float32
 20  mfcc_21  5001 non-null   float32
 21  mfcc_22  5001 

In [8]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.2, random_state=CFG['SEED'])
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(4000, 32) (1001, 32) (4000,) (1001,)


In [36]:
result_files = glob.glob(dataset + "Results/*.csv")

max_temp = []
for file in result_files:
    filename = file.split(".")[1]
    filename = filename.split("/")[2]
    df = pd.read_csv(file)
    train_acc = pd.DataFrame(df["0"])
    val_acc = pd.DataFrame(df["1"])
    train_max = train_acc.idxmax()
    val_max = val_acc.idxmax()
    if train_max[-1] == val_max[-1]:
        max_temp.append([filename, train_acc.iloc[train_max[-1]][-1], val_acc.iloc[val_max[-1]][-1]])

max_temp_df = pd.DataFrame(max_temp)
max_temp_df = max_temp_df.rename(columns={0:'filename', 1:"train_acc", 2:"val_acc"})

filename = pd.DataFrame(max_temp_df['filename'])
train_acc = pd.DataFrame(max_temp_df['train_acc'])
val_acc = pd.DataFrame(max_temp_df['val_acc'])
train_max = train_acc.idxmax()
val_max = val_acc.idxmax()
if train_max[-1] == val_max[-1]:
    print(filename.iloc[train_max], train_acc.iloc[train_max], val_acc.iloc[val_max])
    val_max_filepath = dataset + "Results/" + filename.iloc[val_max].iat[0,0] +".csv"
    maxfile = pd.read_csv(filepath)
    max_max_train = pd.DataFrame(maxfile["0"])
    max_max_val= pd.DataFrame(maxfile["1"])
    train_max_max = max_max_train.idxmax()[-1]
    val_max_max = max_max_val.idxmax()[-1]
    if train_max_max == 0: # Random Forest
        rf = RandomForestClassifier(max_depth=maxfile["max_depth"].iat[0],
                                    n_estimators=int(maxfile["n_estimators"].iat[0]),
                                    min_samples_split=2,
                                    max_features=maxfile["max_features"].iat[0],
                                    n_jobs=-1,
                                    random_state=CFG['SEED']
                                   )
        rf.fit(X_train, y_train)
        X_test = pd.get_dummies(data=test_x)
        rf_pred = rf.predict(X_test)
        rf_submission = pd.read_csv(dataset + 'sample_submission.csv')
        rf_submission['label'] = rf_pred
        rf_submission.to_csv(dataset + "rf_submission.csv", index=False)
    elif train_max_max == 1: # Decision Tree
        dt = DecisionTreeClassifier(max_depth=maxfile["max_depth"].iat[1],
                                    min_samples_split=2,
                                    max_features=maxfile["max_features"].iat[1],
                                    random_state=CFG['SEED']
                                   )
        dt.fit(X_train, y_train)
        X_test = pd.get_dummies(data=test_x)
        dt_pred = dt.predict(X_test)
        dt_submission = pd.read_csv(dataset + 'sample_submission.csv')
        dt_submission['label'] = dt_pred
        dt_submission.to_csv(dataset + "dt_submission.csv", index=False)
    elif train_max_max == 2: # XG Boost
        xgboost = XGBClassifier(max_depth=maxfile['max_depth'].iat[2],
                                n_estimators=int(maxfile["n_estimators"].iat[2]),
                                grow_policy='depthwise',
                                n_jobs=-1,
                                random_state=CFG['SEED'],
                                tree_method='auto'
                               )
        xgboost.fit(X_train, y_train)
        X_test = pd.get_dummies(data=test_x)
        xgboost_pred = xgboost.predict(X_test)
        xgboost_submission = pd.read_csv(dataset + 'sample_submission.csv')
        xgboost_submission['label'] = xgboost_pred
        xgboost_submission.to_csv(dataset + "xgboost_submission.csv", index=False)
    else:
#         lgbm = LGBMClassifier(max_depth=maxfile['max_depth'].iat[3],
#                             n_estimators=int(maxfile["n_estimators"].iat[3]),
#                             max_features=maxfile["max_features"].iat[3],
#                             n_jobs=-1,
#                             num_leaves=int(maxfile["num_leaves"].iat[3]),
#                             random_state=CFG['SEED']
#                             )
#         lgbm.fit(X_train, y_train)
#         X_test = pd.get_dummies(data=test_x)
#         lgbm_pred = lgbm.predict(X_test)
#         lgbm_submission = pd.read_csv(dataset + 'sample_submission.csv')
#         lgbm_submission['label'] = lgbm_pred
#         lgbm_submission.to_csv(dataset + "lgbm_max_submission.csv", index=False)
        print("This is LGBM")
else:
    print(f"filename.iloc[train_max] = {filename.iloc[train_max]}, \n \n train_acc.iloc[train_max] = {train_acc.iloc[train_max]}")
    print()
    print()
#     print(f"filename.iloc[val_max][1] = {filename.iloc[val_max][1]}")
    print(f"filename.iloc[train_max].info() = {filename.iloc[train_max].info()}")
    train_max_filepath = dataset + "Results/" + filename.iloc[train_max].iat[0,0] +".csv"
    print(train_max_filepath)
    train_maxfile = pd.read_csv(train_max_filepath)
    val_max_train = pd.DataFrame(train_maxfile["0"])
    val_max_val= pd.DataFrame(train_maxfile["1"])
    train_max_max = val_max_train.idxmax()[-1]
    val_max_max = val_max_val.idxmax()[-1]
    if train_max_max == 0: # Random Forest
        rf = RandomForestClassifier(max_depth=train_maxfile["max_depth"].iat[0],
                                    n_estimators=int(train_maxfile["n_estimators"].iat[0]),
                                    min_samples_split=2,
                                    max_features=train_maxfile["max_features"].iat[0],
                                    n_jobs=-1,
                                    random_state=CFG['SEED']
                                   )
        rf.fit(X_train, y_train)
        X_test = pd.get_dummies(data=test_x)
        rf_pred = rf.predict(X_test)
        rf_submission = pd.read_csv(dataset + 'sample_submission.csv')
        rf_submission['label'] = rf_pred
        rf_submission.to_csv(dataset + "rf_max_submission.csv", index=False)
    elif train_max_max == 1: # Decision Tree
        dt = DecisionTreeClassifier(max_depth=train_maxfile["max_depth"].iat[1],
                                    min_samples_split=2,
                                    max_features=train_maxfile["max_features"].iat[1],
                                    random_state=CFG['SEED']
                                   )
        dt.fit(X_train, y_train)
        X_test = pd.get_dummies(data=test_x)
        dt_pred = dt.predict(X_test)
        dt_submission = pd.read_csv(dataset + 'sample_submission.csv')
        dt_submission['label'] = dt_pred
        dt_submission.to_csv(dataset + "dt_max_submission.csv", index=False)
    elif train_max_max == 2: # XG Boost
        xgboost = XGBClassifier(max_depth=train_maxfile['max_depth'].iat[2],
                                n_estimators=int(train_maxfile["n_estimators"].iat[2]),
                                grow_policy='depthwise',
                                n_jobs=-1,
                                random_state=CFG['SEED'],
                                tree_method='auto'
                               )
        xgboost.fit(X_train, y_train)
        X_test = pd.get_dummies(data=test_x)
        xgboost_pred = xgboost.predict(X_test)
        xgboost_submission = pd.read_csv(dataset + 'sample_submission.csv')
        xgboost_submission['label'] = xgboost_pred
        xgboost_submission.to_csv(dataset + "xgboost_max_submission.csv", index=False)
    else:
#         lgbm = LGBMClassifier(max_depth=val_maxfile['max_depth'].iat[3],
#                             n_estimators=int(val_maxfile["n_estimators"].iat[3]),
#                             max_features=val_maxfile["max_features"].iat[3],
#                             n_jobs=-1,
#                             num_leaves=int(val_maxfile["num_leaves"].iat[3]),
#                             random_state=CFG['SEED']
#                             )
#         lgbm.fit(X_train, y_train)
#         X_test = pd.get_dummies(data=test_x)
#         lgbm_pred = lgbm.predict(X_test)
#         lgbm_submission = pd.read_csv(dataset + 'sample_submission.csv')
#         lgbm_submission['label'] = lgbm_pred
#         lgbm_submission.to_csv(dataset + "lgbm_max_submission.csv", index=False)
        print("This is LGBM")
    

filename.iloc[train_max] =         filename
222  Results5819, 
 
 train_acc.iloc[train_max] =      train_acc
222      0.452


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 222 to 222
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1 non-null      object
dtypes: object(1)
memory usage: 16.0+ bytes
filename.iloc[val_max].info() = None
./Results/Results5819.csv
This is LGBM


In [38]:
min_temp = []
for file in result_files:
    filename = file.split(".")[1]
    filename = filename.split("/")[2]
    df = pd.read_csv(file)
    train_acc = pd.DataFrame(df["0"])
    val_acc = pd.DataFrame(df["1"])
    train_min = train_acc.idxmin()
    val_min = val_acc.idxmin()
    if train_min[-1] == val_min[-1]:
        min_temp.append([filename, train_acc.iloc[train_min[-1]][-1], val_acc.iloc[val_min[-1]][-1]])

min_temp_df = pd.DataFrame(min_temp)
min_temp_df = min_temp_df.rename(columns={0:'filename', 1:"train_acc", 2:"val_acc"})

filename = pd.DataFrame(min_temp_df['filename'])
train_acc = pd.DataFrame(min_temp_df['train_acc'])
val_acc = pd.DataFrame(min_temp_df['val_acc'])
train_min = train_acc.idxmin()
val_min = val_acc.idxmin()
if train_min[-1] == val_min[-1]:
#     print(filename.iloc[train_min], train_acc.iloc[train_min], val_acc.iloc[val_min])
    val_min_filepath = dataset + "Results/" + filename.iloc[val_min].iat[0,0] +".csv"
    minfile = pd.read_csv(filepath)
    min_min_train = pd.DataFrame(minfile["0"])
    min_min_val= pd.DataFrame(minfile["1"])
    train_min_min = min_min_train.idxmin()[-1]
    val_min_min = min_min_val.idxmin()[-1]
    if train_min_min == 0: # Random Forest
        rf = RandomForestClassifier(max_depth=minfile["max_depth"].iat[0],
                                    n_estimators=int(minfile["n_estimators"].iat[0]),
                                    min_samples_split=2,
                                    max_features=minfile["max_features"].iat[0],
                                    n_jobs=-1,
                                    random_state=CFG['SEED']
                                   )
        rf.fit(X_train, y_train)
        X_test = pd.get_dummies(data=test_x)
        rf_pred = rf.predict(X_test)
        rf_submission = pd.read_csv(dataset + 'sample_submission.csv')
        rf_submission['label'] = rf_pred
        rf_submission.to_csv(dataset + "rf_min_submission.csv", index=False)
    elif train_min_min == 1: # Decision Tree
        dt = DecisionTreeClassifier(max_depth=minfile["max_depth"].iat[1],
                                    min_samples_split=2,
                                    max_features=minfile["max_features"].iat[1],
                                    random_state=CFG['SEED']
                                   )
        dt.fit(X_train, y_train)
        X_test = pd.get_dummies(data=test_x)
        dt_pred = dt.predict(X_test)
        dt_submission = pd.read_csv(dataset + 'sample_submission.csv')
        dt_submission['label'] = dt_pred
        dt_submission.to_csv(dataset + "dt_min_submission.csv", index=False)
    elif train_min_min == 2: # XG Boost
        xgboost = XGBClassifier(max_depth=minfile['max_depth'].iat[2],
                                n_estimators=int(minfile["n_estimators"].iat[2]),
                                grow_policy='depthwise',
                                n_jobs=-1,
                                random_state=CFG['SEED'],
                                tree_method='auto'
                               )
        xgboost.fit(X_train, y_train)
        X_test = pd.get_dummies(data=test_x)
        xgboost_pred = xgboost.predict(X_test)
        xgboost_submission = pd.read_csv(dataset + 'sample_submission.csv')
        xgboost_submission['label'] = xgboost_pred
        xgboost_submission.to_csv(dataset + "xgboost_min_submission.csv", index=False)
    else:
#         lgbm = LGBMClassifier(max_depth=minfile['max_depth'].iat[3],
#                             n_estimators=int(minfile["n_estimators"].iat[3]),
#                             max_features=minfile["max_features"].iat[3],
#                             n_jobs=-1,
#                             num_leaves=int(minfile["num_leaves"].iat[3]),
#                             random_state=CFG['SEED']
#                             )
#         lgbm.fit(X_train, y_train)
#         X_test = pd.get_dummies(data=test_x)
#         lgbm_pred = lgbm.predict(X_test)
#         lgbm_submission = pd.read_csv(dataset + 'sample_submission.csv')
#         lgbm_submission['label'] = lgbm_pred
#         lgbm_submission.to_csv(dataset + "lgbm_min_submission.csv", index=False)
        print("This is LGBM")
else:
    print(filename.iloc[train_min], train_acc.iloc[train_min])
    val_min_filepath = dataset + "Results/" + filename.iloc[train_min].iat[0,0] +".csv"
    val_minfile = pd.read_csv(val_min_filepath)
    val_min_train = pd.DataFrame(val_minfile["0"])
    val_min_val= pd.DataFrame(val_minfile["1"])
    train_min_min = val_min_train.idxmin()[-1]
    val_min_min = val_min_val.idxmin()[-1]
    if train_min_min == 0: # Random Forest
        rf = RandomForestClassifier(max_depth=val_minfile["max_depth"].iat[0],
                                        n_estimators=int(val_minfile["n_estimators"].iat[0]),
                                    min_samples_split=2,
                                    max_features=val_minfile["max_features"].iat[0],
                                    n_jobs=-1,
                                    random_state=CFG['SEED']
                                   )
        rf.fit(X_train, y_train)
        X_test = pd.get_dummies(data=test_x)
        rf_pred = rf.predict(X_test)
        rf_submission = pd.read_csv(dataset + 'sample_submission.csv')
        rf_submission['label'] = rf_pred
        rf_submission.to_csv(dataset + "rf_min_submission.csv", index=False)
    elif train_min_min == 1: # Decision Tree
        dt = DecisionTreeClassifier(max_depth=val_minfile["max_depth"].iat[1],
                                    min_samples_split=2,
                                    max_features=val_minfile["max_features"].iat[1],
                                    random_state=CFG['SEED']
                                   )
        dt.fit(X_train, y_train)
        X_test = pd.get_dummies(data=test_x)
        dt_pred = dt.predict(X_test)
        dt_submission = pd.read_csv(dataset + 'sample_submission.csv')
        dt_submission['label'] = dt_pred
        dt_submission.to_csv(dataset + "dt_min_submission.csv", index=False)
    elif train_min_min == 2: # XG Boost
        xgboost = XGBClassifier(max_depth=val_minfile['max_depth'].iat[2],
                                n_estimators=int(val_minfile["n_estimators"].iat[2]),
                                grow_policy='depthwise',
                                n_jobs=-1,
                                random_state=CFG['SEED'],
                                tree_method='auto'
                               )
        xgboost.fit(X_train, y_train)
        X_test = pd.get_dummies(data=test_x)
        xgboost_pred = xgboost.predict(X_test)
        xgboost_submission = pd.read_csv(dataset + 'sample_submission.csv')
        xgboost_submission['label'] = xgboost_pred
        xgboost_submission.to_csv(dataset + "xgboost_min_submission.csv", index=False)
    else:
#         lgbm = LGBMClassifier(max_depth=val_minfile['max_depth'].iat[3],
#                             n_estimators=int(val_minfile["n_estimators"].iat[3]),
#                             max_features=val_minfile["max_features"].iat[3],
#                             n_jobs=-1,
#                             num_leaves=int(val_minfile["num_leaves"].iat[3]),
#                             random_state=CFG['SEED']
#                             )
#         lgbm.fit(X_train, y_train)
#         X_test = pd.get_dummies(data=test_x)
#         lgbm_pred = lgbm.predict(X_test)
#         lgbm_submission = pd.read_csv(dataset + 'sample_submission.csv')
#         lgbm_submission['label'] = lgbm_pred
#         lgbm_submission.to_csv(dataset + "lgbm_min_submission.csv", index=False)
        print("This is LGBM")
    

       filename
102  Results712      train_acc
102      0.171
