In [1]:
import pandas as pd
import numpy as np
import random
import datetime
import optuna
import copy
import time

from sklearn.preprocessing import RobustScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
import torch
import torch.utils.data as data_torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm

#from ydata_profiling import ProfileReport # подробный разбор признаков

In [2]:
data_patch = '.\\'

In [3]:
def stratify_data(filename):
    
    dataframe = pd.read_csv(filename).copy()
    dataframe = dataframe.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    mask = dataframe['Age'].isna()
    dataframe.loc[mask, 'Age'] = np.random.randint(10, 50, mask.sum())
    dataframe['Embarked'] = dataframe['Embarked'].fillna(dataframe['Embarked'].mode()[0])
    dataframe['Sex'] = dataframe['Sex'].map({'male':1,'female':0})
    dataframe['Embarked'] = dataframe['Embarked'].map({'S':0,'C':1, 'Q':2})
    
    survived = dataframe['Survived']
    
    features = dataframe.drop(['Survived'], axis=1)
    features = pd.get_dummies(features, columns=['Pclass', 'SibSp', 'Parch', 'Embarked'], dtype=int)
    poly = PolynomialFeatures(degree=2, include_bias=False)
    features = poly.fit_transform(features)
        
    features = RobustScaler().fit_transform(features)
    features = MinMaxScaler().fit_transform(features)
    
    data_scaled = pd.DataFrame(data=features)
        
    X_train, X_val, y_train, y_val = train_test_split(data_scaled, survived, test_size=0.2, random_state=2, stratify=survived)
    
    X_train['Survived'] = y_train
    X_val['Survived'] = y_val
    
    X_train.to_csv('train_data.csv', sep=',', index=False)
    X_val.to_csv('val_data.csv', sep=',', index=False)

In [4]:
stratify_data(data_patch + 'train.csv')

In [5]:
class TitanicDataset(data_torch.Dataset):
    
    def __init__(self, filename, Train=True):
        self.dataframe = pd.read_csv(filename).copy()
                      
        self.Train = Train
        
    def __len__(self):
        return self.dataframe.shape[0]
    
    def __getitem__(self, idx):
        if(self.Train):
            survived = self.dataframe['Survived']
            survived = np.array(survived)[idx]
            
        features = self.dataframe.drop(['Survived'], axis=1)
        features = np.array(features)[idx]
                
        if(self.Train):
            return features, survived
        else:
            return features
          
    def infoo(self):
        return self.dataframe.info()

In [6]:
train_dataset = TitanicDataset(data_patch + 'train_data.csv')
val_dataset = TitanicDataset(data_patch + 'val_data.csv')
testing_dataset = TitanicDataset(data_patch + 'test.csv', Train=False)

In [7]:
train_dataset.__getitem__(5)

(array([0.00000000e+00, 3.34003518e-01, 2.70496001e-02, 0.00000000e+00,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.13881826e-01, 2.02872001e-02,
        0.00000000e+00, 3.85714286e-01, 0.00000000e+00, 0.00000000e+00,
        3.85714286e-01, 0.00000000e+00, 0.00000000e+00, 0.000000

In [8]:
train_dataset.infoo()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Columns: 300 entries, 0 to Survived
dtypes: float64(299), int64(1)
memory usage: 1.6 MB


In [11]:
size_columns = len(next(iter(train_dataset))[0])
hidden_layer1_coeff = 15
hidden_layer2_coeff = 7
hidden_layer3_coeff = 3.5
hidden_layer4_coeff = 2
#lr = 0.01
momentum = 0.9
epochs = 10
hidden_layer1 = round(hidden_layer1_coeff * size_columns)
hidden_layer2 = round(hidden_layer2_coeff * size_columns)
hidden_layer3 = round(hidden_layer3_coeff * size_columns)
hidden_layer4 = round(hidden_layer4_coeff * size_columns)

In [10]:
class Net(nn.Module):
    def __init__(self, col, a, b, c, d, out1, out2):
        super(Net, self).__init__()
        self.b1 = nn.Sequential(
            nn.Linear(in_features=col,out_features=a),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.BatchNorm1d(a),
            nn.Linear(in_features=a,out_features=b),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.BatchNorm1d(b),
            nn.Linear(in_features=b,out_features=c),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.BatchNorm1d(c),
            nn.Linear(in_features=c,out_features=d),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.BatchNorm1d(d),
            nn.Linear(in_features=d,out_features=int(out1)),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.BatchNorm1d(int(out1)),
            nn.Linear(in_features=int(out1),out_features=out2),
            nn.ReLU(),
            nn.BatchNorm1d(out2),
            nn.Linear(in_features=out2,out_features=1),
            nn.Sigmoid()
        )
    def forward(self,x):
        x = self.b1(x)
        return x

In [16]:
#optimizer = optim.SGD(net.parameters(), lr = lr, momentum = momentum)
optimizer = optim.Adam(net.parameters(), lr = lr)
criterion = nn.BCELoss()

In [9]:
optuna.logging.set_verbosity(optuna.logging.WARN)

def objective(trial):
    since = time.time()
    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
    lr = trial.suggest_float('lr', 0.03, 1.05, step=0.03)    
    batch_size = trial.suggest_int('batch_size', 200, 400, step=100)
    out1 = trial.suggest_int('out1', 32, 128, step=4)
    out2 = trial.suggest_int('out2', 4, 32, step=2)
    
    model = Net(size_columns, hidden_layer1, hidden_layer2, hidden_layer3, hidden_layer4, out1, out2).to(dev)
       
    train_dataload = data_torch.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    val_dataload = data_torch.DataLoader(val_dataset, shuffle=True, batch_size=batch_size)
    
    # Установить случайное начальное число
    seed=42
    torch.manual_seed(seed)
           
    # Обучение
    n = 30
    for epoch in range(n):
        model.train()
        for X, y in train_dataload:
            X, y = X.to(dev), y.to(dev)
            features = len(next(iter(train_dataload))[0][0])
            X = X.view(-1, features)
            y = y.view(-1, 1)
            optimizer.zero_grad()
            output = model(X.float())
            loss = criterion(output, y.float())
            loss.backward()
            optimizer.step()
        
    
    # Проверка
    model.eval()
    correct = 0
    with torch.no_grad():
        for X, y in val_dataload:
            X, y = X.to(dev), y.to(dev)
            features = len(next(iter(val_dataload))[0][0])
            X = X.view(-1, features)
            y = y.view(-1, 1)
            output = model(X.float())
            y_pred = output.argmax(dim=1, keepdim=True)
            correct += y_pred.eq(y.view_as(y_pred)).sum().item()

        accuracy = correct / len(val_dataload.dataset)
   
    time_use = time.time() - since
        
    print('------------')
    
    print("Время на этап подбора：{:.0f}m{:.0f}s".format(time_use//60,time_use%60))
    
    return accuracy

In [12]:
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Создаем учебные объекты Optuna
study = optuna.create_study(direction='maximize')

# Оптимизация с использованием индикатора выполнения tqdm
n_trials = 2 # Кол-во попыток

study.optimize(objective, n_trials=n_trials)
#study.optimize(objective, timeout=4) # 4 минуты перебора параметров

print("Наилучшая accuracy:")
best_trial = study.best_trial
print("  accuracy: ", best_trial.value)
print("  Параметры: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

------------
Время на этап подбора：1m40s
------------
Время на этап подбора：1m36s
Наилучшая accuracy:
  accuracy:  0.6145251396648045
  Параметры: 
    lr: 0.42000000000000004
    batch_size: 300
    out1: 32
    out2: 8


In [13]:
net = Net(size_columns, hidden_layer1, hidden_layer2, hidden_layer3, hidden_layer4, study.best_trial.params['out1'], study.best_trial.params['out2'])
print(net)

Net(
  (b1): Sequential(
    (0): Linear(in_features=299, out_features=4485, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): BatchNorm1d(4485, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=4485, out_features=2093, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.5, inplace=False)
    (7): BatchNorm1d(2093, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): Linear(in_features=2093, out_features=1046, bias=True)
    (9): ReLU()
    (10): Dropout(p=0.4, inplace=False)
    (11): BatchNorm1d(1046, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): Linear(in_features=1046, out_features=598, bias=True)
    (13): ReLU()
    (14): Dropout(p=0.4, inplace=False)
    (15): BatchNorm1d(598, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (16): Linear(in_features=598, out_features=84, bias=True)
    (17): ReLU()
    (18): Dropout(p=0.3, inplace=False)
    (19): Bat

In [14]:
num_workers = 4
epochs = 30

batch_size = study.best_trial.params['batch_size']
lr = study.best_trial.params['lr']

In [15]:
train_dataload = data_torch.DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size, num_workers=num_workers)
val_dataload = data_torch.DataLoader(dataset=val_dataset, shuffle=True, batch_size=batch_size, num_workers=num_workers)
#test_dataload = data_torch.DataLoader(dataset=testing_dataset , batch_size=batch_size,shuffle=False, num_workers=num_workers)

seed=42
torch.manual_seed(seed)  

<torch._C.Generator at 0x23ee0b790b0>

net = Net(size_columns, hidden_layer1, hidden_layer2)
print(net)

In [17]:
def train(model, train_dataload, val_dataload, epochs=10, lr=0.001):
    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(dev)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()

    best_acc, best_epoch = 0, 0
    best_model_wts = copy.deepcopy(model.state_dict())
    train_loss_all, val_loss_all = [], []
    train_acc_all, val_acc_all = [], []

    since = time.time()

    for epoch in range(epochs):
        train_loss, train_corrects, train_num = 0, 0, 0
        val_loss, val_corrects, val_num = 0, 0, 0

        print(("\n" + "%11s" * 4) % ("Epoch", "GPU_mem", "train_loss", "train_acc"))
        pbar = tqdm(train_dataload,bar_format='{l_bar}{bar:15}{r_bar}')

        for X , y in pbar:
            X , y = X.to(dev), y.to(dev)
            model.train()
            output = model(X)
            loss = criterion(output,y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            y_pred = torch.argmax(output,dim=1)

            train_loss += loss.item() * X.size(0)
            train_corrects += torch.sum(y_pred == y.data).item()
            train_num += X.size(0)

            mem = f"{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G" 
            pbar.set_description(
                ("%11s" *2 +"%11.4g" *2)
                % (f"{epoch+1}/{epochs}",mem,train_loss/train_num,train_corrects/train_num)
            )

        pbar2 = tqdm(val_dataload,bar_format='{l_bar}{bar:15}{r_bar}')
        
        for X , y in pbar2:
            X , y = X.to(dev) , y.to(dev)
            model.eval()
            output = model(X)
            loss = criterion(output,y)

            y_pred = torch.argmax(output,dim=1)

            val_loss += loss.item() * X.size(0)
            val_corrects += torch.sum(y_pred == y.data).item()
            val_num += X.size(0)
            pbar2.set_description(
                (("%11s" +"%11.4g")*2)
                % ("val_loss",val_loss/val_num,"val_acc",val_corrects/val_num)
            )
        
        train_loss_all.append(train_loss / train_num)
        train_acc_all.append(train_corrects / train_num)
        val_loss_all.append(val_loss / val_num)
        val_acc_all.append(val_corrects / val_num)

        if val_acc_all[-1] > best_acc:
            best_epoch = epoch
            best_acc = val_acc_all[-1]
            best_model_wts = copy.deepcopy(model.state_dict())

    time_use = time.time() - since
    print("Общее время обучения：{:.0f}m{:.0f}s".format(time_use//60,time_use%60))
    print('best acc:',best_acc,'best epoch:',best_epoch)
    
    torch.save(best_model_wts,"./best_model.csv")

    train_process = pd.DataFrame(data={
        "epoch":range(epochs),
        "train_loss_all":train_loss_all,
        "val_loss_all":val_loss_all,
        "train_acc_all":train_acc_all,
        "val_acc_all":val_acc_all
    })
    return train_process

In [None]:
model = Net(size_columns, hidden_layer1, hidden_layer2, hidden_layer3, hidden_layer4, study.best_trial.params['out1'], study.best_trial.params['out2'])
train_process = train(model, train_dataload, val_dataload, epochs=epochs, lr=lr)

In [None]:
def calc_test_acc(model, data):
    correct = 0
    total = 0
    with torch.no_grad():
        for X, y in data:
            X = X.squeeze()
            y = y.squeeze()
            prob_y = model(X.float())
            y_pred = output.argmax(dim=1, keepdim=True)
            total += len(y_pred)
            correct += (y_pred == y).sum().item()
    return round((correct/total)*100, 3)

In [None]:
calc_test_acc(net, train_dataload)

In [None]:
calc_test_acc(net, val_dataload)