In [1]:
import pandas as pd
import numpy as np
import datetime
import torch
import torch.utils.data as data_torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import torchvision

from torchvision import models
from sklearn.preprocessing import RobustScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport # подробный разбор признаков

In [2]:
data_patch = '.\\'

In [3]:
def stratify_data(filename):
    
    dataframe = pd.read_csv(filename).copy()
    dataframe = dataframe.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    mask = dataframe['Age'].isna()
    dataframe.loc[mask, 'Age'] = np.random.randint(10, 50, mask.sum())
    dataframe['Embarked'] = dataframe['Embarked'].fillna(dataframe['Embarked'].mode()[0])
    dataframe['Sex'] = dataframe['Sex'].map({'male':1,'female':0})
    dataframe['Embarked'] = dataframe['Embarked'].map({'S':0,'C':1, 'Q':2})
    
    survived = dataframe['Survived']
    #survived = pd.concat([survived, survived.copy()], ignore_index=True)
    features = dataframe.drop(['Survived'], axis=1)
    
    features = pd.get_dummies(features, columns=['Pclass', 'SibSp', 'Parch', 'Embarked'], dtype=int)
    poly = PolynomialFeatures(degree=2, include_bias=False)
    features = poly.fit_transform(features)
        
    features = RobustScaler().fit_transform(features)
    features = MinMaxScaler().fit_transform(features)
    
    data_scaled = pd.DataFrame(data=features)
    #data_scaled = pd.concat([data_scaled, data_scaled.copy()], ignore_index=True)
    
    X_train, X_val, y_train, y_val = train_test_split(data_scaled, survived, test_size=0.2, random_state=2, stratify=survived)
    
    X_train['Survived'] = y_train
    X_val['Survived'] = y_val
    
    X_train.to_csv('train_data.csv', sep=',', index=False)
    X_val.to_csv('val_data.csv', sep=',', index=False)

In [4]:
stratify_data(data_patch + 'train.csv')

In [48]:
class TitanicDataset(data_torch.Dataset):
    
    def __init__(self, filename, Train=True):
        self.dataframe = pd.read_csv(filename).copy()
        self.Train = Train
        
    def __len__(self):
        return self.dataframe.shape[0]
    
    def __getitem__(self, idx):
        if(self.Train):
            survived = self.dataframe['Survived']
            survived = np.array(survived)[idx]
            
        features = self.dataframe.drop(['Survived'], axis=1)
        features = np.array(features)[idx]
                    
        if(self.Train):
            return features, survived
        else:
            return features
          
    def infoo(self):
        return self.dataframe.info()

In [50]:
train_dataset = TitanicDataset(data_patch + 'train_data.csv')
val_dataset = TitanicDataset(data_patch + 'val_data.csv')
testing_dataset = TitanicDataset(data_patch + 'test.csv', Train=False)

In [51]:
train_dataset.__getitem__(5)

(array([0.00000000e+00, 3.34003518e-01, 2.70496001e-02, 0.00000000e+00,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.13881826e-01, 2.02872001e-02,
        0.00000000e+00, 3.85714286e-01, 0.00000000e+00, 0.00000000e+00,
        3.85714286e-01, 0.00000000e+00, 0.00000000e+00, 0.000000

In [8]:
train_dataset.infoo()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 712 entries, 0 to 711
Columns: 300 entries, 0 to Survived
dtypes: float64(299), int64(1)
memory usage: 1.6 MB


In [52]:
train_dataload = data_torch.DataLoader(train_dataset, shuffle=True, batch_size=100)
t = iter(train_dataload)
print(next(t))

[tensor([[0.0000, 0.1455, 0.0303,  ..., 0.0000, 0.0000, 1.0000],
        [1.0000, 0.1078, 0.0401,  ..., 0.0000, 0.0000, 0.0000],
        [1.0000, 0.0450, 0.0610,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [1.0000, 0.7110, 0.0241,  ..., 0.0000, 0.0000, 1.0000],
        [0.0000, 0.5225, 0.4441,  ..., 1.0000, 0.0000, 0.0000],
        [1.0000, 0.1958, 0.0205,  ..., 0.0000, 0.0000, 0.0000]],
       dtype=torch.float64), tensor([1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
        1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
        0, 0, 1, 0])]


In [10]:
val_dataload = data_torch.DataLoader(val_dataset, shuffle=True, batch_size=100)
t = iter(val_dataload)
print(next(t))

[tensor([[[[[0.0000]]],


         [[[0.1958]]],


         [[[0.0497]]],


         ...,


         [[[0.0000]]],


         [[[0.0000]]],


         [[[0.0000]]]],



        [[[[1.0000]]],


         [[[0.3089]]],


         [[[0.1082]]],


         ...,


         [[[1.0000]]],


         [[[0.0000]]],


         [[[0.0000]]]],



        [[[[1.0000]]],


         [[[0.7487]]],


         [[[0.0518]]],


         ...,


         [[[0.0000]]],


         [[[0.0000]]],


         [[[0.0000]]]],



        ...,



        [[[[1.0000]]],


         [[[0.2963]]],


         [[[0.0314]]],


         ...,


         [[[0.0000]]],


         [[[0.0000]]],


         [[[0.0000]]]],



        [[[[0.0000]]],


         [[[0.3968]]],


         [[[0.1489]]],


         ...,


         [[[1.0000]]],


         [[[0.0000]]],


         [[[0.0000]]]],



        [[[[1.0000]]],


         [[[0.2712]]],


         [[[0.0141]]],


         ...,


         [[[1.0000]]],


         [[[0.0000]]],


  

In [11]:
size_columns = len(next(iter(train_dataload))[0][0])
hidden_layer1_coeff = 15
hidden_layer2_coeff = 7
hidden_layer3_coeff = 3.5
hidden_layer4_coeff = 2
lr = 0.01
momentum = 0.9
epochs = 10
hidden_layer1 = round(hidden_layer1_coeff * size_columns)
hidden_layer2 = round(hidden_layer2_coeff * size_columns)
hidden_layer3 = round(hidden_layer3_coeff * size_columns)
hidden_layer4 = round(hidden_layer4_coeff * size_columns)

In [43]:
weights = models.Swin_S_Weights.DEFAULT
transfer_model = models.swin_s(weights=weights)
transfer_model.features
transfer_model.head = nn.Linear(transfer_model.head.in_features, out_features=1)
transfer_model.features[0][0] = nn.Conv2d(size_columns, 96, kernel_size=(4, 4), stride=(4, 4))

transfer_model.features[0] = nn.Sequential(
    nn.Linear(size_columns, 96),
    nn.ReLU())

In [13]:
transfer_model.features[0][1]

Permute()

In [44]:
# Замораживаем веса предобученных слоев
for param in transfer_model.parameters():
    param.requires_grad = False

# Обучаем только новые полносвязные слои
transfer_model.head.weight.requires_grad = True
transfer_model.head.bias.requires_grad = True
transfer_model.features[0][0].weight.requires_grad = True
transfer_model.features[0][0].bias.requires_grad = True

In [45]:
#optimizer = optim.SGD(transfer_model.parameters(), lr = lr, momentum = momentum)
optimizer = optim.Adam(transfer_model.parameters(), lr = lr)
criterion = nn.BCELoss()

In [46]:
def train(model, training_data, optimizer, criterion, epochs=10):
    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(dev)
    losses = []
    accuracy = []
    for epoch in range(epochs):
        running_loss = 0.0
        batch = 0
        for  X, y in training_data:
            X , y = X.to(dev) , y.to(dev)
            
            features = len(next(iter(training_data))[0][0])
            X = X.view(-1, features)
            y = y.view(-1, 1)
            
            optimizer.zero_grad()
            
            prob_y = model(X.float())
            
            y_pred = (prob_y > torch.Tensor([0.5])).long().squeeze()
            
            accuracy.append((y_pred==y).sum().item()/len(y_pred))
            
            loss = criterion(prob_y, y.float())
            
            loss.backward()
            
            optimizer.step()
            
            running_loss += loss.item()
            
            batch += 1
        # средние потери на каждой эпохе    
        losses.append(running_loss/batch)
        if epoch%5==0:
            print('epochs {} done'.format(epoch+5))
        
    print("Fin")
    return (losses, accuracy)

In [53]:
losses, accuracy = train(transfer_model, train_dataload, optimizer, criterion, epochs=epochs)

RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [100, 299]

In [None]:
def plot_loss_acc(losses, accuracy):
    figure = plt.figure(figsize=(9,3))
    plt.subplot(121, title="losses")
    plt.plot(range(1, len(losses)+1), losses)
    plt.xlabel("epoch №")
    plt.ylabel("loss")
    plt.subplot(122, title="accuracy")
    plt.plot(range(1, len(accuracy)+1), accuracy)
    plt.xlabel("batch №")
    plt.ylabel("accuracy")

In [None]:
plot_loss_acc(losses, accuracy)

MODEL_PATH = '.\\titanic_transfer_model.pth'
torch.save(transfer_model.state_dict(), MODEL_PATH)

In [None]:
def calc_test_acc(model, data):
    correct = 0
    total = 0
    with torch.no_grad():
        for X, y in data:
            X = X.squeeze()
            y = y.squeeze()
            prob_y = model(X.float())
            y_pred = (prob_y > torch.Tensor([0.5])).long().squeeze()
            total += len(y_pred)
            correct += (y_pred == y).sum().item()
    return round((correct/total)*100, 3)

In [None]:
calc_test_acc(transfer_model, train_dataload)

In [None]:
calc_test_acc(transfer_model, val_dataload)

In [14]:
import pytorch_lightning as pl
dir(pl)

['Callback',
 'LIGHTNING_LOGO',
 'LightningDataModule',
 'LightningModule',
 'Trainer',
 '__about__',
 '__all__',
 '__annotations__',
 '__author__',
 '__author_email__',
 '__builtins__',
 '__cached__',
 '__copyright__',
 '__doc__',
 '__docs__',
 '__docs_url__',
 '__file__',
 '__homepage__',
 '__license__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_graveyard',
 '_logger',
 '_root_logger',
 'accelerators',
 'callbacks',
 'cli_lightning_logo',
 'core',
 'loggers',
 'logging',
 'loops',
 'module_available',
 'os',
 'overrides',
 'plugins',
 'profilers',
 'pytorch_lightning',
 'seed_everything',
 'strategies',
 'trainer',
 'tuner',
 'utilities']

In [5]:
import torch
import pytorch_lightning as pl
from pl.models.base import LightningModule
from pl.callbacks import EarlyStopping, ModelCheckpoint
from pl.trainer import Trainer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# Загрузка данных
data = pd.read_csv('titanic.csv')
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
X = data[features]
y = data['Survived']

# Преобразование данных в тензоры
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# Использование предобученной модели из PyTorch Lightning
class TitanicModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
        self.model.fc = nn.Linear(self.model.fc.in_features, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.model(x)
        x = self.sigmoid(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.BCELoss()(y_hat, y.unsqueeze(1))
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = nn.BCELoss()(y_hat, y.unsqueeze(1))
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        y_pred = (y_hat > 0.5).float()
        accuracy = (y_pred == y).float().mean()
        return accuracy

model = TitanicModel()
trainer = Trainer(accelerator='gpu', devices=1, max_epochs=10, callbacks=[EarlyStopping(monitor='val_loss')])
trainer.fit(model, (X_train, y_train), (X_test, y_test))

# В этом примере, мы используем предварительно обученную модель resnet18 из PyTorch, которая была обучена на наборе данных ImageNet. Мы адаптируем последний полносвязный слой модели для решения задачи классификации выживших пассажиров на Титанике.

# Вы можете также попробовать использовать другие предобученные модели, доступные в PyTorch, такие как vgg16, densenet121 или mobilenet_v2, в зависимости от характеристик ваших данных.

# Использование предобученных моделей может значительно ускорить и упростить процесс обучения, особенно если у вас нет больших наборов данных или вычислительных ресурсов для обучения модели с нуля.

ModuleNotFoundError: No module named 'pl'