In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from sklearn.metrics import classification_report, f1_score, accuracy_score, balanced_accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer, KBinsDiscretizer, LabelEncoder, StandardScaler, normalize
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from catboost import CatBoostClassifier

from skorch.classifier import NeuralNetClassifier
from skorch.callbacks import EpochScoring

from functools import lru_cache
from faiss import IndexLSH, IndexFlatL2, IndexIVFFlat, index_factory, omp_set_num_threads
omp_set_num_threads(16) # faiss parallelism

### Loading data

In [2]:
train_df = pd.read_csv('data/my_data_train_val.csv')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5586011 entries, 0 to 5586010
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   x          float64
 1   y          float64
 2   z          float64
 3   intensity  float64
 4   class      int64  
dtypes: float64(4), int64(1)
memory usage: 213.1 MB


In [3]:
train_df.describe().round(2)

Unnamed: 0,x,y,z,intensity,class
count,5586011.0,5586011.0,5586011.0,5586011.0,5586011.0
mean,-237.41,2.78,-5.86,0.29,47.79
std,120.91,14.37,2.32,0.14,23.82
min,-536.61,-88.92,-29.01,0.0,0.0
25%,-326.85,-7.12,-7.67,0.23,40.0
50%,-223.92,1.63,-5.74,0.31,48.0
75%,-137.77,12.85,-4.05,0.37,72.0
max,-50.0,94.58,4.34,0.99,80.0


In [4]:
scaler = MinMaxScaler()
_train_df = pd.DataFrame(scaler.fit_transform(train_df.drop('class', axis=1)), columns=scaler.feature_names_in_)
train_df = pd.concat([_train_df, train_df['class']], axis=1)
train_df.describe().round(3)

Unnamed: 0,x,y,z,intensity,class
count,5586011.0,5586011.0,5586011.0,5586011.0,5586011.0
mean,0.615,0.5,0.694,0.295,47.786
std,0.248,0.078,0.069,0.14,23.824
min,0.0,0.0,0.0,0.0,0.0
25%,0.431,0.446,0.64,0.232,40.0
50%,0.643,0.493,0.698,0.313,48.0
75%,0.82,0.555,0.748,0.374,72.0
max,1.0,1.0,1.0,1.0,80.0


In [5]:
test_df = pd.read_csv('data/my_data_test.csv')
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 827513 entries, 0 to 827512
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   x          827513 non-null  float64
 1   y          827513 non-null  float64
 2   z          827513 non-null  float64
 3   intensity  827513 non-null  float64
dtypes: float64(4)
memory usage: 25.3 MB


In [6]:
test_df.describe().round(2)

Unnamed: 0,x,y,z,intensity
count,827513.0,827513.0,827513.0,827513.0
mean,-29.8,6.23,-1.88,0.31
std,12.13,12.17,0.93,0.14
min,-50.0,-59.65,-10.7,0.0
25%,-40.04,-3.86,-2.54,0.24
50%,-29.43,3.27,-2.18,0.32
75%,-19.22,15.05,-1.34,0.38
max,-10.0,69.48,3.86,0.99


In [7]:
test_df = pd.DataFrame(scaler.transform(test_df), columns=scaler.feature_names_in_)
test_df.describe().round(3)

Unnamed: 0,x,y,z,intensity
count,827513.0,827513.0,827513.0,827513.0
mean,1.042,0.519,0.813,0.309
std,0.025,0.066,0.028,0.143
min,1.0,0.16,0.549,0.0
25%,1.02,0.464,0.794,0.242
50%,1.042,0.502,0.804,0.323
75%,1.063,0.567,0.83,0.384
max,1.082,0.863,0.985,1.0


In [8]:
# fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(12, 8))
# sns.histplot(train_df.drop('class', axis=1).sample(frac=0.4), ax=axes[0]).set_title('Train')
# sns.histplot(test_df, ax=axes[1]).set_title('Test')
# # plt.xlim([-300, 50])
# plt.show()

In [9]:
label_encoder = LabelEncoder()
label_encoder.fit(train_df['class'])
label_encoder.classes_

array([ 0, 10, 40, 44, 48, 50, 51, 70, 71, 72, 80])

### Modeling

In [10]:
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import Dataset
from torch.utils.data import random_split, DataLoader
from torch.utils.data.sampler import RandomSampler, BatchSampler
import torch.nn.functional as F
torch.cuda.is_available()

True

In [11]:
use_cols = ['x', 'y', 'z']

In [12]:
class PointCloudDataset(Dataset):

    def __init__(self, df, use_cols, n_neigh, use_neighs, q_bins, is_train = True):

        self.df = df
        self.is_train = is_train
        self.use_cols = use_cols
        if is_train:
            self.target = label_encoder.transform(self.df['class'])
            self.df = self.df.drop(['class'], axis=1)

        self.nn_index = index_factory(len(self.use_cols), "HNSW86,Flat")
        self.nn_index.parallel_mode = 1
        self.nn_index.train(self.df[self.use_cols])
        self.nn_index.add(self.df[self.use_cols])

        self.n_neigh = n_neigh + 1
        self.use_neighs = use_neighs

        sample = self.nn_index.search(self.df[use_cols].iloc[np.random.randint(0, len(self.df), len(self.df) // 10)], self.n_neigh)[0][:, 1:]
        self.quantilies  = torch.tensor(np.quantile(sample, np.linspace(0, 1, q_bins)))

        # self.left_rule = torch.tensor(quantilies)
        # self.right_rule = torch.tensor(np.concatenate([quantilies, [[[np.inf]]]])[1:])


    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        batch = self.df.iloc[idx]
        nbrs_dists, _ = self.nn_index.search(batch[self.use_cols], self.n_neigh)
        nbrs_dists = torch.tensor(nbrs_dists[:, 1:])

        quantile_features = torch.stack([torch.bitwise_and(nbrs_dists > q1, nbrs_dists < q2).sum(1) for q1, q2 in zip(self.quantilies, self.quantilies[1:])], dim=1)
        intensity = torch.tensor((batch['intensity'] > 0.35).values, dtype=torch.float32).unsqueeze(1)
        feat_array = torch.concat([nbrs_dists[:, :self.use_neighs], quantile_features, intensity], dim=1)

        # quantile_features = torch.bitwise_and(nbrs_dists > self.left_rule, nbrs_dists < self.right_rule).sum(2).T
        # intensity = torch.tensor((batch['intensity'] > 0.35).values, dtype=torch.float32).unsqueeze(1)
        # feat_array = torch.concat([nbrs_dists[:, :self.use_neighs], quantile_features, intensity], dim=1)

        if self.is_train:
            return feat_array, torch.tensor(self.target[idx])
        else:
            return feat_array

    @property
    def n_features_in(self):
        return self.use_neighs + self.quantilies.shape[0]

In [13]:
split_ratio = 0.85
train_set = train_df.sample(frac=split_ratio)
val_set = train_df.iloc[~train_df.index.isin(train_set.index)]

In [14]:
train_dataset = PointCloudDataset(df=train_set, use_cols=use_cols, is_train=True, q_bins=15, n_neigh=200, use_neighs=10)
len(train_dataset)

4468809

In [15]:
val_dataset = PointCloudDataset(df=val_set, use_cols=use_cols, is_train=True, q_bins=15, n_neigh=200, use_neighs=10)
len(val_dataset)

1117202

In [16]:
class TorchMLPClassifier(nn.Module):

    def __init__(self,
                 input_dim,
                 out_dim,
                 dropout_p = 0.1,
                 n_layers = 3
                 ):

        super().__init__()

        hidden_dim = 2 * input_dim

        layers = [nn.Sequential(
            nn.BatchNorm1d(input_dim if i == 0 else hidden_dim),
            nn.Linear(input_dim if i == 0 else hidden_dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout_p)
        ) for i in range(n_layers)]

        self.network = nn.Sequential(
            *layers,
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, out_dim)
        )

    def forward(self, x):
        return self.network(x)

In [17]:
model = TorchMLPClassifier(input_dim=train_dataset.n_features_in, out_dim=11)
model

TorchMLPClassifier(
  (network): Sequential(
    (0): Sequential(
      (0): BatchNorm1d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=45, out_features=90, bias=True)
      (2): GELU(approximate=none)
      (3): Dropout(p=0.01, inplace=False)
    )
    (1): Sequential(
      (0): BatchNorm1d(90, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=90, out_features=90, bias=True)
      (2): GELU(approximate=none)
      (3): Dropout(p=0.01, inplace=False)
    )
    (2): Sequential(
      (0): BatchNorm1d(90, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=90, out_features=90, bias=True)
      (2): GELU(approximate=none)
      (3): Dropout(p=0.01, inplace=False)
    )
    (3): BatchNorm1d(90, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=90, out_features=11, bias=True)
  )
)

In [18]:
def validate(model, device, batch_size, criterion, p_bar):
    model.to(device)
    model.eval()

    with torch.no_grad():

        val_losses = []
        val_f1 = []
        val_acc = []

        val_sampler = BatchSampler(RandomSampler(val_dataset), batch_size=batch_size, drop_last=True)
        for b, batch_idx in enumerate(val_sampler):
            p_bar.set_description(f'Validation | Batch {b}/{len(val_sampler)} | Epoch')

            x, target = val_dataset[batch_idx]
            x, target = x.to(device), target

            preds = model(x).detach().cpu()

            val_losses += [criterion(preds, target).item()]
            val_f1 += [f1_score(target, preds.argmax(1), average='weighted')]
            val_acc += [accuracy_score(target, preds.argmax(1))]

        return np.array(val_losses).mean(), np.array(val_f1).mean(), np.array(val_acc).mean()


In [19]:
def train(model, device='cuda', batch_size=1024, epochs=10, lr=0.007):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr, weight_decay=0)
    model.to(device)

    p_bar = tqdm(enumerate(range(epochs)), total=epochs)
    for i, epoch in p_bar:
        epoch_losses = []

        model.train()
        train_sampler = BatchSampler(RandomSampler(train_dataset), batch_size=batch_size, drop_last=True)
        for b, batch_idx in enumerate(train_sampler):
            p_bar.set_description(f'Train | Batch {b}/{len(train_sampler)} | Epoch')

            x, target = train_dataset[batch_idx]
            x, target = x.to(device), target.to(device)

            preds = model(x)
            loss = criterion(preds, target)

            optimizer.zero_grad()
            loss.backward()

            clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            epoch_losses += [loss.item()]

        val_l, val_f1, val_acc = validate(model, device, batch_size, criterion, p_bar)

        print(f'Epoch {i} |'
              f' Train Loss: {round(np.array(epoch_losses).mean(), 4)} |'
              f' Val Loss: {round(val_l, 4)} |'
              f' Val F1: {round(val_f1, 4)} |'
              f' Val Acc: {round(val_acc, 4)}'
              )


In [20]:
train(model, device='cuda', epochs=30, batch_size=8096)

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch 0 | Train Loss: 1.5109 | Val Loss: 1.5725 | Val F1: 0.3735 | Val Acc: 0.4092
Epoch 1 | Train Loss: 1.493 | Val Loss: 1.5716 | Val F1: 0.3718 | Val Acc: 0.412
Epoch 2 | Train Loss: 1.4915 | Val Loss: 1.5676 | Val F1: 0.3758 | Val Acc: 0.4098
Epoch 3 | Train Loss: 1.4903 | Val Loss: 1.5845 | Val F1: 0.3728 | Val Acc: 0.403
Epoch 4 | Train Loss: 1.4898 | Val Loss: 1.5726 | Val F1: 0.3785 | Val Acc: 0.4136
Epoch 5 | Train Loss: 1.4894 | Val Loss: 1.5614 | Val F1: 0.3756 | Val Acc: 0.4162
Epoch 6 | Train Loss: 1.4889 | Val Loss: 1.5719 | Val F1: 0.3744 | Val Acc: 0.4108
Epoch 7 | Train Loss: 1.4885 | Val Loss: 1.5666 | Val F1: 0.3706 | Val Acc: 0.4143
Epoch 8 | Train Loss: 1.4886 | Val Loss: 1.5723 | Val F1: 0.3717 | Val Acc: 0.4104
Epoch 9 | Train Loss: 1.4886 | Val Loss: 1.577 | Val F1: 0.3741 | Val Acc: 0.4108
Epoch 10 | Train Loss: 1.4882 | Val Loss: 1.5704 | Val F1: 0.3754 | Val Acc: 0.4128
Epoch 11 | Train Loss: 1.488 | Val Loss: 1.5711 | Val F1: 0.3771 | Val Acc: 0.4112
Epoch 1

In [19]:
model.to('cpu')

TorchMLPClassifier(
  (network): Sequential(
    (0): Sequential(
      (0): BatchNorm1d(26, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=26, out_features=52, bias=True)
      (2): GELU(approximate=none)
      (3): Dropout(p=0.1, inplace=False)
    )
    (1): Sequential(
      (0): BatchNorm1d(52, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=52, out_features=52, bias=True)
      (2): GELU(approximate=none)
      (3): Dropout(p=0.1, inplace=False)
    )
    (2): BatchNorm1d(52, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Linear(in_features=52, out_features=11, bias=True)
  )
)

<torch.utils.data.dataset.Subset at 0x7fca7e766f70>

In [None]:
sample_sub = pd.read_csv('data/SampleSubmission.csv')
sample_sub['сlass'] = test_predict
sample_sub.to_csv('data/my_submission.csv', index=None)