In [1]:
import numpy as np
import pandas as pd

In [2]:
train_data = pd.read_csv('/kaggle/input/gazprom-fixed/train_data_fix_concat.csv')
test_data = pd.read_csv('/kaggle/input/gazprom-fixed/test_data_fix_concat.csv')
smiles_train = pd.read_csv('/kaggle/input/gazprom-fixed/smiles_train_set.csv')
target = smiles_train.drop_duplicates(subset=['blend_id', 'oil_property_param_value'])[['oil_property_param_value', 'blend_id']]
target = target.dropna(subset = 'oil_property_param_value')
train_data = train_data[train_data.blend_id.isin(target['blend_id'])]

In [3]:
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

comp_name_enc = LabelEncoder()
train_data['comp_name_label'] = comp_name_enc.fit_transform(train_data['component_name'])
comp_name_mapping = dict(zip(comp_name_enc.classes_, comp_name_enc.transform(comp_name_enc.classes_)))
blend_id_enc = LabelEncoder()
train_data['blend_id_label'] = blend_id_enc.fit_transform(train_data['blend_id'])
target['blend_id_label'] = blend_id_enc.transform(target['blend_id'])
blend_id_mapping = dict(zip(blend_id_enc.classes_, blend_id_enc.transform(blend_id_enc.classes_)))

comp_arr = dict()

for (comp_name, blend_id), data in train_data.groupby(['comp_name_label', 'blend_id_label']):
    try:
        comp_arr[blend_id].append(comp_name)
    except:
        comp_arr[blend_id] = [comp_name]

In [4]:
comp_arr = dict(sorted(comp_arr.items()))
for key, value in comp_arr.items():
    while len(comp_arr[key]) < 10:
        value.append(112)

In [5]:
import torch
target = target.sort_values(by='blend_id')
labels = torch.FloatTensor(target['oil_property_param_value']).view([338, 1])
inp = torch.LongTensor(list(comp_arr.values()))

train_inds = np.random.choice(np.arange(338), size=int(0.8 * 338), replace=False)
train_labels = labels[train_inds]
test_labels = labels[~train_inds]
train_input = inp[train_inds,:]
test_input = inp[~train_inds,:]

from torch.utils.data import DataLoader, TensorDataset
dataset = TensorDataset(inp, labels)
loader = DataLoader(dataset, batch_size=338)
# Assuming train_input and train_labels are tensors
train_dataset = TensorDataset(train_input, train_labels)
train_loader = DataLoader(train_dataset, batch_size=338)

# Assuming val_input and val_labels are tensors
test_dataset = TensorDataset(test_input, test_labels)
test_loader = DataLoader(test_dataset, batch_size=338)

In [6]:
import torch
from torch import nn
import torch.nn.functional as F

class EmbModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(113, 256)
        self.fc1 = nn.Linear(256, 256)
        self.fc2 = nn.Linear(256, 1)
        self.act = nn.ReLU()
        
    def forward(self, x):
        tens = self.emb(x)
        m = tens.max(dim=1)[0]
        return self.fc2(self.act(self.fc1(m)))

In [7]:
model = EmbModel()

In [8]:
from torch import optim
criterion =  nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
num_epochs = 8000
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {running_loss}')
    
    model.eval()
    with torch.no_grad():
        running_loss = 0.0
        for X_batch, y_batch in loader:
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            running_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {running_loss}')
    

Epoch 1, Loss: 73345.7890625
Epoch 1, Loss: 73328.71875
Epoch 2, Loss: 73328.71875
Epoch 2, Loss: 73307.40625
Epoch 3, Loss: 73307.40625
Epoch 3, Loss: 73278.65625
Epoch 4, Loss: 73278.65625
Epoch 4, Loss: 73242.15625
Epoch 5, Loss: 73242.15625
Epoch 5, Loss: 73197.484375
Epoch 6, Loss: 73197.484375
Epoch 6, Loss: 73144.2109375
Epoch 7, Loss: 73144.2109375
Epoch 7, Loss: 73081.8671875
Epoch 8, Loss: 73081.8671875
Epoch 8, Loss: 73009.953125
Epoch 9, Loss: 73009.953125
Epoch 9, Loss: 72927.9453125
Epoch 10, Loss: 72927.9453125
Epoch 10, Loss: 72835.3046875
Epoch 11, Loss: 72835.3046875
Epoch 11, Loss: 72731.4765625
Epoch 12, Loss: 72731.4765625
Epoch 12, Loss: 72615.859375
Epoch 13, Loss: 72615.859375
Epoch 13, Loss: 72487.859375
Epoch 14, Loss: 72487.859375
Epoch 14, Loss: 72346.8515625
Epoch 15, Loss: 72346.8515625
Epoch 15, Loss: 72192.1796875
Epoch 16, Loss: 72192.1796875
Epoch 16, Loss: 72023.1796875
Epoch 17, Loss: 72023.1796875
Epoch 17, Loss: 71839.609375
Epoch 18, Loss: 71839.6

In [9]:
from torch import optim
criterion =  nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.003)
num_epochs = 8000
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch+1}, Loss: {running_loss}')
    
    model.eval()
    with torch.no_grad():
        running_loss = 0.0
        for X_batch, y_batch in loader:
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            running_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {running_loss}')

Epoch 1, Loss: 9230.791015625
Epoch 1, Loss: 9547.6240234375
Epoch 2, Loss: 9547.6240234375
Epoch 2, Loss: 9443.9033203125
Epoch 3, Loss: 9443.9033203125
Epoch 3, Loss: 9186.74609375
Epoch 4, Loss: 9186.74609375
Epoch 4, Loss: 9344.5283203125
Epoch 5, Loss: 9344.5283203125
Epoch 5, Loss: 9459.0498046875
Epoch 6, Loss: 9459.0498046875
Epoch 6, Loss: 9392.91796875
Epoch 7, Loss: 9392.91796875
Epoch 7, Loss: 9234.51953125
Epoch 8, Loss: 9234.51953125
Epoch 8, Loss: 9184.3583984375
Epoch 9, Loss: 9184.3583984375
Epoch 9, Loss: 9291.9072265625
Epoch 10, Loss: 9291.9072265625
Epoch 10, Loss: 9303.7314453125
Epoch 11, Loss: 9303.7314453125
Epoch 11, Loss: 9229.3203125
Epoch 12, Loss: 9229.3203125
Epoch 12, Loss: 9142.8916015625
Epoch 13, Loss: 9142.8916015625
Epoch 13, Loss: 9214.05078125
Epoch 14, Loss: 9214.05078125
Epoch 14, Loss: 9241.22265625
Epoch 15, Loss: 9241.22265625
Epoch 15, Loss: 9193.8916015625
Epoch 16, Loss: 9193.8916015625
Epoch 16, Loss: 9128.0703125
Epoch 17, Loss: 9128.070

In [10]:
test_data_for_blend_id = pd.read_csv('/kaggle/input/gazprom-fixed/test_data_fix_concat.csv')
test_target = pd.DataFrame({'blend_id': test_data_for_blend_id['blend_id'].unique()})
test_comp_arr = dict()
test_data['comp_name_label'] = np.nan
test_comp_name_label = []

In [11]:
blends_label = []
for ind, row in test_data.iterrows():
    try:
        test_comp_name_label.append(comp_name_mapping[row.component_name])
    except:
        test_comp_name_label.append(np.nan)
test_data['comp_name_label'] = test_comp_name_label
test_blend_id_enc = LabelEncoder()
test_data['blend_id_label'] = test_blend_id_enc.fit_transform(test_data['blend_id'])
test_target['blend_id_label'] = test_blend_id_enc.transform(test_target['blend_id'])
test_blend_id_mapping = dict(zip(test_blend_id_enc.classes_, test_blend_id_enc.transform(test_blend_id_enc.classes_)))
        
for (comp_name, blend_id), data in test_data.groupby(['comp_name_label', 'blend_id_label']):
    try:
        test_comp_arr[blend_id].append(int(comp_name))
    except:
        test_comp_arr[blend_id] = [int(comp_name)]

for blend in list(test_data['blend_id_label']):
    if blend not in test_comp_arr:
        test_comp_arr[blend] = []

In [12]:
test_comp_arr = dict(sorted(test_comp_arr.items()))
for key, value in test_comp_arr.items():
    while len(test_comp_arr[key]) < 10:
        value.append(112)

In [13]:
import torch
test_inp = torch.LongTensor(list(test_comp_arr.values()))
from torch.utils.data import DataLoader, TensorDataset

# Assuming val_input and val_labels are tensors
test_dataset = TensorDataset(test_inp)
test_loader = DataLoader(test_dataset, batch_size=138)

In [14]:
model.eval()
outputs = []
with torch.no_grad():
        for X_batch in test_loader:
            print(X_batch)
            print(type(X_batch))
            outputs = model(X_batch[0])

[tensor([[ 20,  38,  45,  ..., 112, 112, 112],
        [ 34,  36,  69,  ..., 112, 112, 112],
        [  3,  11,  14,  ..., 112, 112, 112],
        ...,
        [ 13,  34,  41,  ..., 112, 112, 112],
        [ 34,  36,  39,  ..., 112, 112, 112],
        [  3,  15,  20,  ..., 112, 112, 112]])]
<class 'list'>


In [15]:
outputs = outputs.numpy()

In [16]:
outputs

array([[  7641.1772],
       [ -8791.6045],
       [ 12012.982 ],
       [ 98733.98  ],
       [149969.45  ],
       [ 17895.736 ],
       [ 24392.17  ],
       [ -8999.423 ],
       [  8502.547 ],
       [  7884.8496],
       [ 29160.803 ],
       [141550.52  ],
       [165969.33  ],
       [ 17850.648 ],
       [ 16477.223 ],
       [143806.98  ],
       [ 38274.082 ],
       [  5138.6533],
       [266308.06  ],
       [ 13152.065 ],
       [ 12012.982 ],
       [  9075.51  ],
       [168546.34  ],
       [ 13000.445 ],
       [ 63890.066 ],
       [ 98733.98  ],
       [ -8999.423 ],
       [164922.39  ],
       [  1706.6687],
       [189701.03  ],
       [ 65546.58  ],
       [138253.67  ],
       [-19747.361 ],
       [ 28458.99  ],
       [  5138.6533],
       [  2719.9644],
       [ 28609.617 ],
       [ -7726.918 ],
       [  1234.2559],
       [  9075.51  ],
       [ 98733.98  ],
       [ 13152.065 ],
       [  5138.6533],
       [385036.06  ],
       [161265.55  ],
       [ 1

In [17]:
outputs[outputs < 0] = 0

In [None]:
outputs

In [None]:
test_comp_arr

In [18]:
obr_test_blend_id_mapping = dict(zip(test_blend_id_enc.transform(test_blend_id_enc.classes_), test_blend_id_enc.classes_))

In [19]:
list_of_test_ids = []
for i in range(138):
    list_of_test_ids.append(obr_test_blend_id_mapping[i])

In [20]:
outputs = outputs.flatten()

In [21]:
outputs.shape

(138,)

In [22]:
out_df = pd.DataFrame({'blend_id': list_of_test_ids, 'results': outputs})

In [23]:
out_df.to_csv('256_emb_with_2_linear.csv', index=False)