<a href="https://colab.research.google.com/github/isadays/Embeddings/blob/main/TestingEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
df = pd.read_csv('Clean_Dataset.csv')

In [4]:
df = df.drop('Unnamed: 0', axis=1)

In [5]:
df.describe()

Unnamed: 0,duration,days_left,price
count,39480.0,39480.0,39480.0
mean,10.852466,24.982295,6439.796505
std,7.175039,13.50455,3705.879401
min,1.92,1.0,1998.0
25%,5.5,14.0,4020.0
50%,9.25,24.0,5724.0
75%,14.83,36.0,7425.0
max,39.67,49.0,35145.0


In [6]:
df.columns

Index(['airline', 'flight', 'source_city', 'departure_time', 'stops',
       'arrival_time', 'destination_city', 'class', 'duration', 'days_left',
       'price'],
      dtype='object')

In [7]:
categorical_cols = ['airline', 'flight', 'source_city', 'departure_time',
                    'stops', 'arrival_time', 'destination_city', 'class']
numerical_cols = ['duration', 'days_left']
target_col = 'price'


In [8]:
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}
for col in categorical_cols:
    df[col] = label_encoders[col].transform(df[col])

scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

class FlightDataset(Dataset):
    def __init__(self, df):
        self.cats = df[categorical_cols].values
        self.nums = df[numerical_cols].values.astype(np.float32)
        self.y = df[target_col].values.astype(np.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return {
            'categorical': torch.tensor(self.cats[idx], dtype=torch.long),
            'numerical': torch.tensor(self.nums[idx], dtype=torch.float),
            'target': torch.tensor(self.y[idx], dtype=torch.float)
        }

In [20]:
dataset = FlightDataset(df)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

In [21]:
batch = next(iter(loader))

print("Categorical shape:", batch['categorical'].shape)
print("Numerical shape:", batch['numerical'].shape)
print("Target shape:", batch['target'].shape)

print("Categorical sample:", batch['categorical'][0])
print("Numerical sample:", batch['numerical'][0])
print("Target sample:", batch['target'][0])

Categorical shape: torch.Size([64, 8])
Numerical shape: torch.Size([64, 2])
Target shape: torch.Size([64])
Categorical sample: tensor([  5, 454,   0,   0,   0,   5,   0,   0])
Numerical sample: tensor([-0.4909,  0.7418])
Target sample: tensor(4496.)


In [10]:
class FlightPriceTransformer(nn.Module):
    def __init__(self, cat_cols, num_cols, cat_cardinalities, embedding_dim=32, nhead=4, num_layers=2):
        super().__init__()

        self.embeddings = nn.ModuleList([
            nn.Embedding(card, embedding_dim) for card in cat_cardinalities
        ])

        input_dim = embedding_dim * len(cat_cols) + len(num_cols)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim, nhead=nhead, dropout=0.1, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc_out = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)
        )

    def forward(self, categorical, numerical):
        embedded = [emb(categorical[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat(embedded + [numerical], dim=1)
        x = x.unsqueeze(1)
        x = self.transformer_encoder(x)
        x = x.squeeze(1)
        out = self.fc_out(x)
        return out.squeeze()


In [18]:
cat_cardinalities = [df[col].nunique() for col in categorical_cols]

num_heads = 1
len_cats = len(categorical_cols)
len_nums = len(numerical_cols)
embedding_dim = 4

model = FlightPriceTransformer(
    categorical_cols, numerical_cols, cat_cardinalities,
    embedding_dim=embedding_dim,
    nhead=num_heads
)


In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

epochs = 20
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for batch in loader:
        categorical = batch['categorical'].to(device)
        numerical = batch['numerical'].to(device)
        target = batch['target'].to(device)

        optimizer.zero_grad()
        outputs = model(categorical, numerical)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(loader)
    print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')


Epoch 1/20, Loss: nan


KeyboardInterrupt: 

In [13]:
model.eval()
sample = next(iter(loader))
categorical = sample['categorical'].to(device)
numerical = sample['numerical'].to(device)

with torch.no_grad():
    preds = model(categorical, numerical)

print(preds.cpu().numpy())


[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan]


In [14]:
print(df[numerical_cols].isnull().sum())
print(np.isinf(df[numerical_cols]).sum())


duration     1
days_left    1
dtype: int64
duration     0
days_left    0
dtype: int64
