In [1]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from math import ceil


# Directory that holds the prepared CSV inputs read by the loading cell.
DATA_PATH = "../data/triplet_loss_model_dataset"

# Seed the global RNGs once so that "Restart Kernel -> Run All" reproduces the
# same shuffle, train/test split and weight initialisation. pandas `sample`
# and scikit-learn `train_test_split` fall back to NumPy's global RNG when no
# `random_state` is passed; torch uses its own default generator.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Show every column when a DataFrame is displayed (the frames here are wide).
pd.set_option('display.max_columns', None)

In [2]:
# Locations of the two CSV inputs inside DATA_PATH.
parts_csv_path = DATA_PATH+"/prepared_parts_dataset.csv"
triplets_csv_path = DATA_PATH+"/triplets_dataset.csv"

# Part feature table, indexed by its 'transformed_mpn' column so that rows can
# be looked up by the part identifiers stored in the triplets.
raw_parts_features_dataset = pd.read_csv(parts_csv_path, index_col='transformed_mpn')

# Triplets table; `sample(frac=1)` returns all rows in random order (shuffle).
triplets_dataset = pd.read_csv(triplets_csv_path).sample(frac=1)

In [3]:
# Columns treated as continuous values. In this notebook they are KNN-imputed,
# standardised with a StandardScaler, and concatenated (in this order) with the
# categorical embeddings as input to the encoder network.
NUMERICAL_FEATURES = [
    'FullCounterfeitData|CounterfeitOverallRisk',
    'FullCounterfeitData|ManCounterfeitReportsCount',
    'FullCounterfeitData|PlCounterfeitReportsCount',
    'FullCounterfeitData|TimeSinceMarketIntroduction',
    'LifeCycleData|EstimatedYearsToEOL',
    'LifeCycleData|MaximumEstimatedYearsToEOL',
    'LifeCycleData|MinimumEstimatedYearsToEOL',
    'LifeCycleData|OverallRisk',
    'PackageData|Feature>Product Depth:Value',
    'ParametricData|Features>Maximum Operating Temperature:Value',
    'ParametricData|Features>Minimum Operating Temperature:Value',
    'ParametricData|Features>Number of Terminals:Value',
    'ParametricData|Features>Product Height:Value',
    'ParametricData|Features>Product Length:Value',
    'ParametricData|Features>Tolerance:Value',
    'RiskData|NumberOfDistributors',
    'RiskData|NumberOfOtherSources'
]

# Columns treated as discrete categories. In this notebook they are turned into
# integer codes by an OrdinalEncoder and each column gets its own embedding
# layer in PartEncoder, so the ORDER of this list must stay aligned with the
# order of the embedding sizes computed from it.
CATEGORICAL_FEATURES = [
    'EnvironmentalDto|ChinaRoHS|EPUP',
    'EnvironmentalDto|ChinaRoHS|PBDEFlag',
    'EnvironmentalDto|ChinaRoHS|SourceType',
    'EnvironmentalDto|ConflictMineralStatus',
    'EnvironmentalDto|EICCMembership',
    'EnvironmentalDto|EICCTemplateVersion',
    'EnvironmentalDto|Exemption',
    'EnvironmentalDto|ExemptionCodes',
    'EnvironmentalDto|HalgonFree',
    'EnvironmentalDto|RareEarthElementInformation',
    'EnvironmentalDto|RoHSVersion',
    'EnvironmentalDto|RohsIdentifier',
    'EnvironmentalDto|SourceType',
    'FullCounterfeitData|HistoricalShortagesInventoryReported',
    'FullCounterfeitData|IsPopularPart',
    'LifeCycleData|LifeCycleRiskGrade',
    'LifeCycleData|PartLifecycleCode',
    'PackageData|Feature>Mounting:Value',
    'PackageData|Feature>Package/Case:Value',
    'ParametricData|Features>Life Cycle:Value',
    'ParametricData|Features>Mounting:Value',
    'ParametricData|Features>Packaging:Value',
    'ParametricData|Features>ROHS:Value',
    'ParametricData|Features>Technology:Value',
    'ParametricData|Features>Temperature Grade:Value',
    'ParametricData|Features>Termination Style:Value',
    'ReachData|ReachDto|CASNumber',
    'ReachData|ReachDto|ContainsSVHC',
    'ReachData|ReachDto|EchaNotification',
    'ReachData|ReachDto|ReachStatus',
    'ReachData|ReachDto|SourceType',
    'RiskData|CrossesPartCategory',
    'RiskData|InventoryRisk',
    'RiskData|LifecycleRisk',
    'RiskData|MultiSourcingRisk',
    'RiskData|RohsRisk',
    'SummaryData|AECQualified',
    'SummaryData|Automotive',
    'SummaryData|DoseLevel',
    'SummaryData|ECCN',
    'SummaryData|ESDClass',
    'SummaryData|PartMarking',
    'SummaryData|RadHard',
    'SummaryData|RoHSVersion',
    'SummaryData|UNSPSC',
    'SummaryData|USChinaTariffImpact'
]

# Free-text columns. They are passed to PartsDataset, which only stores the
# list; the visible pipeline drops these columns before encoding.
TEXTUAL_FEATURES = ['SummaryData|PLName', 'SummaryData|PartDescription']

# Date-like columns. They are passed to PartsDataset, which only stores the
# list; the visible pipeline drops these columns before encoding.
# NOTE(review): 'ReachData|ReachDto|SVHCListVersion' reads like a version
# identifier rather than a date - confirm it belongs in this group.
DATE_FEATURES = [
    'EnvironmentalDto|ExemptionExpirationDate',
    'LifeCycleData|LTBDate',
    'ReachData|ReachDto|SVHCDateOfInclusion',
    'ReachData|ReachDto|SVHCListVersion',
    'SummaryData|IntroductionDate',
    'SummaryData|LastCheckDate'
]

## Data preparation

In [4]:
# Work on a copy so the raw frame loaded from disk stays untouched.
parts_features = raw_parts_features_dataset.copy()

# Fill missing numerical values from the 5 nearest rows, weighting each
# neighbour by the inverse of its distance.
imputer = KNNImputer(n_neighbors=5, weights='distance')
imputed_numerical_values = imputer.fit_transform(parts_features[NUMERICAL_FEATURES])
parts_features[NUMERICAL_FEATURES] = imputed_numerical_values

## Here starts the pipeline

In [5]:
# Keep only the model inputs: categorical columns first, then numerical ones.
parts_features = parts_features.loc[:, CATEGORICAL_FEATURES + NUMERICAL_FEATURES]

In [6]:
# Categorical columns -> int64 codes. Categories not seen during `fit` are
# encoded as -1 (see `handle_unknown` / `unknown_value`).
# NOTE(review): -1 is not a usable index for an embedding lookup - confirm that
# unseen categories cannot reach the model, or reserve a dedicated index.
ordinal_encoders = OrdinalEncoder(
    dtype=np.int64,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
# Numerical columns -> standardised values.
numerical_standard_scalers = StandardScaler()

# Both transformers are fitted on the full parts table.
ordinal_encoders.fit(parts_features[CATEGORICAL_FEATURES])
numerical_standard_scalers.fit(parts_features[NUMERICAL_FEATURES])

In [16]:
class PartsDataset(Dataset):
    """Map-style dataset that yields encoded (anchor, positive, negative) parts.

    Every part referenced by ``triplets`` is looked up by label in the index of
    ``parts_features``. Numerical columns are passed through ``num_scaler`` and
    categorical columns through ``cat_encoder``.

    The whole parts table is transformed once at construction time instead of
    running six single-row ``transform`` calls for every requested sample.

    Args:
        parts_features: Feature table indexed by part identifier. If a label
            occurs more than once, its first row is used.
        triplets: Frame with ``'anchor'``, ``'positive'`` and ``'negative'``
            columns holding part identifiers. Its index is reset internally.
        num_features: Names of the numerical columns.
        cat_features: Names of the categorical columns.
        text_features: Names of textual columns (stored, not used here).
        date_features: Names of date columns (stored, not used here).
        num_scaler: Fitted object exposing ``transform`` for the numerical
            columns. Defaults to the notebook-level ``numerical_standard_scalers``.
        cat_encoder: Fitted object exposing ``transform`` for the categorical
            columns. Defaults to the notebook-level ``ordinal_encoders``.

    Raises:
        KeyError: If a triplet references a part missing from ``parts_features``.
    """

    _ROLES = ('anchor', 'positive', 'negative')

    def __init__(
        self,
        parts_features: pd.DataFrame,
        triplets: pd.DataFrame,
        num_features: list,
        cat_features: list,
        text_features: list,
        date_features: list,
        num_scaler=None,
        cat_encoder=None,
    ) -> None:
        super(PartsDataset, self).__init__()
        self.parts_features = parts_features.copy()
        self.triplets = triplets.copy()
        self.triplets.reset_index(inplace=True, drop=True)

        self.num_features = num_features
        self.cat_features = cat_features
        self.text_features = text_features
        self.date_features = date_features

        # Fall back to the transformers fitted at notebook level so existing
        # six-argument calls keep working unchanged.
        self.num_scaler = numerical_standard_scalers if num_scaler is None else num_scaler
        self.cat_encoder = ordinal_encoders if cat_encoder is None else cat_encoder

        # A label lookup must resolve to exactly one row; keep the first row of
        # any duplicated label.
        unique_parts = self.parts_features.loc[~self.parts_features.index.duplicated(keep='first')]

        # Transform the full table once; both transforms are row-independent,
        # so the result matches transforming each row separately.
        self._num_matrix = np.asarray(self.num_scaler.transform(unique_parts[self.num_features]))
        self._cat_matrix = np.asarray(self.cat_encoder.transform(unique_parts[self.cat_features]))

        # Pre-resolve every part label of every triplet to a row position.
        self._row_positions = {}
        for role in self._ROLES:
            positions = unique_parts.index.get_indexer(pd.Index(self.triplets[role]))
            missing = positions < 0
            if missing.any():
                examples = self.triplets.loc[missing, role].unique()[:5].tolist()
                raise KeyError(
                    f"{int(missing.sum())} '{role}' part(s) not found in parts_features, e.g. {examples}"
                )
            self._row_positions[role] = positions

    def __len__(self):
        """Return the number of triplets."""
        return len(self.triplets)

    def __getitem__(self, index):
        """Return ``(num, cat)`` feature arrays for anchor, positive and negative.

        The result is a flat 6-tuple:
        ``(anchor_num, anchor_cat, positive_num, positive_cat, negative_num, negative_cat)``.
        """
        sample = []
        for role in self._ROLES:
            row = self._row_positions[role][index]
            # Copies keep callers from mutating the cached matrices.
            sample.append(self._num_matrix[row].copy())
            sample.append(self._cat_matrix[row].copy())
        return tuple(sample)


In [17]:
class PartEncoder(torch.nn.Module):
    """Feed-forward encoder that maps one part's features to an embedding.

    Each categorical column is looked up in its own embedding table. The
    resulting vectors are concatenated with the numerical features and passed
    through ``fc1 -> ReLU -> BatchNorm -> fc2 -> ReLU -> Dropout -> fc3``.

    Args:
        cat_emb_dims: Iterable of ``(num_categories, embedding_dim)`` pairs, one
            per categorical column, in the column order of the categorical input.
        final_part_emb_dim: Size of the output embedding.
        num_numerical_features: Width of the numerical input. Defaults to
            ``len(NUMERICAL_FEATURES)`` (the notebook-level list), which is the
            value that was previously hard-wired.
    """

    def __init__(self, cat_emb_dims: list, final_part_emb_dim: int, num_numerical_features: int = None):
        super(PartEncoder, self).__init__()
        if num_numerical_features is None:
            num_numerical_features = len(NUMERICAL_FEATURES)

        # Sizes may arrive as numpy / pandas scalars; torch layers expect ints.
        emb_shapes = [(int(n_categories), int(emb_dim)) for n_categories, emb_dim in cat_emb_dims]

        self.cat_emb_layers = torch.nn.ModuleList(
            [torch.nn.Embedding(n_categories, emb_dim) for n_categories, emb_dim in emb_shapes]
        )
        self.fc1 = torch.nn.Linear(
            in_features=int(num_numerical_features) + sum(emb_dim for _, emb_dim in emb_shapes),
            out_features=100,
        )
        self.batch_norm1 = torch.nn.BatchNorm1d(100)
        self.fc2 = torch.nn.Linear(in_features=100, out_features=50)
        # Attribute name (including its spelling) is kept for backward compatibility.
        self.droup_out = torch.nn.Dropout(p=0.2)
        self.fc3 = torch.nn.Linear(in_features=50, out_features=int(final_part_emb_dim))

    def forward(self, input_num_data, input_cat_data):
        """Encode a batch of parts.

        Args:
            input_num_data: Tensor of numerical features, one row per part.
            input_cat_data: Tensor of integer category codes, one row per part
                and one column per embedding layer.

        Returns:
            Tensor with one ``final_part_emb_dim``-sized row per part.
        """
        # Embedding lookups need integer indices.
        input_cat_data = input_cat_data.long()
        embedded = [emb_layer(input_cat_data[:, i]) for i, emb_layer in enumerate(self.cat_emb_layers)]

        # Cast the numerical block to float32 BEFORE concatenating so the
        # concatenation never mixes dtypes; this also works when there are no
        # categorical columns at all.
        x = torch.cat(embedded + [input_num_data.float()], 1)

        x = torch.nn.functional.relu(self.fc1(x))
        x = self.batch_norm1(x)
        x = torch.nn.functional.relu(self.fc2(x))
        x = self.droup_out(x)
        x = self.fc3(x)
        return x

In [18]:
class TripletLoss(torch.nn.Module):
    """Margin-based triplet loss averaged over the batch.

    For each row the loss is ``relu(d(anchor, positive) - d(anchor, negative) + margin)``
    where ``d`` is the value returned by :meth:`calc_euclidean`.
    """

    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def calc_euclidean(self, x1, x2):
        """Return the per-row sum of squared differences between ``x1`` and ``x2``.

        Note that no square root is taken, so this is the *squared* Euclidean
        distance along dimension 1.
        """
        difference = x1 - x2
        return torch.sum(difference.pow(2), dim=1)

    def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
        """Compute the mean hinge loss over all triplets in the batch."""
        positive_gap = self.calc_euclidean(anchor, positive)
        negative_gap = self.calc_euclidean(anchor, negative)
        # Rows where the negative is already farther than positive + margin
        # contribute zero.
        hinge = torch.relu(positive_gap - negative_gap + self.margin)
        return torch.mean(hinge)


In [19]:
# One (number_of_categories, embedding_dim) tuple per categorical column, in
# CATEGORICAL_FEATURES order. `nunique()` is used instead of
# `describe().loc['unique']` because `describe()` only reports a 'unique' row
# when the selected columns contain no numeric dtype; `nunique()` works for any
# dtype and yields the same distinct-value counts (missing values excluded).
# Embedding width follows the 1.6 * n**0.56 rule of thumb, capped at 50.
cat_features_emb_dimensions = parts_features[CATEGORICAL_FEATURES].nunique().apply(lambda x: (int(x), min(50, ceil(1.6*x**0.56)))).values
# Output embedding width: the same rule applied to the number of input columns.
final_part_emb_dimension = ceil(1.6*parts_features.shape[1]**0.56)

In [20]:
# Hold out 20% of the triplets; only the training part is wrapped in a loader here.
train_triplets_dataset, test_triplets_dataset = train_test_split(triplets_dataset, test_size=0.2)

train_dataset = PartsDataset(
    parts_features=parts_features,
    triplets=train_triplets_dataset,
    num_features=NUMERICAL_FEATURES,
    cat_features=CATEGORICAL_FEATURES,
    text_features=TEXTUAL_FEATURES,
    date_features=DATE_FEATURES,
)
# Batches of 100 triplets, served in dataset order.
train_data_loader = DataLoader(dataset=train_dataset, batch_size=100)

In [21]:
# Encoder network sized from the per-column embedding dimensions computed above.
part_encoder_model = PartEncoder(
    cat_emb_dims=cat_features_emb_dimensions,
    final_part_emb_dim=final_part_emb_dimension,
)
# Plain SGD with momentum over all encoder parameters.
optimizer = torch.optim.SGD(
    params=part_encoder_model.parameters(),
    lr=0.1,
    momentum=0.9,
)
# Triplet loss with its default margin.
criterion = TripletLoss()

In [None]:
EPOCHS = 3

# Training mode: dropout is active and batch-norm uses per-batch statistics.
part_encoder_model.train()

for epoch in tqdm(range(EPOCHS), desc="Epochs"):
    running_loss = []
    batch_iterator = tqdm(train_data_loader, desc="Training", leave=False)
    for step, batch in enumerate(batch_iterator):
        # Each batch holds (numerical, categorical) features for the anchor,
        # positive and negative parts, in that order.
        p1_num_features, p1_cat_features, p2_num_features, p2_cat_features, p3_num_features, p3_cat_features = batch

        optimizer.zero_grad()

        # The same encoder (shared weights) embeds all three triplet members.
        anchor_out = part_encoder_model(p1_num_features, p1_cat_features)
        positive_out = part_encoder_model(p2_num_features, p2_cat_features)
        negative_out = part_encoder_model(p3_num_features, p3_cat_features)

        loss = criterion(anchor_out, positive_out, negative_out)
        loss.backward()
        optimizer.step()

        # Keep a detached copy of the batch loss for the epoch summary.
        running_loss.append(loss.detach().numpy())

    mean_epoch_loss = np.mean(running_loss)
    print("Epoch: {}/{} - Loss: {:.4f}".format(epoch+1, EPOCHS, mean_epoch_loss))
