# Data preprocessing

In [1]:
import numpy as np      
import pandas as pd
needed_data = pd.read_csv('needed.csv')

  needed_data = pd.read_csv('needed.csv')


In [180]:
needed_data

Unnamed: 0,SUBSIDIARY_NAME,PREV_RECEIPT_SYMPTOM,PREV_ENGINEER_SYMPTOM,PREV_ENGINEER_REPAIR,RECEIPT_SYMPTOM,ENGINEER_SYMPTOM,ENGINEER_REPAIR
0,LGEAI,Power,Image Retention,Adjustment external (give ref#),,No indication of power and totally inoperative...,Part replaced-electrical (ref#)
1,LGEAI,Audio,No indication of power and totally inoperative...,Exchange-PC Board,Repair Status/Issue,No indication of power and totally inoperative...,Exchange-PC Board
2,LGEAI,Power,No indication of power and totally inoperative...,Part replaced-electrical (ref#),Power,Distorted/noisy or snow picture,Part replaced-electrical (ref#)
3,LGEAI,Power,No indication of power and totally inoperative...,Part replaced-electrical (ref#),,Display malfuction,Part replaced-electrical (ref#)
4,LGEAI,Power,No indication of power and totally inoperative...,Alignment eletrical,,No indication of power and totally inoperative...,Part replaced-electrical (ref#)
...,...,...,...,...,...,...,...
13878847,LGEIL,,,,,Installation of a Product,Installation of new product
13878848,LGEIL,,,,Installation,Installation of a Product,Installation of new product
13878849,LGEIL,,,,Specification,Installation of a Product,Installation of new product
13878850,LGEIL,,,,Specification,Installation of a Product,Installation of new product


In [81]:
usa = needed_data[needed_data['SUBSIDIARY_NAME']=='LGEAI']
india = needed_data[needed_data['SUBSIDIARY_NAME']=='LGEIL']

In [82]:
india = india[(india['PREV_ENGINEER_SYMPTOM'] != 'Installation of a Product') & (india['PREV_ENGINEER_SYMPTOM']!= 'Demo')]

In [83]:
india_inapp = india[['PREV_ENGINEER_SYMPTOM','PREV_ENGINEER_REPAIR']].dropna()
india_inapp['LABEL'] = 0
india_inapp.columns = ['ENGINEER_SYMPTOM','ENGINEER_REPAIR', 'LABEL']

usa_inapp = usa[['PREV_ENGINEER_SYMPTOM','PREV_ENGINEER_REPAIR']].dropna()
usa_inapp['LABEL'] = 0
usa_inapp.columns = ['ENGINEER_SYMPTOM','ENGINEER_REPAIR', 'LABEL']

In [84]:
india_app = india[['ENGINEER_SYMPTOM','ENGINEER_REPAIR']].dropna()
india_app['LABEL'] = 1

usa_app = usa[['ENGINEER_SYMPTOM','ENGINEER_REPAIR']].dropna()
usa_app['LABEL'] = 1

In [85]:
india_labeled = pd.concat([india_inapp,india_app]).reset_index(drop=True)
india_labeled = india_labeled.drop_duplicates()

usa_labeled = pd.concat([usa_inapp,usa_app]).reset_index(drop=True)
usa_labeled = usa_labeled.drop_duplicates()

In [86]:
india_labeled['ENGINEER_SYMPTOM']=india_labeled['ENGINEER_SYMPTOM'].apply(lambda x: x.lower())
india_labeled['ENGINEER_REPAIR']=india_labeled['ENGINEER_REPAIR'].apply(lambda x: x.lower())
india_labeled = india_labeled.drop_duplicates(['ENGINEER_SYMPTOM','ENGINEER_REPAIR'])

usa_labeled['ENGINEER_SYMPTOM']=usa_labeled['ENGINEER_SYMPTOM'].apply(lambda x: x.lower())
usa_labeled['ENGINEER_REPAIR']=usa_labeled['ENGINEER_REPAIR'].apply(lambda x: x.lower())
usa_labeled = usa_labeled.drop_duplicates(['ENGINEER_SYMPTOM','ENGINEER_REPAIR'])

In [90]:
all_labeled = pd.concat([usa_labeled,india_labeled]).reset_index(drop=True).drop_duplicates()
all_labeled

Unnamed: 0,ENGINEER_SYMPTOM,ENGINEER_REPAIR,LABEL
0,image retention,adjustment external (give ref#),0
1,no indication of power and totally inoperative...,exchange-pc board,0
2,no indication of power and totally inoperative...,part replaced-electrical (ref#),0
3,no indication of power and totally inoperative...,alignment eletrical,0
4,no indication of power and totally inoperative...,authorized installation,0
...,...,...,...
4800,defect caused due to wrong usage by customer,set replacement (customer),1
4801,speaker_pdr,"product exchange (too many visits, repairs)",1
4802,top lid / magnet connection loose,re-installation,1
4803,bulb adjusted,set replacement (customer),1


# Tokenizer

In [91]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel

# Load a pre-trained model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [92]:
def generate_embeddings(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

In [93]:
def return_data(df):
    symptom_embeddings = generate_embeddings(df['ENGINEER_SYMPTOM'].tolist())
    repair_embeddings = generate_embeddings(df['ENGINEER_REPAIR'].tolist())
    labels = df['LABEL'].tolist()
    
    return symptom_embeddings, repair_embeddings, labels

symptom_embeddings, repair_embeddings, labels = return_data(all_labeled)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Training on GPU.")
else:
    device = torch.device("cpu")
    print("GPU not available, training on CPU.")

Training on GPU.


# Siamese Network and Contrastive Loss for Contrastive Learning

In [94]:
class RepairDataset(Dataset):
    def __init__(self, symptoms, repairs, labels):
        self.symptoms = symptoms
        self.repairs = repairs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        symptom_embedding = self.symptoms[idx]
        repair_embedding = self.repairs[idx]
        label = self.labels[idx]
        return symptom_embedding, repair_embedding, label

In [135]:
class SiameseNetwork(nn.Module):
    def __init__(self):
        super(SiameseNetwork, self).__init__()
        # Define the architecture
        self.fc_layers = nn.Sequential(
            nn.Linear(768, 512),
            nn.SELU(),  # SELU activation function
            nn.Linear(512, 256),
            nn.SELU(),
            nn.Linear(256, 128)
        )

    def forward_one(self, x):
        return self.fc_layers(x)

    def forward(self, input1, input2):
        output1 = self.forward_one(input1)
        output2 = self.forward_one(input2)
        return output1, output2

In [136]:
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=0.1):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = nn.functional.pairwise_distance(output1, output2)
        loss_contrastive = torch.mean((1-label) * torch.pow(euclidean_distance, 2) +
                                      (label) * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss_contrastive

In [137]:
repair_dataset = RepairDataset(symptom_embeddings, repair_embeddings,labels)

train_dataset, val_dataset = torch.utils.data.random_split(repair_dataset, [0.8, 0.2])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [139]:
NUM_EPOCHS=5

# Initialize the network and loss function
net = SiameseNetwork()
criterion = ContrastiveLoss(margin=0.1)
optimizer = torch.optim.Adam(net.parameters(), lr=0.001, weight_decay=0.001)

# Move the model to GPU if available
net.to(device)

# Assuming net, criterion, optimizer, and num_epochs are already defined

for epoch in range(NUM_EPOCHS):
    net.train()  # Set the model to training mode
    running_loss = 0.0
    for symptom, repair, label in train_loader:
        symptom, repair, label = symptom.to(device), repair.to(device), label.to(device)

        optimizer.zero_grad()
        output1, output2 = net(symptom, repair)
        loss = criterion(output1, output2, label)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print(f'Epoch {epoch + 1}, Training Loss: {running_loss / len(train_loader):.4f}')
    
    # Validation
    net.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    with torch.no_grad():
        for symptom, repair, label in val_loader:
            symptom, repair, label = symptom.to(device), repair.to(device), label.to(device)

            output1, output2 = net(symptom, repair)
            loss = criterion(output1, output2, label)
            val_loss += loss.item()

    print(f'Epoch {epoch + 1}, Validation Loss: {val_loss / len(val_loader):.4f}')

print('Finished Training')

Epoch 1, Training Loss: 0.0230
Epoch 1, Validation Loss: 0.0022
Epoch 2, Training Loss: 0.0020
Epoch 2, Validation Loss: 0.0022
Epoch 3, Training Loss: 0.0020
Epoch 3, Validation Loss: 0.0022
Epoch 4, Training Loss: 0.0019
Epoch 4, Validation Loss: 0.0023
Epoch 5, Training Loss: 0.0020
Epoch 5, Validation Loss: 0.0023
Finished Training


In [173]:
from sklearn.metrics import accuracy_score, f1_score, roc_curve

def compute_distances(model, dataloader):
    model.to(device)
    model.eval()
    distances = []
    labels = []
    with torch.no_grad():
        for symptom, repair, label in dataloader:
            symptom, repair, label = symptom.to(device), repair.to(device), label.to(device)
            output1, output2 = model(symptom, repair)
            distance = nn.functional.pairwise_distance(output1, output2)
            distances.extend(distance.cpu().numpy())
            labels.extend(label.cpu().numpy())
    return np.array(distances), np.array(labels)

# Assuming val_loader is your validation DataLoader
distances, true_labels = compute_distances(net, val_loader)

# Finding an optimal threshold
fpr, tpr, thresholds = roc_curve(true_labels, distances, pos_label=1) 
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]

# You can also evaluate the performance using this threshold
predicted_labels = distances <= optimal_threshold
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

print(f"Optimal Threshold: {optimal_threshold}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Optimal Threshold: 0.04533032700419426
Accuracy: 0.353125
F1 Score: 0.3257328990228013


In [149]:
def generate_symptom_embedding(symptom_text):
    symptom_embedding = generate_embeddings([symptom_text])
    return symptom_embedding.to(device)

In [150]:
def is_appropriate_pair(model, symptom_embedding, repair_embedding, threshold):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        output1, output2 = model(symptom_embedding, repair_embedding)
        euclidean_distance = nn.functional.pairwise_distance(output1, output2)
        print(euclidean_distance)
        return euclidean_distance.item() < threshold

In [184]:
input_symptom = "Part failure"
input_repair = "I don't care"
symptom_embedding = generate_symptom_embedding(input_symptom)
repair_embedding = generate_symptom_embedding(input_repair)

In [185]:
threshold=optimal_threshold
appropriate = is_appropriate_pair(net, symptom_embedding, repair_embedding, threshold)

tensor([0.0559], device='cuda:0')


In [186]:
appropriate

False

In [59]:
import optuna
from optuna.trial import TrialState
def compute_distances(model, dataloader):
        model.eval()
        distances = []
        labels = []
        with torch.no_grad():
            for symptom, repair, label in dataloader:
                symptom, repair, label = symptom.to(device), repair.to(device), label.to(device)
                output1, output2 = model(symptom, repair)
                distance = nn.functional.pairwise_distance(output1, output2)
                distances.extend(distance.cpu().numpy())
                labels.extend(label.cpu().numpy())
        return np.array(distances), np.array(labels)


def objective(trial):
    # Generate the model.
    model = SiameseNetwork().to(device)

    # Generate the optimizers.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "AdamW"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    _lambda = trial.suggest_float('lambda',1e-5,1e-1,log=True)
    optimizer = getattr(torch.optim, optimizer_name)(model.parameters(), lr=lr,weight_decay=_lambda)

    repair_dataset = RepairDataset(symptom_embeddings, repair_embeddings,labels)
    train_dataset, val_dataset = torch.utils.data.random_split(repair_dataset, [0.8, 0.2])
    
    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    valid_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    margin = trial.suggest_float('margin',1e-5,10,log=True)
    criterion = ContrastiveLoss(margin=margin)
    
    epochs = trial.suggest_int('epochs', 1, 30, log=True)
    
    
    
    # Training of the model.
    for epoch in range(epochs):
        model.train()
        for _, (symptom, repair, label) in enumerate(train_loader):
            symptom, repair, label = symptom.to(device), repair.to(device), label.to(device)
            
            
            
            optimizer.zero_grad()
            output1, output2 = model(symptom, repair)
            train_loss = criterion(output1, output2, label)
            train_loss.backward()
            optimizer.step()

        # # Validation of the model.
        # model.eval()
        # with torch.no_grad():
        #     for _, (symptom, repair, label) in enumerate(valid_loader):
               

        #         symptom, repair, label = symptom.to(device), repair.to(device), label.to(device)
                
        #         output1, output2 = model(symptom, repair)
                
        #         val_loss = criterion(output1, output2, label)

        # trial.report(train_loss, epoch)
        
        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    
    return train_loss

        
    # # Assuming val_loader is your validation DataLoader
    # distances, true_labels = compute_distances(model, val_loader)

    # # Finding an optimal threshold
    # fpr, tpr, thresholds = roc_curve(true_labels, distances)
    # optimal_idx = np.argmax(tpr - fpr)
    # optimal_threshold = thresholds[optimal_idx]

    # # You can also evaluate the performance using this threshold
    # predicted_labels = distances <= optimal_threshold
    # accuracy = accuracy_score(true_labels, predicted_labels)
    # f1 = f1_score(true_labels, predicted_labels)
    
        

    # return accuracy

In [60]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, timeout=600)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2023-12-05 22:11:42,734] A new study created in memory with name: no-name-403fc5ff-a796-42d2-ae92-49b47f5d73cf
[I 2023-12-05 22:11:44,533] Trial 0 finished with value: 0.0011100664269179106 and parameters: {'optimizer': 'Adam', 'lr': 0.003546497507000307, 'lambda': 6.162165921873029e-05, 'margin': 0.0709971252551807, 'epochs': 4}. Best is trial 0 with value: 0.0011100664269179106.
[I 2023-12-05 22:11:53,155] Trial 1 finished with value: 0.00441976822912693 and parameters: {'optimizer': 'AdamW', 'lr': 0.00010427861958611028, 'lambda': 2.957083308387833e-05, 'margin': 0.0006634147882599187, 'epochs': 20}. Best is trial 0 with value: 0.0011100664269179106.
[I 2023-12-05 22:11:53,592] Trial 2 finished with value: 0.2916943430900574 and parameters: {'optimizer': 'AdamW', 'lr': 0.0018935837996756249, 'lambda': 0.0004795996568026854, 'margin': 1.2581084760719654, 'epochs': 1}. Best is trial 0 with value: 0.0011100664269179106.
[I 2023-12-05 22:11:54,463] Trial 3 finished with value: 20384.

Study statistics: 
  Number of finished trials:  98
  Number of pruned trials:  0
  Number of complete trials:  98
Best trial:
  Value:  0.0
  Params: 
    optimizer: RMSprop
    lr: 0.0010505691980853566
    lambda: 0.07948347961084459
    margin: 5.72106327536661e-05
    epochs: 30


# Sentence Transformer

In [11]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
sentences_symptoms = list(usa_labeled['ENGINEER_SYMPTOM'])
s_emb_symptoms = model.encode(sentences_symptoms)

sentences_repairs = list(usa_labeled['ENGINEER_REPAIR'])
s_emb_repairs = model.encode(sentences_repairs)

In [13]:
s_emb_symptoms.shape

(4555, 384)

In [18]:
labels = usa_labeled['LABEL'].values

In [19]:
len(labels)

4555

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np

# Assuming embeddings and labels are loaded into these variables
# symptom_embeddings: numpy array of shape (154, 384)
# repair_embeddings: numpy array of shape (108, 384)
# labels: numpy array of shape (154,) with binary values

class CustomDataset(Dataset):
    def __init__(self, symptom_embeddings, repair_embeddings, labels):
        self.symptom_embeddings = symptom_embeddings
        self.repair_embeddings = repair_embeddings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        symptom_embedding = self.symptom_embeddings[idx]
        repair_embedding = self.repair_embeddings[idx % len(self.repair_embeddings)]  # Adjust index for repair_embeddings
        label = self.labels[idx]
        return symptom_embedding, repair_embedding, label

class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = nn.functional.pairwise_distance(output1, output2)
        loss_contrastive = torch.mean((1-label) * torch.pow(euclidean_distance, 2) +
                                      label * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss_contrastive

class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(768, 128),  # Input dimension is 768 since we concatenate two embeddings
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32,1),
        )

    def forward(self, x):
        return self.fc(x)

def train(model, dataloader, criterion, optimizer, epochs=10):
    for epoch in range(epochs):
        for symptom_emb, repair_emb, label in dataloader:
            combined_emb = torch.cat((symptom_emb, repair_emb), dim=1)  # Concatenate embeddings

            optimizer.zero_grad()

            # Forward pass
            output = model(combined_emb)

            # Contrastive loss
            loss = criterion(output, output, label.float())  # Using the same output as both inputs to the loss function

            # Backward and optimize
            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.7f}')

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(model, dataloader, epochs = 10):
    model.eval()  # Set the model to evaluation mode
    predictions, actuals = [], []
    with torch.no_grad():
        for symptom_emb, repair_emb, label in dataloader:
            combined_emb = torch.cat((symptom_emb, repair_emb), dim=1)
            output = model(combined_emb)
            print(output)
            predictions.append(output)
            actuals.append(label)

    accuracy = accuracy_score(actuals, predictions)
    precision = precision_score(actuals, predictions)
    recall = recall_score(actuals, predictions)
    f1 = f1_score(actuals, predictions)
    return accuracy, precision, recall, f1

In [26]:
# num_epochs = 10
custom_dataset = CustomDataset(s_emb_symptoms, s_emb_repairs, labels)
train_dataset, val_dataset = torch.utils.data.random_split(custom_dataset, [0.8, 0.2])
train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=10, shuffle=True)

model = SimpleNN()
criterion = ContrastiveLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# for epoch in range(num_epochs):
train(model, train_loader, criterion, optimizer)
accuracy, precision, recall, f1 = evaluate(model, val_loader)
print(f'Validation - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}')

Epoch [1/10], Loss: 0.7499985
Epoch [2/10], Loss: 0.7499985
Epoch [3/10], Loss: 0.7499985
Epoch [4/10], Loss: 0.9999980
Epoch [5/10], Loss: 0.4999990
Epoch [6/10], Loss: 0.7499985
Epoch [7/10], Loss: 0.7499985
Epoch [8/10], Loss: 0.7499985
Epoch [9/10], Loss: 0.4999990
Epoch [10/10], Loss: 0.7499985
tensor([[0.0972],
        [0.0921],
        [0.0957],
        [0.0914],
        [0.0920],
        [0.0907],
        [0.0917],
        [0.0903],
        [0.0954],
        [0.0983]])
tensor([[0.0924],
        [0.0929],
        [0.0961],
        [0.0926],
        [0.0987],
        [0.0970],
        [0.1022],
        [0.0884],
        [0.0914],
        [0.0954]])
tensor([[0.0930],
        [0.0937],
        [0.0941],
        [0.0909],
        [0.0898],
        [0.0926],
        [0.0943],
        [0.0944],
        [0.0923],
        [0.0931]])
tensor([[0.0900],
        [0.0912],
        [0.0970],
        [0.0928],
        [0.0927],
        [0.0982],
        [0.0988],
        [0.0930],
        [0.0

  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)


ValueError: unknown is not supported

## TEst

In [3]:
from gensim.models import KeyedVectors

# Load a pre-trained Word2Vec model (e.g., Google News vectors)
word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [49]:
import nltk
class_names = needed_data['processed_text'].unique()

def vectorize_class_name(class_name, model):
    words = class_name.split()
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

class_vectors = np.array([vectorize_class_name(name, word2vec_model) for name in class_names])

In [59]:
repair_counts = needed_data['processed_text'].value_counts()

# You can then examine these counts to determine a threshold for low-frequency classes
print(repair_counts.median())

# Decide on a threshold
threshold = repair_counts.median() + 1  # Example threshold
print(threshold)
# Identify classes below the threshold
low_freq_classes = repair_counts[repair_counts < threshold].index

1.0
2.0


In [67]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
# Example: Find the most similar class for each class in 'low_freq_classes'
similar_classes = {}
for class_name in tqdm(low_freq_classes):
    class_index = np.where(class_names == class_name)[0][0]
    class_vector = class_vectors[class_index]
    
    class_vector = class_vector.reshape(1,-1)
    class_vectors = np.array(class_vectors)
    # Compute similarities
    similarities = cosine_similarity(class_vector, class_vectors).flatten()

    # Find the most similar class
    most_similar_index = similarities.argsort()[-2]  # The most similar class (excluding itself)
    similar_classes[class_name] = class_names[most_similar_index]

100%|██████████| 71524/71524 [6:26:10<00:00,  3.09it/s]  


In [25]:
similar_classes

{'Alignment  tape path': 'Alignment  eletrical',
 'Repaired mechanical parts': 'Repaired electrical parts',
 'Format': 'Alignment  Tuner (AM/FM  etc)',
 'Product exchange': 'Product exchange',
 'PART REPLACED OTHER INTERNAL': 'PART REPLACED OTHER EXTERNAL',
 'No backlight': 'LCD pixel bright',
 'Repaired broken PC board trace': 'Repaired cracked PC board',
 'White Balancing': 'Alignment  eletrical',
 'Specification measurement': 'Component video no color',
 'WIRING HARNESS, CONTROL replacement': 'COMPLEX BUSBAR replacement',
 'Software upgrade': 'Software correction, reset',
 'PART REPLACED OTHER EXTERNAL': 'PART REPLACED OTHER INTERNAL',
 'Cover replacement': 'Battery,Assembly replacement',
 'Repaired cracked PC board': 'Repaired broken PC board trace',
 "Pipe ass'y Outlet replacement": "Pipe ass'y Inlet replacement",
 'Dead-On-Arrival Part': 'Part replaced-mechanical (ref#)',
 'DVI distorted not operating': 'DVI distorted',
 'Tape problem': 'Alignment  tape path',
 'Component video n

In [33]:
target_classes = set(similar_classes.values())
len(target_classes)
needed_data['ENGINEER_REPAIR_MAPPED'] = needed_data['ENGINEER_REPAIR'].apply(lambda x: similar_classes[x] if x in similar_classes and x not in target_classes else x)

In [34]:
sample = needed_data.sample(1024,random_state=42)

In [35]:
sample

Unnamed: 0.1,Unnamed: 0,PREV_RECEIPT_SYMPTOM,PREV_ENGINEER_SYMPTOM,PREV_ENGINEER_REPAIR,RECEIPT_SYMPTOM,ENGINEER_SYMPTOM,ENGINEER_REPAIR,combined_text,processed_text,repair_label,ENGINEER_REPAIR_MAPPED
1053981,1875091,Video,Display malfuction,Exchange-excessive service,,Special feature/operation does not work,Part replaced-electrical (ref#),Video Display malfuction Exchange-excessive se...,video display malfuction exchange excessive se...,112,Part replaced-electrical (ref#)
4154566,7530136,,,,Troubleshooting - Power,Part failure,Parts replaced,Troubleshooting - Power Part failure,troubleshooting power part failure,118,Parts replaced
7230514,13409326,,,,Specification,Wet Service,Wet/Preventive Maintenance,Specification Wet Service,specification wet service,172,Wet/Preventive Maintenance
7158500,13265951,,,,,Accessory Missing/ Damaged,Accessory Sale,Accessory Missing/ Damaged,accessory missing damaged,0,Accessory Sale
1958580,3538980,,,,Power Issues,Part failure,Parts replaced,Power Issues Part failure,power issues part failure,118,Parts replaced
...,...,...,...,...,...,...,...,...,...,...,...
5809504,10772128,,,,Specification,Wet Service,Wet/Preventive Maintenance,Specification Wet Service,specification wet service,172,Wet/Preventive Maintenance
3184222,5629940,,,,Troubleshooting - Power,Payment Recieved from customer,O/W Estimate given,Troubleshooting - Power Payment Recieved from ...,troubleshooting power payment recieved from cu...,102,O/W Estimate given
4323144,7869696,,,,Video Issues,Part failure,O/W Estimate given,Video Issues Part failure,video issues part failure,102,O/W Estimate given
6000232,11126532,,,,Specification,Wet Service,Wet/Preventive Maintenance,Specification Wet Service,specification wet service,172,Wet/Preventive Maintenance


In [28]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tabpfn import TabPFNClassifier

In [36]:
encoder = LabelEncoder()
sample['repair_label'] = encoder.fit_transform(sample['ENGINEER_REPAIR_MAPPED'])

In [37]:
X = sample['processed_text']
y = sample['repair_label']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)