<a href="https://colab.research.google.com/github/juliawol/WB_Knowledge_Base/blob/main/WB_Zero_Shot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install necessary libraries
!pip install transformers snorkel

Collecting snorkel
  Downloading snorkel-0.9.9-py3-none-any.whl.metadata (9.7 kB)
Collecting munkres>=1.0.6 (from snorkel)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Downloading snorkel-0.9.9-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: munkres, snorkel
Successfully installed munkres-1.1.4 snorkel-0.9.9


In [4]:
import pandas as pd

# Load the dataset
df_full = pd.read_csv('/content/qa_card_dataset.csv')

#Remove duplicates
df = df_full.drop_duplicates()

# Save the cleaned dataset (optional)
df.to_csv('/content/qa_card_dataset_cleaned.csv', index=False)

print(f"Original dataframe shape: {df_full.shape}")
print(f"Cleaned dataframe shape: {df.shape}")

Original dataframe shape: (10000, 4)
Cleaned dataframe shape: (6968, 4)


In [8]:
from transformers import pipeline
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel
from sklearn.metrics import classification_report, accuracy_score

# Load dataset
train_data = pd.read_csv('/content/qa_card_dataset_cleaned.csv')

# Preprocess text
def preprocess_text(text):
    if pd.isnull(text):  # Handle NaN values
        return ""
    return " ".join(str(text).lower().split())

train_data['Question_clean'] = train_data['Question'].fillna("").apply(preprocess_text)
train_data['Description_clean'] = train_data['Description'].fillna("").apply(preprocess_text)
train_data['Answer_clean'] = train_data['Answer'].fillna("").apply(preprocess_text)

# Initialize Zero-Shot Classifier
zero_shot_classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")
candidate_labels = ["answerable", "not answerable"]


Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Define Heuristic Labeling Functions
@labeling_function()
def lf_zero_shot(row):
    """Use zero-shot classification to predict sufficiency."""
    result = zero_shot_classifier(
        row['Question_clean'],
        candidate_labels=candidate_labels,
        hypothesis_template="The question is {} based on the description."
    )
    return 1 if result['labels'][0] == "answerable" else 0

@labeling_function()
def lf_keyword_match(row):
    """Label 1 if specific keywords in the question appear in the description."""
    keywords = ["гарантия", "размер", "цена", "материал", "годен"]
    question_words = set(row['Question_clean'].split())
    description_words = set(row['Description_clean'].split())
    return 1 if len(question_words & set(keywords) & description_words) > 0 else -1  # Abstain if no match

@labeling_function()
def lf_description_length(row):
    """Label 0 if description length is too short."""
    return 0 if len(row['Description_clean']) < 50 else -1  # Abstain if length is reasonable

@labeling_function()
def lf_answer_in_description(row):
    """Label 1 if the seller's answer references content from the description."""
    answer_words = set(row['Answer_clean'].split())
    description_words = set(row['Description_clean'].split())
    return 1 if len(answer_words & description_words) > 0 else -1  # Abstain if no overlap

# Combine Labeling Functions
lfs = [lf_zero_shot, lf_keyword_match, lf_description_length, lf_answer_in_description]
applier = PandasLFApplier(lfs=lfs)

# Apply Labeling Functions
L_train = applier.apply(train_data)

# Analyze Labeling Function Performance
lf_analysis = LFAnalysis(L=L_train, lfs=lfs)
print("Labeling Function Summary:")
print(lf_analysis.lf_summary())

# Train a Label Model
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, lr=0.01)

# Generate Probabilistic Labels
train_data['True_Class'] = label_model.predict(L=L_train)
train_data['True_Class_Prob'] = label_model.predict_proba(L=L_train)[:, 1]  # Confidence score for class 1

# Save the auto-labeled dataset
train_data.to_csv('/content/auto_labeled_dataset.csv', index=False)


100%|██████████| 6968/6968 [2:55:29<00:00,  1.51s/it]


Labeling Function Summary:
                          j Polarity  Coverage  Overlaps  Conflicts
lf_zero_shot              0   [0, 1]  1.000000  0.894087   0.270666
lf_keyword_match          1      [1]  0.044489  0.044489   0.003588
lf_description_length     2       []  0.000000  0.000000   0.000000
lf_answer_in_description  3      [1]  0.888777  0.888777   0.270092


100%|██████████| 500/500 [00:00<00:00, 507.35epoch/s]
