In [15]:
import pandas as pd
import spacy
from sklearn.metrics import precision_score, recall_score, f1_score
from spacy.training import Example
import warnings
warnings.filterwarnings("ignore")
import random
from spacy.pipeline import EntityRuler

# Load the uploaded CSV files
train_data_path = 'train_1_translated_cleaned.csv'
test_data_path = 'test_1_translated_cleaned.csv'

# Read the datasets
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

# Display basic information and the first few rows of each dataset
train_info = train_df.info(), train_df.head()
test_info = test_df.info(), test_df.head()

train_info, test_info


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Component              1000 non-null   object 
 1   Failure                1000 non-null   object 
 2   DefectivePart          1000 non-null   object 
 3   PartsReplaced          1000 non-null   float64
 4   Cost                   1000 non-null   float64
 5   DealerComment_English  1000 non-null   object 
dtypes: float64(2), object(4)
memory usage: 47.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Component              1000 non-null   object 
 1   Failure                1000 non-null   object 
 2   DefectivePart          1000 non-null   object 
 3   PartsReplaced          1000 non-null   float64
 4   Cost

((None,
        Component         Failure                   DefectivePart  \
  0  REMORC AS24H          DEFECT               ECU-BODY COMPUTER   
  1  REMORC AS24H  NO FAULT FOUND              AUTOM.TRANSMISSION   
  2  REMORC AS24H          DEFECT                   DELIVERY LINE   
  3  REMORC AS24H          DEFECT  DUMMY MATERIAL FOR CAUSAL PART   
  4  REMORC AS24H          DEFECT  DUMMY MATERIAL FOR CAUSAL PART   
  
     PartsReplaced    Cost                              DealerComment_English  
  0            0.0  126.70  RECORD DOSSIER NO. 8001751090 BREAKDOWN LOCATI...  
  1            0.0  313.32  COMPLAIN The gearbox snaps when starting CAUSE...  
  2            0.0  124.25  RECL BREAKDOWN LOCATION: FRANKFURT AIRPORT, BU...  
  3            0.0  297.50  RECORD DOSSIER NUMBER: 8001717298 BREAKDOWN LO...  
  4            0.0  155.05  RECORD DOSSIER NUMBER: 8001731587 BREAKDOWN LO...  ),
 (None,
              Component Failure         DefectivePart  PartsReplaced  Cost  \
  0  EN

In [16]:


# Load spaCy's pre-trained English model
nlp = spacy.load("en_core_web_sm")

# Sample a subset of dealer comments for initial NER analysis
sample_comments = train_df["DealerComment_English"].sample(10, random_state=1)

# Perform NER on the sample comments
ner_results = []
for comment in sample_comments:
    doc = nlp(comment)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    ner_results.append({"comment": comment, "entities": entities})

# Display the NER results
ner_results


[{'comment': 'COMPLAINT SAT NAV INTERMITTENTLY WORKING CAUSE INTERNAL FAULT IN SD CARD CORRECTION REMOVED AND REPLACED CARD TESTED OK. 003 - EXTRA LABOUR CLAIOMED AS COULD NOT LOCATE SRT TO REPLACE SD CARD.',
  'entities': [('COMPLAINT SAT NAV INTERMITTENTLY WORKING CAUSE INTERNAL FAULT IN SD CARD',
    'ORG'),
   ('003', 'CARDINAL')]},
 {'comment': 'RECL Airbag fault is displayed REASON No error stored in the IPC CORRECTION Update system according to KA4971',
  'entities': [('RECL Airbag', 'ORG'),
   ('REASON', 'NORP'),
   ('KA4971', 'PRODUCT')]},
 {'comment': 'COMPLAINT RADIO JAMS AND DOES NOT RESPOND TO BUTTONS, RADIO DOES NOT WORK, RADIO REPLACEMENT CPRZ RADIO JAMS AND DOES NOT RESPOND TO BUTTONS, RADIO DOES NOT WORK, RADIO REPLACEMENT CORRECTION RADIO JAMS AND DOES NOT RESPOND TO BUTTONS, RADIO DOES NOT WORK, RADIO REPLACEMENT',
  'entities': []},
 {'comment': 'RECLAIM Customer complaint: No DAB+ reception from radio possible REASON DAB radio has an internal short circuit CORRECTI

In [17]:


# Initialize spaCy's blank model
nlp = spacy.blank("en")

# Create and add the EntityRuler to the pipeline directly
ruler = nlp.add_pipe("entity_ruler")

# Define and add patterns for custom NER
patterns = [
    # Defective Component and Part Patterns
    {"label": "DEFECTIVE_COMPONENT", "pattern": [{"LOWER": {"IN": ["nox", "ad-blue", "control", "display", "engine", "pcm", "ecu"]}}]},
    {"label": "DEFECTIVE_PART", "pattern": [{"LOWER": {"IN": ["sensor", "module", "line", "cable", "nozzle", "injector", "pump", "valve"]}}]},

    # Type of Failure Patterns
    {"label": "TYPE_OF_FAILURE", "pattern": [{"LOWER": {"IN": ["defective", "error", "fault", "issue", "malfunction", "broken"]}}, {"LOWER": "message", "OP": "?"}]},

    # Number of Parts Replaced
    {"label": "NUM_PARTS_REPLACED", "pattern": [{"IS_DIGIT": True}, {"LOWER": "x"}, {"LOWER": {"IN": ["sensor", "line", "module", "part"]}}]},
    {"label": "NUM_PARTS_REPLACED", "pattern": {"REGEX": r"\b(one|two|three|four|five|six|seven|eight|nine|ten)\s+(sensors?|lines?|modules?|parts?)\b"}},

    # Total Cost Incurred with optional currency symbols
    {"label": "TOTAL_COST_INCURRED", "pattern": {"REGEX": r"\b\d+(\.\d+)?\s*(eur|usd|euros|dollars|\$)?\b"}},

    # Cause of Failure Patterns
    {"label": "CAUSE_FAILURE_HUMAN_ERROR", "pattern": [{"LOWER": "improper"}, {"LOWER": "use"}]},
    {"label": "CAUSE_FAILURE_DESIGN_FLAW", "pattern": [{"LOWER": "design"}, {"LOWER": "flaw"}]},
    {"label": "CAUSE_FAILURE_MATERIAL", "pattern": [{"LOWER": {"IN": ["material", "manufacturing"]}}, {"LOWER": {"IN": ["flaw", "defect", "issue"]}, "OP": "?"}]},
    {"label": "CAUSE_FAILURE_DESIGN_FLAW", "pattern": {"REGEX": r"\b(design|manufacturing)\s(flaw|defect|issue)\b"}}
]
ruler.add_patterns(patterns)

# Apply the pipeline to a sample of dealer comments
sample_comments = train_df["DealerComment_English"].sample(1000, random_state=42)
extracted_entities = []

for comment in sample_comments:
    doc = nlp(comment)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    extracted_entities.append({"comment": comment, "entities": entities})

# Display the sample output with extracted entities
extracted_entities


[{'comment': 'COMPLAINT Radio + phone handsfree dosent operation. CAUSE Phone handsfree microphone defective contacts. CORRECTION R-I roof light (included Phone handsfree microphone) + Phone handsfree microphone repair. Economy: Phone handsfree microphone repair.',
  'entities': [('defective', 'TYPE_OF_FAILURE')]},
 {'comment': 'COMPLAINT PCM FAILURE, CAUSE POOR PERFORMANCE PCM CONTROL UNIT BATTERY SOLUTION REPLACE BUFFER BATTERY IN DATA CONTROL UNIT PREPARATION OF THE U.D.T. STATION TEST CONTROL UNITS AND ERASING MEMORY FAULTS',
  'entities': [('PCM', 'DEFECTIVE_COMPONENT'),
   ('PCM', 'DEFECTIVE_COMPONENT'),
   ('CONTROL', 'DEFECTIVE_COMPONENT'),
   ('CONTROL', 'DEFECTIVE_COMPONENT'),
   ('CONTROL', 'DEFECTIVE_COMPONENT')]},
 {'comment': 'DAMAGE TYPE ANOMALY REPORTED BY CONTROL ROOM CAUSE PCM BUFFER BATTERY INEFFICIENT/POOR PERFORMANCE CORRECTION PCM BATTERY REPLACEMENT',
  'entities': [('CONTROL', 'DEFECTIVE_COMPONENT'),
   ('PCM', 'DEFECTIVE_COMPONENT'),
   ('PCM', 'DEFECTIVE_COMPO

In [18]:
# Convert extracted entities into training data format
TRAIN_DATA = []
for entry in extracted_entities:
    text = entry['comment']
    entities = []
    for ent_text, ent_label in entry['entities']:
        start = text.find(ent_text)
        end = start + len(ent_text)
        # Only add the entity if it was found in the text
        if start != -1:
            entities.append((start, end, ent_label))
    TRAIN_DATA.append((text, {"entities": entities}))

# Verify sample output of TRAIN_DATA
TRAIN_DATA[:5]


[('COMPLAINT Radio + phone handsfree dosent operation. CAUSE Phone handsfree microphone defective contacts. CORRECTION R-I roof light (included Phone handsfree microphone) + Phone handsfree microphone repair. Economy: Phone handsfree microphone repair.',
  {'entities': [(85, 94, 'TYPE_OF_FAILURE')]}),
 ('COMPLAINT PCM FAILURE, CAUSE POOR PERFORMANCE PCM CONTROL UNIT BATTERY SOLUTION REPLACE BUFFER BATTERY IN DATA CONTROL UNIT PREPARATION OF THE U.D.T. STATION TEST CONTROL UNITS AND ERASING MEMORY FAULTS',
  {'entities': [(10, 13, 'DEFECTIVE_COMPONENT'),
    (10, 13, 'DEFECTIVE_COMPONENT'),
    (50, 57, 'DEFECTIVE_COMPONENT'),
    (50, 57, 'DEFECTIVE_COMPONENT'),
    (50, 57, 'DEFECTIVE_COMPONENT')]}),
 ('DAMAGE TYPE ANOMALY REPORTED BY CONTROL ROOM CAUSE PCM BUFFER BATTERY INEFFICIENT/POOR PERFORMANCE CORRECTION PCM BATTERY REPLACEMENT',
  {'entities': [(32, 39, 'DEFECTIVE_COMPONENT'),
    (51, 54, 'DEFECTIVE_COMPONENT'),
    (51, 54, 'DEFECTIVE_COMPONENT')]}),
 ('COMPLAINT NIS FREEZES

In [19]:
# Function to remove duplicate and overlapping entities
def filter_entities(entities):
    seen = set()
    filtered_entities = []
    for start, end, label in entities:
        if (start, end) not in seen:
            filtered_entities.append((start, end, label))
            seen.add((start, end))
    return filtered_entities

# Apply the filtering function to TRAIN_DATA
filtered_train_data = []
for text, annotations in TRAIN_DATA:
    entities = annotations["entities"]
    # Sort entities by start index to check for overlaps
    entities = sorted(entities, key=lambda x: x[0])
    filtered_entities = []
    last_end = -1
    for start, end, label in entities:
        # Ensure no overlap by checking the last entity's end
        if start >= last_end:
            filtered_entities.append((start, end, label))
            last_end = end
    filtered_train_data.append((text, {"entities": filter_entities(filtered_entities)}))

# Now, filtered_train_data can be used for training


In [26]:
from spacy.training import Example
import random

# Check if "ner" component exists, if not, add it
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Add the labels to the NER pipeline
for _, annotations in filtered_train_data:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])

# Initialize the optimizer
optimizer = nlp.initialize()

# Training loop with filtered data
num_epochs = 10
for epoch in range(num_epochs):
    random.shuffle(filtered_train_data)
    losses = {"ner": 0}  # Explicitly track NER losses

    for text, annotations in filtered_train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)

    print(f"Epoch {epoch + 1}/{num_epochs}, NER Loss: {losses['ner']}")

# Save the model
output_dir = "./custom_ner_model"
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")


Epoch 1/10, NER Loss: 1998.1680638986184
Epoch 2/10, NER Loss: 1149.1606185359797
Epoch 3/10, NER Loss: 1007.4163049113686
Epoch 4/10, NER Loss: 959.1832345904916
Epoch 5/10, NER Loss: 933.0228674187343
Epoch 6/10, NER Loss: 881.8376371672871
Epoch 7/10, NER Loss: 840.2406733103259
Epoch 8/10, NER Loss: 765.2994801339414
Epoch 9/10, NER Loss: 746.6582468930318
Epoch 10/10, NER Loss: 694.4778208983848
Model saved to ./custom_ner_model


In [27]:


# Load the saved model
output_dir = "./custom_ner_model"
nlp = spacy.load(output_dir)
print("Model loaded successfully!")


Model loaded successfully!


In [28]:
# Sample text for testing
test_text = "The engine control unit showed an error message. Replaced 1 sensor and incurred a cost of 150 euros."

# Process the text with the model
doc = nlp(test_text)

# Print recognized entities
print("Entities in the test text:")
for ent in doc.ents:
    print(f"Text: {ent.text}, Label: {ent.label_}")


Entities in the test text:
Text: engine, Label: DEFECTIVE_COMPONENT
Text: control, Label: DEFECTIVE_COMPONENT
Text: error message, Label: TYPE_OF_FAILURE
Text: sensor, Label: DEFECTIVE_PART


In [30]:

from sklearn.metrics import precision_score, recall_score, f1_score


# Load the trained model
nlp = spacy.load("./custom_ner_model")

# Load test data
test_df = pd.read_csv("test_1_translated_cleaned.csv")

# Define a function to format true entities in spaCy’s format
def format_entities(row):
    text = row["DealerComment_English"]
    entities = []
    for col, label in zip(["Component", "Failure", "DefectivePart", "PartsReplaced", "Cost"],
                          ["DEFECTIVE_COMPONENT", "TYPE_OF_FAILURE", "DEFECTIVE_PART", "NUM_PARTS_REPLACED", "TOTAL_COST_INCURRED"]):
        if pd.notnull(row[col]):
            start = text.find(str(row[col]))
            if start != -1:
                end = start + len(str(row[col]))
                entities.append((start, end, label))
    return (text, {"entities": entities})

# Format the test data
test_data = [format_entities(row) for _, row in test_df.iterrows()]

# Evaluate model on test data
y_true = []
y_pred = []

for text, annotations in test_data:
    doc = nlp(text)
    true_entities = [(start, end, label) for start, end, label in annotations["entities"]]
    pred_entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

    y_true.extend(true_entities)
    y_pred.extend(pred_entities)

# Convert to binary format for scikit-learn metrics calculation
def convert_to_binary(entity_list, all_possible_labels):
    binary_matrix = []
    for entity in entity_list:
        entity_labels = set((start, end, label) for start, end, label in entity)
        binary_matrix.append([1 if (start, end, label) in entity_labels else 0 for label in all_possible_labels])
    return binary_matrix

all_possible_labels = list(set(y_true + y_pred))

# Calculate precision, recall, and F1-score
precision = precision_score(y_true_binary, y_pred_binary, average="weighted")
recall = recall_score(y_true_binary, y_pred_binary, average="weighted")
f1 = f1_score(y_true_binary, y_pred_binary, average="weighted")

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")


NameError: name 'y_true_binary' is not defined