In [1]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA devices:", torch.cuda.device_count())
print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(0))  # Replace 0 with another index if needed


CUDA available: True
CUDA devices: 1
Current device: 0
Device name: NVIDIA GeForce RTX 4050 Laptop GPU


In [None]:
import spacy

# Load a SpaCy English model with NER capabilities
# Choose one:
# nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_trf")  # Larger model, better accuracy if GPU is available

# Assume you already have the 'normalize_raw_text' function and NON_WRITER_KEYWORDS from the previous solution
# as well as any other helper functions like `is_publisher_or_non_writer` and `normalize_name`.

def filter_with_ner(cleaned_text: str) -> str:
    """
    After the initial rule-based cleaning, this function uses SpaCy NER to 
    ensure that we only keep PERSON entities from the cleaned_text.
    """
    # The cleaned_text is something like "Justyce Kaseem Wright/Abdou Gambetta"
    if not cleaned_text:
        return ""
    
    # Split by '/' to process each candidate name
    candidates = [c.strip() for c in cleaned_text.split('/') if c.strip()]
    
    final_entities = []
    for candidate in candidates:
        # Use SpaCy to detect named entities
        doc = nlp(candidate)
        
        # If SpaCy doesn't identify the name as a PERSON, we may consider it non-writer (or we can fallback to heuristics)
        person_count = 0
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                person_count += 1
        
        # Heuristic: If at least one PERSON entity is found in candidate, we keep it.
        # This can be refined further, for example if candidate has multiple entities and you want all to be PERSON.
        if person_count > 0:
            final_entities.append(candidate)
    
    # Join remaining entities with '/'
    return "/".join(final_entities)


def advanced_normalize_raw_text(raw_text: str) -> str:
    """
    This function first applies the previous rule-based normalization, then refines it using NER.
    """
    # Step 1: Use your previous solution to get an initial cleaned/filtered text
    #initial_cleaned = normalize_raw_text(raw_text)  # from your previously implemented code
    
    # Step 2: Apply NER-based filtering
    final_cleaned = filter_with_ner(raw_text)#initial_cleaned)
    
    return final_cleaned

# ------------------ Example Testing ------------------

example1 = "<Unknown>/Wright, Justyce Kaseem"
example2 = "Pixouu/Abdou Gambetta/Copyright Control"
example3 = "Mike Hoyer/JERRY CHESNUT/SONY/ATV MUSIC PUBLISHING (UK) LIMITED"

print("Example 1:")
print("RAW:", example1)
print("After Rule-based:", normalize_raw_text(example1))
print("After NER-based:", advanced_normalize_raw_text(example1))
print()

print("Example 2:")
print("RAW:", example2)
print("After Rule-based:", normalize_raw_text(example2))
print("After NER-based:", advanced_normalize_raw_text(example2))
print()

print("Example 3:")
print("RAW:", example3)
print("After Rule-based:", normalize_raw_text(example3))
print("After NER-based:", advanced_normalize_raw_text(example3))
print()

In [None]:
import pandas as pd
import re

# Example filtering keywords that indicate non-writer entities:
NON_WRITER_KEYWORDS = [
    "unknown", "copyright", "control", "publishing", "music", "records",
    "limited", "ltd", "universal", "warner", "sony", "bmg", "ascap", "bmi", "socan"
]

def is_publisher_or_non_writer(entity: str) -> bool:
    # Check if the entity contains any of the known non-writer keywords
    # We do a case-insensitive check
    entity_lower = entity.lower()
    return any(keyword in entity_lower for keyword in NON_WRITER_KEYWORDS)

def normalize_name(name: str) -> str:
    # If the name is in format "Lastname, Firstname ..."
    # Convert it to "Firstname ... Lastname"
    name = name.strip()
    if "," in name:
        parts = [p.strip() for p in name.split(",")]
        if len(parts) == 2:
            lastname, firstname = parts
            # Convert to title case:
            firstname_tokens = [t.capitalize() for t in firstname.split()]
            lastname_tokens = [t.capitalize() for t in lastname.split()]
            # Reassemble as "Firstname ... Lastname"
            name = " ".join(firstname_tokens + lastname_tokens)
        else:
            # In case of multiple commas (rare)
            # E.g., "Wright, Justyce Kaseem" should just handle 1 comma.
            # If multiple commas appear, handle gracefully:
            # Assume last part is lastname, rest is firstname/middle:
            lastname = parts[0]
            firstname_middle = " ".join(parts[1:])
            firstname_middle_tokens = [t.capitalize() for t in firstname_middle.split()]
            lastname_tokens = [t.capitalize() for t in lastname.split()]
            name = " ".join(firstname_middle_tokens + lastname_tokens)
    else:
        # No comma, just title case the whole name
        name_tokens = [t.capitalize() for t in name.split()]
        name = " ".join(name_tokens)
    return name

def normalize_raw_text(raw_text: str) -> str:
    # Split by "/"
    entities = [e.strip() for e in raw_text.split("/") if e.strip()]

    # Filter out non-writer entities
    writer_candidates = [e for e in entities if not is_publisher_or_non_writer(e)]

    # Normalize each writer candidate name
    normalized_names = [normalize_name(w) for w in writer_candidates]

    # Join back with "/"
    normalized_text = "/".join(normalized_names)
    return normalized_text

# ------------------ Example Testing ------------------
example1 = "<Unknown>/Wright, Justyce Kaseem"
example2 = "Pixouu/Abdou Gambetta/Copyright Control"
example3 = "Mike Hoyer/JERRY CHESNUT/SONY/ATV MUSIC PUBLISHING (UK) LIMITED"

print("Example 1:")
print("RAW:", example1)
print("Normalized:", normalize_raw_text(example1))
print()

print("Example 2:")
print("RAW:", example2)
print("Normalized:", normalize_raw_text(example2))
print()

print("Example 3:")
print("RAW:", example3)
print("Normalized:", normalize_raw_text(example3))
print()

# ------------------ Processing a Dataset ------------------
# Suppose we have a CSV with columns: raw_comp_writers_text, CLEAN_TEXT
# We apply the normalization to raw_comp_writers_text and compare with CLEAN_TEXT

# df = pd.read_csv('path_to_dataset.csv')
# df['PREDICTED_CLEAN_TEXT'] = df['raw_comp_writers_text'].apply(normalize_raw_text)

# Evaluate or inspect a few rows
# print(df.head())





Example 1:
RAW: <Unknown>/Wright, Justyce Kaseem
Normalized: Justyce Kaseem Wright

Example 2:
RAW: Pixouu/Abdou Gambetta/Copyright Control
Normalized: Pixouu/Abdou Gambetta

Example 3:
RAW: Mike Hoyer/JERRY CHESNUT/SONY/ATV MUSIC PUBLISHING (UK) LIMITED
Normalized: Mike Hoyer/Jerry Chesnut



In [5]:
import spacy

# Load a SpaCy English model with NER capabilities
# Choose one:
# nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_trf")  # Larger model, better accuracy if GPU is available

# Assume you already have the 'normalize_raw_text' function and NON_WRITER_KEYWORDS from the previous solution
# as well as any other helper functions like `is_publisher_or_non_writer` and `normalize_name`.

def filter_with_ner(cleaned_text: str) -> str:
    """
    After the initial rule-based cleaning, this function uses SpaCy NER to 
    ensure that we only keep PERSON entities from the cleaned_text.
    """
    # The cleaned_text is something like "Justyce Kaseem Wright/Abdou Gambetta"
    if not cleaned_text:
        return ""
    
    # Split by '/' to process each candidate name
    candidates = [c.strip() for c in cleaned_text.split('/') if c.strip()]
    
    final_entities = []
    for candidate in candidates:
        # Use SpaCy to detect named entities
        doc = nlp(candidate)
        
        # If SpaCy doesn't identify the name as a PERSON, we may consider it non-writer (or we can fallback to heuristics)
        person_count = 0
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                person_count += 1
        
        # Heuristic: If at least one PERSON entity is found in candidate, we keep it.
        # This can be refined further, for example if candidate has multiple entities and you want all to be PERSON.
        if person_count > 0:
            final_entities.append(candidate)
    
    # Join remaining entities with '/'
    return "/".join(final_entities)


def advanced_normalize_raw_text(raw_text: str) -> str:
    """
    This function first applies the previous rule-based normalization, then refines it using NER.
    """
    # Step 1: Use your previous solution to get an initial cleaned/filtered text
    #initial_cleaned = normalize_raw_text(raw_text)  # from your previously implemented code
    
    # Step 2: Apply NER-based filtering
    final_cleaned = filter_with_ner(raw_text)#initial_cleaned)
    
    return final_cleaned

# ------------------ Example Testing ------------------

example1 = "<Unknown>/Wright, Justyce Kaseem"
example2 = "Pixouu/Abdou Gambetta/Copyright Control"
example3 = "Mike Hoyer/JERRY CHESNUT/SONY/ATV MUSIC PUBLISHING (UK) LIMITED"

print("Example 1:")
print("RAW:", example1)
print("After Rule-based:", normalize_raw_text(example1))
print("After NER-based:", advanced_normalize_raw_text(example1))
print()

print("Example 2:")
print("RAW:", example2)
print("After Rule-based:", normalize_raw_text(example2))
print("After NER-based:", advanced_normalize_raw_text(example2))
print()

print("Example 3:")
print("RAW:", example3)
print("After Rule-based:", normalize_raw_text(example3))
print("After NER-based:", advanced_normalize_raw_text(example3))
print()

  model.load_state_dict(torch.load(filelike, map_location=device))


Example 1:
RAW: <Unknown>/Wright, Justyce Kaseem
After Rule-based: Justyce Kaseem Wright
After NER-based: Wright, Justyce Kaseem

Example 2:
RAW: Pixouu/Abdou Gambetta/Copyright Control
After Rule-based: Pixouu/Abdou Gambetta
After NER-based: Pixouu/Abdou Gambetta

Example 3:
RAW: Mike Hoyer/JERRY CHESNUT/SONY/ATV MUSIC PUBLISHING (UK) LIMITED
After Rule-based: Mike Hoyer/Jerry Chesnut
After NER-based: Mike Hoyer/JERRY CHESNUT/SONY



In [None]:
import pandas as pd
from collections import Counter
import re


def split_keep_phrases_dynamic(text, separation='weak'):
    if not text or not isinstance(text, str):
        return []  # Return an empty list for invalid or empty input

    # Identify non-alphanumeric separators (escaping used for handling re symbols)
    if separation == 'strong':
        # More general separation using collection of separators
        separators = ''.join([re.escape(char) for char, _ in Counter(text).items() if not char.isalnum() and char.strip()])
    else:
        # Weaker separation based on the data
        separators_list = '/&,()'#()'
        separators = ''.join([re.escape(char) for char in separators_list if not char.isalnum() and char.strip()])

    # Debugging: Log separators to ensure they're valid
    if not separators:
        print(f"No valid separators found in: {text}")
        return [text.strip()]  # Return the original text as a single phrase

    try:
        # Build regex to split on these separators (excluding spaces)
        regex = f"[{separators}]+"
        
        # Split the text while keeping phrases together
        parts = re.split(regex, text)
        return [part.strip() for part in parts if part.strip()]
    except re.error as e:
        # Log regex error for debugging
        print(f"Regex error with separators '{separators}' for text: {text}. Error: {e}")
        return [text.strip()]  # Return the original text as a fallback



# Read the Excel file
file_path = "data/raw/normalization_assesment_dataset_10k.csv"  
df = pd.read_csv(file_path)

# Apply the splitting function to the 'raw_comp_writers_text' column
df['split_phrases'] = df['raw_comp_writers_text'].apply(lambda x: split_keep_phrases_dynamic(x,separation='strong') if pd.notnull(x) else [])

# Save the updated DataFrame to a new Excel file
output_file_path = "output_file.csv"  
df.to_csv(output_file_path, index=False)

print(f"Processed data has been saved to {output_file_path}")


No valid separators found in: Martin Hygård
No valid separators found in: Mendel Brikman
No valid separators found in: Alvin Lee
No valid separators found in: Mefi Morales
No valid separators found in: Christopher Franke
No valid separators found in: Shashank Katkar
No valid separators found in: Mike Kalambay
No valid separators found in: Rikard Sjöblom
No valid separators found in: Junior Francisco
No valid separators found in: PHUC TRUONG
No valid separators found in: Slatt Zy
No valid separators found in: Ivan Torrent
No valid separators found in: An Stepper
No valid separators found in: fo man
No valid separators found in: Traditional
No valid separators found in: Christian Michelle Felix Felix
No valid separators found in: Eric Andersen
No valid separators found in: Efrem Jamaar Blackwell
No valid separators found in: Ludwig van Beethoven
No valid separators found in: Nguyễn Nhất Huy
No valid separators found in: Hayden Buck Jones
No valid separators found in: Alexey Abrosimov
No 

## Metrics: 

First an quick and dirty calculation  of the average recall

In [None]:
import pandas as pd
import ast

# Function to split CLEAN_TEXT by '/' based on labeling convention
def split_clean_text(text):
    if not text or not isinstance(text, str):
        return []  # Handle empty or invalid inputs
    return [part.strip() for part in text.split('/') if part.strip()]

# Function to calculate accuracy
def calculate_recall(row, display=False):
    # Extract the reference and predicted lists
    reference = set(split_clean_text(row['CLEAN_TEXT']))
    phrases_list = ast.literal_eval(row['split_phrases'])  # Convert to list
    predicted = set(phrases_list)

    # Calculate metrics
    correct = len(reference & predicted)
    total = len(reference)
    acc = correct / total if total > 0 else 0
    if display:
        print('CLEAN_TEXT label:',reference,'Clean Up Prediction:', predicted,'Similarity:', acc)
    return acc

# Load the processed CSV
file_path = "output_file.csv"  
df = pd.read_csv(file_path)

# Small test ERASE
print(df.iloc[0])
calculate_recall(df.iloc[0])

# Apply the accuracy calculation for each row
df['recall'] = df.apply(calculate_recall, axis=1)

# Calculate overall accuracy
overall_accuracy = df['recall'].mean()

# Print the results
print(f"Overall Recall: {overall_accuracy * 100:.2f}%")

# Save results for review
output_with_accuracy = "output_with_accuracy.csv"
df.to_csv(output_with_accuracy, index=False)
print(f"Results with accuracy have been saved to {output_with_accuracy}")


raw_comp_writers_text              Jordan Riley/Adam Argyle/Martin Brammer
CLEAN_TEXT                         Jordan Riley/Adam Argyle/Martin Brammer
split_phrases            ['Jordan Riley', 'Adam Argyle', 'Martin Brammer']
Name: 0, dtype: object
Overall Recall: 73.97%
Results with accuracy have been saved to output_with_accuracy.csv


### More general evaluation: 

We compare the full sets of predicted and reference phrases, labeling items as 1 (present) or 0 (not present) for each set, and then computing standard classification metrics like accuracy, precision, recall, and F1-score.

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.metrics import edit_distance


# Function to split CLEAN_TEXT by '/' based on labeling convention
def split_clean_text(text):
    if not text or not isinstance(text, str):
        return []  # Handle empty or invalid inputs
    return [part.strip() for part in text.split('/') if part.strip()]

# Function to calculate BLEU score with special handling for single-token cases
def calculate_bleu(reference, predicted):
    smoothing_fn = SmoothingFunction().method1

    # If only one token, use a simple string comparison (only 2-gram+ in the implementation from nltk)
    if len(reference) == 1 and len(predicted) == 1:
        return 1.0 if reference[0] == predicted[0] else 0.0

    # Otherwise, compute BLEU score normally
    return sentence_bleu([reference], predicted, weights=(0.5, 0.5), smoothing_function=smoothing_fn)


# Function to calculate metrics per row
def calculate_metrics(row):
    # Extract the reference and predicted sets
    reference = set(split_clean_text(row['CLEAN_TEXT']))
    pred_phrases_list = ast.literal_eval(row['split_phrases'])  # Convert to list
    predicted = set(pred_phrases_list)    
    # Combine all unique phrases
    all_phrases = list(reference | predicted)
    
    # Create binary labels for reference and predicted
    reference_labels = [1 if phrase in reference else 0 for phrase in all_phrases]
    predicted_labels = [1 if phrase in predicted else 0 for phrase in all_phrases]
    
    # Compute edit distance and BLEU score
    edit_dist = edit_distance(' '.join(reference), ' '.join(predicted))
    bleu = calculate_bleu(list(reference), list(predicted))
    #print(reference,predicted, edit_dist, bleu)
    # Return the metrics
    return reference_labels, predicted_labels, all_phrases, edit_dist, bleu


# Load the processed CSV
file_path = "output_file.csv"  
df = pd.read_csv(file_path)

# Apply metrics calculation to each row
metrics = df.apply(calculate_metrics, axis=1)

# Flatten the results for global metrics computation
reference_labels = []
predicted_labels = []
all_phrases = []
edit_distances = []
bleu_scores = []

for ref, pred, phrases, edit_dist, bleu in metrics:
    reference_labels.extend(ref)
    predicted_labels.extend(pred)
    all_phrases.extend(phrases)
    edit_distances.append(edit_dist)
    bleu_scores.append(bleu)

    #for r,p,ph in zip(ref,pred, phrases):
    #    if(r==1 and p==0):
    #        print(r,p,ph,phrases)

# Calculate overall metrics
precision = precision_score(reference_labels, predicted_labels, zero_division=0)
recall = recall_score(reference_labels, predicted_labels, zero_division=0)
f1 = f1_score(reference_labels, predicted_labels, zero_division=0)

# Logging also the accuracy and CM for completion
accuracy = accuracy_score(reference_labels, predicted_labels)
cm = confusion_matrix(reference_labels, predicted_labels) # Expected 0 predicted and true negatives

# Calculate overall average metrics
avg_edit_distance = sum(edit_distances) / len(edit_distances)
avg_bleu_score = sum(bleu_scores) / len(bleu_scores)

# Print overall metrics
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")
print(f"Average Edit Distance: {avg_edit_distance:.2f}")
print(f"Average BLEU Score: {avg_bleu_score:.2f}")


# Save detailed results for debugging (optional)
output_with_details = "output_with_details.csv"
df['reference_labels'] = [ref for ref, _, _, _, _ in metrics]
df['predicted_labels'] = [pred for _, pred, _, _, _ in metrics]
df['all_phrases'] = [phrases for _, _, phrases, _, _ in metrics]
df['edit_distance'] = edit_distances
df['bleu_score'] = bleu_scores
df.to_csv(output_with_details, index=False)
print(f"Detailed results with edit distance and BLEU score have been saved to {output_with_details}")


Accuracy: 57.91%
Precision: 65.99%
Recall: 82.54%
F1 Score: 73.34%
Average Edit Distance: 7.24
Average BLEU Score: 0.70
Detailed results with edit distance and BLEU score have been saved to output_with_details.csv


Accuracy: 57.91%
Precision: 65.99%
Recall: 82.54%
F1 Score: 73.34%
CM: 
[[    0  7463]
 [ 3064 14482]]
Detailed results have been saved to output_with_details.csv

The current  model is biased toward predicting positives (since the)


Strong:
Accuracy: 57.91%
Precision: 65.99%
Recall: 82.54%
F1 Score: 73.34%
Average Edit Distance: 7.24
Average BLEU Score: 0.70


Weak:
Accuracy: 67.63%
Precision: 74.86%
Recall: 87.51%
F1 Score: 80.69%
Average Edit Distance: 6.37
Average BLEU Score: 0.76
Detailed results with edit distance and BLEU score have been saved to output_with_details.csv

Note: If we want to run things faster switch to 

### Basic LLM Implementation:

Initial experimentation

In [None]:
from groq import Groq

client = Groq(
    api_key="",
)

# We create the prompt includings
phrase= "DJ PALEMBANG/Copyright Control"
prompt = \
f"""
You are a useful linguist assistant. We want to perform text normalisation on names of songwriters. We will give you a string which includes names or nicknames 
in different formats and containing potentially unecessary words you ll need to clean up - or maybe just the name or nickname. 
Also some Raw text does not contain any useful information (e.g., unknown, weird initials, standardized words, etc)
Do not include any outputs of non-latin (e.g., cyrillic, chinese or arabian) characters in your output. If there are multiple names or nicknames separate with '/'
You ll need to return the correct names.
Some examples:
Example1:
RAW TEXT: <Unknown>/Wright, Justyce Kaseem
Normalized Text: Justyce Kaseem Wright
Example 2:
RAW TEXT: Pixouu/Abdou Gambetta/Copyright Control
Normalized Text: Pixouu/Abdou Gambetta
Example 3:
RAW TEXT: Mike Hoyer/JERRY CHESNUT/SONY/ATV MUSIC PUBLISHING (UK) LIMITED
Normalized Text: JERRY CHESNUT/Mike Hoyer
Example 4:
RAW TEXT: 신중현 (Shin Joong Hyun)
Normalized Text: Shin Joong Hyun
Example 5:
RAW TEXT: 신중현
Normalized Text:
Example 6:
RAW TEXT: UNKNOWN 
Normalized Text: 

Perform the same for the following cases:
RAW TEXT:{phrase}
RAW TEXT: Аршавир Мартиросян
RAW TEXT: Yuvanshankar Raja & Ramajogayya Sastry
RAW TEXT: jdiocsjiofouefan
"""
print(prompt)
###  Output (we can easily extract the normalized names): #########
#Here are the results of the text normalization:
#
#1. RAW TEXT: DJ PALEMBANG/Copyright Control
#Normalized Text: DJ PALEMBANG
#
#2. RAW TEXT: Аршавир Мартиросян
#Normalized Text: (since the text contains non-Latin characters, I'll leave the field empty)
#
#3. RAW TEXT: Yuvanshankar Raja & Ramajogayya Sastry
#Normalized Text: Yuvanshankar Raja/Ramajogayya Sastry
#
#4. RAW TEXT: jdiocsjiofouefan
#Normalized Text: (since the text appears to be nonsense and doesn't contain any useful information)


chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model="llama3-8b-8192",
)
print(chat_completion.choices[0].message.content)