In [20]:
import pandas as pd
import re

# Load the data_mapping CSV file
data_mapping_file_path = 'data_mapping.csv'  # Adjust this path to your actual file location
data_mapping = pd.read_csv(data_mapping_file_path, dtype=str)

# Initial replacement mapping
initial_replacements = {
    "MGE": "G/E",
    "GEN": "G/E",
    "GE": "G/E",
    "G_E": "G/E",
    "ME": "M/E",
    "M_E": "M/E",
    "S_G": "S/G",
    "T_C": "T/C",
    "TC": "T/C",
    "L_O": "L.O",
    "LO": "L.O",
    "F_O": "F.O",
    "FO": "F.O",
    "D_G": "D/G",
    "DG": "D/G"
}

# Second replacement mapping
second_replacements = {
    "_G/E": " G/E",
    "G/E_": "G/E ",
    "_M/E": " M/E",
    "M/E_": "M/E ",
    "_S/G": " S/G",
    "S/G_": "S/G ",
    "_T/C": " T/C",
    "T/C_": "T/C ",
    "_L.O": " L.O",
    "L.O_": "L.O ",
    "_F.O": " F.O",
    "F.O_": "F.O ",
    "_D/G": " D/G",
    "D/G_": "D/G ",
}

# Initialize counters
initial_replacement_counts = {key: 0 for key in initial_replacements}
second_replacement_counts = {key: 0 for key in second_replacements}

def replace_tokens_in_list(words, replacements, counts):
    for i, word in enumerate(words):
        if word in replacements:
            counts[word] += 1
            words[i] = replacements[word]
    return words

def replace_tokens_in_text(text, replacements, counts):
    if pd.isna(text):
        return text
    for old, new in replacements.items():
        # Count occurrences of the token
        count = text.count(old)
        counts[old] += count
        # Replace the token
        text = text.replace(old, new)
    return text

# Backup the original tag_description
data_mapping['org_tag_description'] = data_mapping['tag_description']

# Replace '(' and ')' with spaces
data_mapping['tag_description'] = data_mapping['tag_description'].apply(
    lambda x: x.replace('(', ' ').replace(')', ' ') if pd.notna(x) else x
)

# Function to tokenize and find tokens with numbers and keys/values
def find_and_replace_tokens(description, replacements):
    keys_and_values = list(replacements.keys()) + list(replacements.values())
    if pd.notna(description):
        tokens = description.split()
        new_tokens = []
        for token in tokens:
            replaced = False
            for key_or_value in keys_and_values:
                if (token.startswith(key_or_value) and re.search(r'\d', token[len(key_or_value):])) or \
                   (token.endswith(key_or_value) and re.search(r'\d', token[:len(token)-len(key_or_value)])):
                    # Split numbers and characters with spaces
                    token = re.sub(r'(\d+)', r' \1 ', token)
                    token = re.sub(r'\s+', ' ', token).strip()
                    replaced = True
                    # Print the token that matches the condition
                    print(f"Matched token: {token}")
            new_tokens.append(token if replaced else token)
        return ' '.join(new_tokens)
    return description

# Apply the function to 'tag_description' column to find and replace tokens
data_mapping['tag_description'] = data_mapping['tag_description'].apply(
    lambda x: find_and_replace_tokens(x, initial_replacements) if pd.notna(x) else x
)

# Apply initial replacements
data_mapping['tag_description'] = data_mapping['tag_description'].apply(
    lambda x: ' '.join(replace_tokens_in_list(x.split(), initial_replacements, initial_replacement_counts)) if pd.notna(x) else x
)

# Apply second replacements without re-tokenizing
data_mapping['tag_description'] = data_mapping['tag_description'].apply(
    lambda x: replace_tokens_in_text(x, second_replacements, second_replacement_counts)
)

# Display the counts of each replacement
print("\nInitial Replacement Counts:")
for token, count in initial_replacement_counts.items():
    print(f"{token} replaced {count} times")

print("\nSecond Replacement Counts:")
for token, count in second_replacement_counts.items():
    print(f"{token} replaced {count} times")

# Save the updated data_mapping to a new CSV file (optional)
output_file_path = 'data_mapping_normal.csv'
data_mapping.to_csv(output_file_path, index=False)

print(f"Updated data saved to {output_file_path}")


Matched token: DG 4
Matched token: DG 4
Matched token: T/C 1
Matched token: T/C 1
Matched token: DG 1
Matched token: DG 3
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: DG 2
Matched token: DG 2 B
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: DG 1
Matched token: DG 2
Matched token: DG 5
Matched token: DG 3
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: DG 4
Matched token: G/E 4
Matched token: G/E 4
Matched token: G/E 4
M