In [10]:
import re

In [12]:
from pykakasi import kakasi

In [4]:
def remove_second_parentheses(text):
    regex = re.compile(r'(（[^）]*）)(（[^）]*）)')
    return re.sub(regex, lambda match: match.group(1), text)


def transcribe_japanese(text):
    from pykakasi import kakasi

    kks = kakasi()
    kks.setMode("J", "H")  # Japanese to Hiragana
    kks.setMode("K", "H")  # Katakana to Hiragana
    conv = kks.getConverter()

    result = ""
    current_chunk = ""
    last_kanji_hiragana = ""
    is_kanji_or_katakana = False

    for char in text:
        if '\u4E00' <= char <= '\u9FFF':  # Kanji
            hiragana = conv.do(char)
            last_kanji_hiragana = hiragana  # Store the hiragana of the current kanji
            if not is_kanji_or_katakana:
                is_kanji_or_katakana = True
                current_chunk = ""
            current_chunk += hiragana
            result += char
        elif char == '々':  # Ideographic Iteration Mark
            if not is_kanji_or_katakana:
                is_kanji_or_katakana = True
                current_chunk = ""
            current_chunk += last_kanji_hiragana
            result += char
        elif '\u30A0' <= char <= '\u30FF':  # Katakana
            hiragana = conv.do(char)
            if not is_kanji_or_katakana:
                is_kanji_or_katakana = True
                current_chunk = ""
            current_chunk += hiragana
            result += char
        else:  # Hiragana or others
            if is_kanji_or_katakana:
                result += f"({current_chunk}){char}"
                is_kanji_or_katakana = False
            else:
                result += char

    if is_kanji_or_katakana:  # Remaining kanji or katakana chunk at the end
        result += f"({current_chunk})"

    return result


# Function to remove text inside parentheses
def remove_text_inside_parentheses(text):
    while '（' in text and '）' in text:
        start = text.find('（')
        end = text.find('）') + 1
        text = text[:start] + text[end:]
    return text

def clean_english(text):
    return text.replace(".", "·").replace("·ˈ", "ˈ").replace("·ˌ", "ˌ") #.replace(" ", "")

def clean_japanese(text):
    return text.replace(".", "").replace("·", "").replace("(", "（").replace(")", "）").replace(" ", "")

def remove_hiragana_inside_parentheses(text):
    # This regex matches hiragana or katakana inside parentheses and removes them, keeping the parentheses
    return re.sub(r'(?<=（)[ぁ-んァ-ンー-]+(?=）)', '', text)

def remove_content_inside_parentheses(text):
    # This regex matches anything inside parentheses and removes it, including the parentheses
    return re.sub(r'（[^）]*）', '', text)


In [6]:
text = "ズークツヴァング（ずーくつゔぁんぐ）"

In [17]:
re.sub(r'（[ぁ-んァ-ンー-]+）', '', "ズークツヴァング（ずーくつゔぁんぐ）")

'ズークツヴァング（ずーくつゔぁんぐ）'

In [7]:
transcribe_japanese(text)

  kks.setMode("J", "H")  # Japanese to Hiragana
  kks.setMode("K", "H")  # Katakana to Hiragana
  conv = kks.getConverter()
  hiragana = conv.do(char)


'ズークツヴァング(ず-くつゔぁんぐ)（ずー(-)くつゔぁんぐ）'

In [21]:
import re

text = "ズークツヴァング（ずーくつゔぁんぐ）"
modified_pattern = r'[（(][ぁ-んァ-ンー-]+[）)]'
result = re.sub(modified_pattern, '', text)
print(result)


ズークツヴァング（ずーくつゔぁんぐ）


In [19]:
import re

text = "ズークツヴァング（ずーくつゔぁんぐ）"
pattern = r'（[^）]+）'  # This will match any characters between full-width parentheses

result = re.sub(pattern, '', text)
print(result)


ズークツヴァング


In [20]:
remove_text_inside_parentheses(text)

'ズークツヴァング'

In [22]:
def remove_text_including_parentheses(text):
    while '（' in text and '）' in text:
        start = text.find('（')
        end = text.find('）') + 1
        text = text[:start] + text[end:]
    return text

# Example usage
text = "ズークツヴァング（ずーくつゔぁんぐ）"
result = remove_text_including_parentheses(text)
print(result)


ズークツヴァング


In [25]:
def remove_text_inside_parentheses(text):
    new_text = ""
    in_parentheses = False

    for char in text:
        if char == '（':
            in_parentheses = True
            new_text += char
        elif char == '）' and in_parentheses:
            in_parentheses = False
            new_text += char
        elif not in_parentheses:
            new_text += char

    return new_text

# Example usage
text = "ズークツヴァング（ずーくつゔぁんぐ）"
result = remove_text_inside_parentheses(text)
print(result)


ズークツヴァング（）


In [26]:
re.sub(r'[（ぁ-んァ-ンー-）]+', '', text)

'ヴゔ'

In [27]:
import re

text = "ズークツヴァング（ずーくつゔぁんぐ）"
pattern = r'[（ぁ-んァ-ンー）]+'

result = re.sub(pattern, '', text)
print(result)


ヴゔ


In [40]:
import re

def remove_hiragana_and_parentheses(text):
    """
    Remove all Hiragana, related letters, and parentheses from the text.

    Parameters:
    text (str): The input string from which Hiragana and parentheses will be removed.

    Returns:
    str: The text with Hiragana and parentheses removed.
    """
    # Regex pattern to remove Hiragana, related letters, and full-width parentheses
    pattern = r'[（）ぁ-ん゙-゚ー-]'

    return re.sub(pattern, '', text)

# Example usage
japanese_synonym = "ズークツヴァング（ずー-くつゔぁんぐ）"
result = remove_hiragana_and_parentheses(japanese_synonym)
print(result)


ズークツヴァングずーくつゔぁんぐ


In [30]:
"-" == "-"

True

In [64]:
def remove_hiragana_inside_parentheses(text):
    # This regex matches hiragana or katakana inside parentheses and removes them, keeping the parentheses
    # return re.sub(r'(?<=（)[ぁ-んァ-ンー-]+(?=）)', '', text)
    return re.sub(r'(?<=（)[ぁ-ゔ-゚ー\-]+(?=）)', '', text)
    # return remove_content_inside_parentheses(text)

In [65]:
remove_hiragana_inside_parentheses(japanese_synonym)

'ズークツヴァング（）'

In [95]:
def remove_japanese_letter_inside_parentheses(text):
    # This regex matches hiragana or katakana inside parentheses and removes them, keeping the parentheses
    return re.sub(r'(?<=（)[ぁ-ゔァ-ヴガ-ドㇰ-ㇿーヵヶヰヱ々〆〤\-（）]+(?=）)', '', text)


def remove_hiragana_including_parentheses(text):
    # Comprehensive regex pattern for Japanese characters
    
    return re.sub(r'（[ぁ-ゔー\-（）]+）', '', text)
    # return remove_text_including_parentheses(text)


def remove_hiragana_inside_parentheses(text):
    # This regex matches hiragana or katakana inside parentheses and removes them, keeping the parentheses

    return re.sub(r'(?<=（)[（）ぁ-ゔー\-]+(?=）)', '', text)
    # return remove_content_inside_parentheses(text)


def remove_hiragana_and_parentheses(text):
    """
    Remove all Hiragana, related letters, and parentheses from the text.

    Parameters:
    text (str): The input string from which Hiragana and parentheses will be removed.

    Returns:
    str: The text with Hiragana and parentheses removed.
    """
    # Regex pattern to remove Hiragana, related letters, and full-width parentheses
    pattern = r'[（）ぁ-ゔ-゚ー\-]'

    return re.sub(pattern, '', text)

In [96]:
text = "ズークツ（ず-くつゔぁんぐ）ヴァング（ず-くつゔぁんぐ）（ずー（）くつゔぁんぐ）"

In [97]:
remove_japanese_letter_inside_parentheses(text)

'ズークツ（）'

In [98]:
remove_hiragana_including_parentheses(text)

'ズークツヴァング'

In [99]:
remove_hiragana_inside_parentheses(text)

'ズークツ（）ヴァング（）'

In [100]:
remove_hiragana_and_parentheses(text)

'ズクツヴァング'