In [None]:
import re

def handle_parentheses(query):
    def replace_match(m):
        inner = m.group(1)
        return f"(?:{process_query(inner)})"

    # Handle innermost parentheses first
    def process_innermost_parentheses(q):
        return re.sub(r"\(([^()]+)\)", lambda m: f"(?:{process_query(m.group(1))})", q)
    
    # Iteratively handle nested parentheses
    previous_query = None
    while previous_query != query:
        previous_query = query
        query = process_innermost_parentheses(query)
        # Handle cases with nested parentheses
        query = re.sub(
            r"\(([^()]*\([^()]*\)[^()]*)\)",
            lambda m: f"(?:{process_query(m.group(1))})",
            query,
        )
        query = process_innermost_parentheses(query)
        
    return query

# Example usage
query = """(((protect* OR care OR caring OR safe* OR shield* OR defend* OR guard* OR defens*) NEAR/3 (sun OR UV OR UVA OR UVB OR solar)) OR "anti-sun" OR "anti sun")"""

# Convert the query to regex pattern
pattern = process_query(query)
print(f"Generated regex pattern: {pattern}")

# Apply regex to rows
for row in rows:
    if re.search(pattern, row, re.IGNORECASE):
        print(f"Match found: {row}")
    else:
        print(f"No match: {row}")


In [1]:
import re

# Sample data (rows)
rows = [
    "I love chocolate and vanilla ice cream Beauty Balm.",
    "Blue and green are my favorite colors.",
    "Taste is subjective, but I prefer optimize over optimise.",
    "Mentioning @doveuk here, and #blue for hashtags.",
]

# Example query
query = """((BB OR "Beauty Balm" OR "Blemish Balm" OR "beauty-balm" OR "blemish-balm" NOT blue) NEAR/2 (cream OR creams)) AND Nice"""


def handle_or(query):
    return query.replace(" OR ", "|")


def handle_and(query):
    return query.replace(" AND ", ".*")


def handle_not(query):
    return re.sub(r"NOT\s+(\w+)", r"^(?!.*\b\1\b)", query)


def handle_exact_phrases(query):
    # Replace exact phrases inside double quotes with regex pattern
    return re.sub(r'"(.*?)"', lambda m: r"\b" + re.escape(m.group(1)) + r"\b", query)


def handle_near_x(query):
    near_pattern = re.compile(r'(\w+|"[^"]*")\s+NEAR/(\d+)\s+(\w+|"[^"]*")')
    while near_pattern.search(query):
        match = near_pattern.search(query)
        term1, distance, term2 = match.groups()
        term1 = term1.strip('"')
        term2 = term2.strip('"')
        pattern = r"\b{}\b(?:\W+\w+){{0,{}}}\W+\b{}\b".format(
            term1, int(distance), term2
        )
        query = near_pattern.sub(pattern, query, 1)
    return query


def handle_wildcards(query):
    return re.sub(r"(\w+)\*", r"\1.*", query)


def handle_question_marks(query):
    return re.sub(r"(\w+)\?", lambda m: f"{m.group(1)}.", query)


def handle_mentions(query):
    return re.sub(r"at_mention:\((.*?)\)", r"@\b\1\b", query)


def handle_hashtags(query):
    return re.sub(r"hashtag:\((.*?)\)", r"#\b\1\b", query)


def process_inner_parentheses(query):
    # Process innermost parentheses first
    def replace_match(m):
        inner = m.group(1)
        return f"(?:{process_query(inner)})"

    return re.sub(r"\(([^()]+)\)", replace_match, query)


def handle_parentheses(query):
    # Recursively handle parentheses
    previous_query = None
    while previous_query != query:
        previous_query = query
        query = process_inner_parentheses(query)
        # Handle cases where there may be nested parentheses
        query = re.sub(
            r"\(([^()]*\([^()]*\)[^()]*)\)",
            lambda m: f"(?:{process_query(m.group(1))})",
            query,
        )
    return query


def process_query(query):
    query = handle_or(query)
    query = handle_and(query)
    query = handle_not(query)
    query = handle_exact_phrases(query)
    query = handle_parentheses(query)
    query = handle_near_x(query)
    query = handle_wildcards(query)
    query = handle_question_marks(query)
    query = handle_mentions(query)
    query = handle_hashtags(query)
    return query


query = handle_or(query)
query = handle_and(query)
query = handle_not(query)
query = handle_exact_phrases(query)
print(query)


((BB|\bBeauty\ Balm\b|\bBlemish\ Balm\b|\bbeauty\-balm\b|\bblemish\-balm\b ^(?!.blu)) NEAR/2 (cream|creams)).*Nice


In [15]:
import re

def handle_or(query):
    return query.replace(" OR ", "|")

def handle_and(query):
    return query.replace(" AND ", ".*")

def handle_not(query):
    return re.sub(r"NOT\s+(\w+)", r"^(?!.*\b\1\b)", query)

def handle_exact_phrases(query):
    return re.sub(r'"(.*?)"', lambda m: r"\b" + re.escape(m.group(1)) + r"\b", query)


def handle_near_x(query):
    # Define a regex pattern to match the NEAR/x operator in the query
    near_pattern = re.compile(r'\((\w+|"[^"]*")\s+NEAR/(\d+)\s+(\w+|"[^"]*")\)')

    # Iterate over all NEAR/x matches in the query
    while near_pattern.search(query):
        match = near_pattern.search(query)
        term1, distance, term2 = match.groups()

        # Strip quotes from the terms
        term1 = term1.strip('"')
        term2 = term2.strip('"')

        # Create a regex pattern for the NEAR/x operator
        pattern = r'\b{}\b(?:\W+\w+){{0,{}}}\W+\b{}\b'.format(
            re.escape(term1), int(distance), re.escape(term2)
        )

        # Replace the NEAR/x pattern in the query with the generated regex pattern
        query = near_pattern.sub(pattern, query, 1)

    return query

def handle_wildcards(query):
    return re.sub(r"(\w+)\*", r"\1.*", query)

def handle_question_marks(query):
    return re.sub(r"(\w+)\?", lambda m: f"{m.group(1)}.", query)

def handle_mentions(query):
    return re.sub(r"at_mention:\((.*?)\)", r"@\b\1\b", query)

def handle_hashtags(query):
    return re.sub(r"hashtag:\((.*?)\)", r"#\b\1\b", query)

def handle_parentheses(query):
    def process_subquery(subquery):
        # Apply transformations directly here
        subquery = handle_or(subquery)
        subquery = handle_and(subquery)
        subquery = handle_not(subquery)
        subquery = handle_exact_phrases(subquery)
        subquery = handle_near_x(subquery)
        subquery = handle_wildcards(subquery)
        subquery = handle_question_marks(subquery)
        subquery = handle_mentions(subquery)
        subquery = handle_hashtags(subquery)
        return subquery

    stack = []
    current_query = ""

    i = 0
    while i < len(query):
        if query[i] == "(":
            # Push current query and position onto the stack
            stack.append((current_query, i))
            current_query = ""
        elif query[i] == ")":
            # Pop from stack and handle the content inside parentheses
            last_query, start_pos = stack.pop()
            subquery = query[start_pos + 1 : i]
            # Process the subquery and add it to the current query
            processed_subquery = process_subquery(subquery)
            current_query = last_query + f"(?:{processed_subquery})"
        else:
            current_query += query[i]
        i += 1

    return current_query

def process_query(query):
    query = handle_parentheses(query)
    query = handle_or(query)
    query = handle_and(query)
    query = handle_not(query)
    query = handle_exact_phrases(query)
    query = handle_near_x(query)
    query = handle_wildcards(query)
    query = handle_question_marks(query)
    query = handle_mentions(query)
    query = handle_hashtags(query)
    return query

# Example usage
query = '''(BB OR "Beauty Balm" OR "Blemish Balm" OR "beauty-balm" OR "blemish-balm") NEAR/2 (cream OR creams)'''

# Convert query to regex pattern
pattern = process_query(query)
print(f"Generated regex pattern: {pattern}")

# Sample data (rows)
rows = [
    "I love chocolate and vanilla ice cream Beauty Balm.",
    "Blue and green are my favorite colors.",
    "Taste is subjective, but I prefer optimize over optimise.",
    "Mentioning @doveuk here, and #blue for hashtags.",
]

# Apply regex to rows
for row in rows:
    if re.search(pattern, row, re.IGNORECASE):  # Added re.IGNORECASE for case-insensitive matching
        print(f"Match found: {row}")
    else:
        print(f"No match: {row}")


Generated regex pattern: (?:BB|\bBeauty\ Balm\b|\bBlemish\ Balm\b|\bbeauty\-balm\b|\bblemish\-balm\b) NEAR/2 (?:cream|creams)
No match: I love chocolate and vanilla ice cream Beauty Balm.
No match: Blue and green are my favorite colors.
No match: Taste is subjective, but I prefer optimize over optimise.
No match: Mentioning @doveuk here, and #blue for hashtags.


In [4]:
import re

def handle_or(query):
    return query.replace(" OR ", "|")

def handle_and(query):
    return query.replace(" AND ", ".*")

def handle_not(query):
    return re.sub(r"NOT\s+(\w+)", r"^(?!.*\b\1\b)", query)

def handle_exact_phrases(query):
    return re.sub(r'"(.*?)"', lambda m: r"\b" + re.escape(m.group(1)) + r"\b", query)

def handle_near_x(query):
    def replace_near(match):
        term1, distance, term2 = match.groups()
        term1 = term1.strip('"')
        term2 = term2.strip('"')
        return r"\b{}\b(?:\W+\w+){{0,{}}}\W+\b{}\b".format(term1, int(distance), term2)
    
    near_pattern = re.compile(r'(\w+|"[^"]*")\s+NEAR/(\d+)\s+(\w+|"[^"]*")')
    return near_pattern.sub(replace_near, query)

def handle_wildcards(query):
    return re.sub(r"(\w+)\*", r"\1.*", query)

def handle_question_marks(query):
    return re.sub(r"(\w+)\?", lambda m: f"{m.group(1)}.", query)

def handle_mentions(query):
    return re.sub(r"at_mention:\((.*?)\)", r"@\b\1\b", query)

def handle_hashtags(query):
    return re.sub(r"hashtag:\((.*?)\)", r"#\b\1\b", query)

def process_subquery(subquery):
    # Apply transformations directly here
    subquery = handle_or(subquery)
    subquery = handle_and(subquery)
    subquery = handle_not(subquery)
    subquery = handle_exact_phrases(subquery)
    subquery = handle_near_x(subquery)
    subquery = handle_wildcards(subquery)
    subquery = handle_question_marks(subquery)
    subquery = handle_mentions(subquery)
    subquery = handle_hashtags(subquery)
    return subquery

def handle_parentheses(query):
    def replace_match(m):
        inner = m.group(1)
        return f"(?:{process_subquery(inner)})"

    # Process innermost parentheses first
    query = re.sub(r'\(([^()]+)\)', replace_match, query)

    # Handle nested parentheses
    while '(' in query:
        query = re.sub(r'\(([^()]*\([^()]*\)[^()]*)\)', replace_match, query)
    
    # Final processing of any remaining unprocessed content
    return process_subquery(query)

def process_query(query):
    query = handle_parentheses(query)
    query = handle_or(query)
    query = handle_and(query)
    query = handle_not(query)
    query = handle_exact_phrases(query)
    query = handle_near_x(query)
    query = handle_wildcards(query)
    query = handle_question_marks(query)
    query = handle_mentions(query)
    query = handle_hashtags(query)
    return query

# Example usage
query = """(((protect* OR care OR caring OR safe* OR shield* OR defend* OR guard* OR defens*) NEAR/3 (sun OR UV OR UVA OR UVB OR solar)) OR "anti-sun" OR "anti sun")"""

# Convert query to regex pattern
pattern = process_query(query)
print(f"Generated regex pattern: {pattern}")

# Sample data (rows)
rows = [
    "I love chocolate and vanilla ice cream Beauty Balm.",
    "Blue and green are my favorite colors.",
    "Taste is subjective, but I prefer optimize over optimise.",
    "Mentioning @doveuk here, and #blue for hashtags.",
]

# Apply regex to rows
for row in rows:
    if re.search(pattern, row, re.IGNORECASE):  # Added re.IGNORECASE for case-insensitive matching
        print(f"Match found: {row}")
    else:
        print(f"No match: {row}")


KeyboardInterrupt: 

In [6]:
import re

def handle_near_x(query):
    """
    Converts NEAR/x patterns in the query into regex.
    :param query: The query containing NEAR/x patterns
    :return: The query with NEAR/x patterns converted to regex
    """
    def convert_near_pattern(lhs, distance, rhs):
        """
        Converts a NEAR/x pattern to a regex pattern.
        :param lhs: Left-hand side of the NEAR/x pattern
        :param distance: Distance for the NEAR operator
        :param rhs: Right-hand side of the NEAR/x pattern
        :return: The regex pattern for the NEAR/x pattern
        """
        # Convert wildcards and question marks to regex
        lhs_regex = re.sub(r'\*', '.*', lhs)
        rhs_regex = re.sub(r'\*', '.*', rhs)
        lhs_regex = re.sub(r'\?', '.', lhs_regex)
        rhs_regex = re.sub(r'\?', '.', rhs_regex)

        # Construct regex pattern for NEAR/x
        return r'\b{}\b(?:\W+\w+){{0,{}}}\W+\b{}\b'.format(lhs_regex, distance, rhs_regex)

    # Find all NEAR/x patterns
    near_pattern = re.compile(r'\(([^()]+)\)\s+NEAR/(\d+)\s+\(([^()]+)\)')
    matches = near_pattern.findall(query)

    for lhs, distance, rhs in matches:
        # Convert NEAR/x pattern to regex
        regex_pattern = convert_near_pattern(lhs, int(distance), rhs)
        # Replace NEAR/x pattern in the query with the regex pattern
        query = re.sub(r'\(\s*{}\s+NEAR/{}\s+{}\s*\)'.format(re.escape(lhs), distance, re.escape(rhs)), 
                       regex_pattern, query, flags=re.UNICODE)

    return query

# Example usage
query = """(protect* OR care OR caring OR safe* OR shield* OR defend* OR guard* OR defens*) NEAR/3 (sun OR UV OR UVA OR UVB OR solar)"""
converted_query = handle_near_x(query)
print(f"Converted query: {converted_query}")


error: bad escape \W at position 91

In [7]:
import re

def handle_near_x(query):
    """
    Converts NEAR/x patterns in the query into regex.
    :param query: The query containing NEAR/x patterns
    :return: The query with NEAR/x patterns converted to regex
    """
    def convert_near_pattern(lhs, distance, rhs):
        """
        Converts a NEAR/x pattern to a regex pattern.
        :param lhs: Left-hand side of the NEAR/x pattern
        :param distance: Distance for the NEAR operator
        :param rhs: Right-hand side of the NEAR/x pattern
        :return: The regex pattern for the NEAR/x pattern
        """
        # Convert wildcards and question marks to regex
        lhs_regex = re.sub(r'\*', '.*', lhs)
        rhs_regex = re.sub(r'\*', '.*', rhs)
        lhs_regex = re.sub(r'\?', '.', lhs_regex)
        rhs_regex = re.sub(r'\?', '.', rhs_regex)

        # Construct regex pattern for NEAR/x
        return r'\b{}\b(?:\W+\w+){{0,{}}}\W+\b{}\b'.format(lhs_regex, distance, rhs_regex)

    # Find all NEAR/x patterns
    near_pattern = re.compile(r'\(([^()]+)\)\s+NEAR/(\d+)\s+\(([^()]+)\)')
    matches = near_pattern.findall(query)

    for lhs, distance, rhs in matches:
        # Convert NEAR/x pattern to regex
        regex_pattern = convert_near_pattern(lhs, int(distance), rhs)
        # Escape special regex characters in lhs and rhs
        lhs_escaped = re.escape(lhs)
        rhs_escaped = re.escape(rhs)
        # Replace NEAR/x pattern in the query with the regex pattern
        query = re.sub(r'\(\s*{}\s+NEAR/{}\s+{}\s*\)'.format(lhs_escaped, distance, rhs_escaped), 
                       regex_pattern, query, flags=re.UNICODE)

    return query

# Example usage
query = """(protect* OR care OR caring OR safe* OR shield* OR defend* OR guard* OR defens*) NEAR/3 (sun OR UV OR UVA OR UVB OR solar)"""
converted_query = handle_near_x(query)
print(f"Converted query: {converted_query}")


error: bad escape \W at position 91

In [9]:
import re

def convert_near_pattern(lhs, distance, rhs):
    lhs_regex = re.sub(r'\*', '.*', lhs)
    rhs_regex = re.sub(r'\*', '.*', rhs)
    lhs_regex = re.sub(r'\?', '.', lhs_regex)
    rhs_regex = re.sub(r'\?', '.', rhs_regex)
    
    return r'\b{}\b(?:\W+\w+){{0,{}}}\W+\b{}\b|\b{}\b(?:\W+\w+){{0,{}}}\W+\b{}\b'.format(
        lhs_regex, distance, rhs_regex, rhs_regex, distance, lhs_regex
    )

# Example usage
query = "blue NEAR/2 green"
lhs, distance, rhs = "blue", 2, "green"
regex_pattern = convert_near_pattern(lhs, distance, rhs)
print(f"Regex Pattern: {regex_pattern}")


Regex Pattern: \bblue\b(?:\W+\w+){0,2}\W+\bgreen\b|\bgreen\b(?:\W+\w+){0,2}\W+\bblue\b


In [11]:
near_pattern = re.compile(
    r'\(\s*(.*?)\s+NEAR/(\d+)\s+(.*?)\s*\)',
    re.IGNORECASE
)

In [13]:
import re

def has_near_operator(query):
    # Regex pattern to find NEAR/x operator
    near_pattern = re.compile(r'NEAR/\d+', re.IGNORECASE)
    
    # Search for the pattern in the query
    if near_pattern.search(query):
        return True
    return False

# Example usage
query1 = '''(BB OR "Beauty Balm" OR "Blemish Balm" OR "beauty-balm" OR "blemish-balm") NEAR/2 (cream OR creams)'''
query2 = "This query has no near operator"

print(has_near_operator(query1))  # Output: True
print(has_near_operator(query2))  # Output: False


True
False


In [16]:
import re

def handle_or(query):
    return query.replace(" OR ", "|")

def handle_and(query):
    return query.replace(" AND ", ".*")

def handle_not(query):
    return re.sub(r"NOT\s+(\w+)", r"^(?!.*\b\1\b)", query)

def handle_exact_phrases(query):
    return re.sub(r'"(.*?)"', lambda m: r"\b" + re.escape(m.group(1)) + r"\b", query)

def handle_near_x(query):
    # Define a regex pattern to match the NEAR/x operator in the query
    near_pattern = re.compile(r'\((\w+|"[^"]*")\s+NEAR/(\d+)\s+(\w+|"[^"]*")\)')

    while near_pattern.search(query):
        match = near_pattern.search(query)
        term1, distance, term2 = match.groups()
        term1 = term1.strip('"')
        term2 = term2.strip('"')
        # Create a regex pattern for the NEAR/x operator
        pattern = r'\b{}\b(?:\W+\w+){{0,{}}}\W+\b{}\b'.format(
            re.escape(term1), int(distance), re.escape(term2)
        )
        # Replace the NEAR/x pattern in the query with the generated regex pattern
        query = near_pattern.sub(pattern, query, 1)

    return query

def handle_wildcards(query):
    return re.sub(r"(\w+)\*", r"\1.*", query)

def handle_question_marks(query):
    return re.sub(r"(\w+)\?", lambda m: f"{m.group(1)}.", query)

def handle_mentions(query):
    return re.sub(r"at_mention:\((.*?)\)", r"@\b\1\b", query)

def handle_hashtags(query):
    return re.sub(r"hashtag:\((.*?)\)", r"#\b\1\b", query)

def handle_parentheses(query):
    def process_subquery(subquery):
        subquery = handle_or(subquery)
        subquery = handle_and(subquery)
        subquery = handle_not(subquery)
        subquery = handle_exact_phrases(subquery)
        subquery = handle_near_x(subquery)
        subquery = handle_wildcards(subquery)
        subquery = handle_question_marks(subquery)
        subquery = handle_mentions(subquery)
        subquery = handle_hashtags(subquery)
        return subquery

    stack = []
    current_query = ""
    i = 0

    while i < len(query):
        if query[i] == "(":
            stack.append((current_query, i))
            current_query = ""
        elif query[i] == ")":
            last_query, start_pos = stack.pop()
            subquery = query[start_pos + 1 : i]
            processed_subquery = process_subquery(subquery)
            current_query = last_query + f"(?:{processed_subquery})"
        else:
            current_query += query[i]
        i += 1

    return current_query

def process_query(query):
    query = handle_parentheses(query)
    query = handle_or(query)
    query = handle_and(query)
    query = handle_not(query)
    query = handle_exact_phrases(query)
    query = handle_near_x(query)
    query = handle_wildcards(query)
    query = handle_question_marks(query)
    query = handle_mentions(query)
    query = handle_hashtags(query)
    return query

# Example usage
query = '''(BB OR "Beauty Balm" OR "Blemish Balm" OR "beauty-balm" OR "blemish-balm") NEAR/2 (cream OR creams)'''

# Convert query to regex pattern
pattern = process_query(query)
print(f"Generated regex pattern: {pattern}")

# Sample data (rows)
rows = [
    "I love chocolate and vanilla ice cream Beauty Balm.",
    "Blue and green are my favorite colors.",
    "Taste is subjective, but I prefer optimize over optimise.",
    "Mentioning @doveuk here, and #blue for hashtags.",
]

# Apply regex to rows
for row in rows:
    if re.search(pattern, row, re.IGNORECASE):  # Added re.IGNORECASE for case-insensitive matching
        print(f"Match found: {row}")
    else:
        print(f"No match: {row}")


Generated regex pattern: (?:BB|\bBeauty\ Balm\b|\bBlemish\ Balm\b|\bbeauty\-balm\b|\bblemish\-balm\b) NEAR/2 (?:cream|creams)
No match: I love chocolate and vanilla ice cream Beauty Balm.
No match: Blue and green are my favorite colors.
No match: Taste is subjective, but I prefer optimize over optimise.
No match: Mentioning @doveuk here, and #blue for hashtags.


In [18]:
import re

def handle_or(query):
    return query.replace(" OR ", "|")

def handle_and(query):
    return query.replace(" AND ", ".*")

def handle_not(query):
    return re.sub(r"NOT\s+(\w+)", r"^(?!.*\b\1\b)", query)

def handle_exact_phrases(query):
    return re.sub(r'"(.*?)"', lambda m: r"\b" + re.escape(m.group(1)) + r"\b", query)

def handle_near_x(query):
    near_pattern = re.compile(r'\((\w+|"[^"]*")\s+NEAR/(\d+)\s+(\w+|"[^"]*")\)')
    
    def convert_near_pattern(lhs, distance, rhs):
        lhs = re.escape(lhs.strip('"'))
        rhs = re.escape(rhs.strip('"'))
        return r'\b{}\b(?:\W+\w+){{0,{}}}\W+\b{}\b'.format(lhs, distance, rhs)
    
    while near_pattern.search(query):
        match = near_pattern.search(query)
        term1, distance, term2 = match.groups()
        pattern = convert_near_pattern(term1, int(distance), term2)
        query = near_pattern.sub(pattern, query, 1)
    
    return query

def handle_wildcards(query):
    return re.sub(r"(\w+)\*", r"\1.*", query)

def handle_question_marks(query):
    return re.sub(r"(\w+)\?", lambda m: f"{m.group(1)}.", query)

def handle_mentions(query):
    return re.sub(r"at_mention:\((.*?)\)", r"@\b\1\b", query)

def handle_hashtags(query):
    return re.sub(r"hashtag:\((.*?)\)", r"#\b\1\b", query)

def handle_parentheses(query):
    def process_subquery(subquery):
        subquery = handle_or(subquery)
        subquery = handle_and(subquery)
        subquery = handle_not(subquery)
        subquery = handle_exact_phrases(subquery)
        subquery = handle_near_x(subquery)
        subquery = handle_wildcards(subquery)
        subquery = handle_question_marks(subquery)
        subquery = handle_mentions(subquery)
        subquery = handle_hashtags(subquery)
        return subquery

    stack = []
    current_query = ""
    i = 0

    while i < len(query):
        if query[i] == "(":
            stack.append((current_query, i))
            current_query = ""
        elif query[i] == ")":
            last_query, start_pos = stack.pop()
            subquery = query[start_pos + 1 : i]
            processed_subquery = process_subquery(subquery)
            current_query = last_query + f"(?:{processed_subquery})"
        else:
            current_query += query[i]
        i += 1

    return current_query

def process_query(query):
    query = handle_parentheses(query)
    query = handle_or(query)
    query = handle_and(query)
    query = handle_not(query)
    query = handle_exact_phrases(query)
    query = handle_near_x(query)
    query = handle_wildcards(query)
    query = handle_question_marks(query)
    query = handle_mentions(query)
    query = handle_hashtags(query)
    return query

# Example usage
query = '''(BB OR "Beauty Balm" OR "Blemish Balm" OR "beauty-balm" OR "blemish-balm") NEAR/2 (cream OR creams)'''

# Convert query to regex pattern
pattern = process_query(query)
print(f"Generated regex pattern: {pattern}")

# Sample data (rows)
rows = [
    "I love chocolate and vanilla ice cream Beauty Balm.",
    "Blue and green are my favorite colors.",
    "Taste is subjective, but I prefer optimize over optimise.",
    "Mentioning @doveuk here, and #blue for hashtags.",
]

# Apply regex to rows
for row in rows:
    if re.search(pattern, row, re.IGNORECASE):  # Added re.IGNORECASE for case-insensitive matching
        print(f"Match found: {row}")
    else:
        print(f"No match: {row}")


Generated regex pattern: (?:BB|\bBeauty\ Balm\b|\bBlemish\ Balm\b|\bbeauty\-balm\b|\bblemish\-balm\b) NEAR/2 (?:cream|creams)
No match: I love chocolate and vanilla ice cream Beauty Balm.
No match: Blue and green are my favorite colors.
No match: Taste is subjective, but I prefer optimize over optimise.
No match: Mentioning @doveuk here, and #blue for hashtags.


In [19]:
import re

# Sample data (rows)
rows = [
    "I love chocolate and vanilla ice cream Beauty Balm.",
    "Blue and green are my favorite colors.",
    "Taste is subjective, but I prefer optimize over optimise.",
    "Mentioning @doveuk here, and #blue for hashtags.",
]

# Example query
query = """((BB OR "Beauty Balm" OR "Blemish Balm" OR "beauty-balm" OR "blemish-balm") NEAR/2 (cream OR creams))"""

def convert_to_regex(query):
    # Replace OR, AND, NOT with regex equivalents
    query = query.replace(" OR ", "|").replace(" AND ", ".*").replace("NOT ", "^(?!.*")
    
    # Handle exact phrases (those inside double quotes)
    query = re.sub(r'"(.*?)"', r'\b\1\b', query)
    
    # Handle parentheses
    query = re.sub(r'\((.*?)\)', lambda m: f'(?:{convert_to_regex(m.group(1))})', query)
    
    return query

def handle_near_x(query):
    # Match patterns like `"phrase1" NEAR/x "phrase2"`
    near_pattern = re.compile(r'(".*?"|\w+)\s+NEAR/(\d+)\s+(".*?"|\w+)')
    while near_pattern.search(query):
        match = near_pattern.search(query)
        phrase1, distance, phrase2 = match.groups()
        phrase1 = phrase1.strip('"')
        phrase2 = phrase2.strip('"')
        pattern = r'\b{}\b(?:\W+\w+){{0,{}}}\W+\b{}\b'.format(phrase1, int(distance), phrase2)
        query = near_pattern.sub(pattern, query, 1)
    return query

def handle_wildcards_and_questions(query):
    query = re.sub(r'(\w+)\*', r'\1.*', query)  # Handle wildcards
    query = re.sub(r'(\w+)\?', lambda m: f'{m.group(1)}[a-zA-Z]', query)  # Handle question marks
    return query

def handle_mentions_hashtags(query):
    query = re.sub(r'at_mention:\((.*?)\)', r'@\b\1\b', query)
    query = re.sub(r'hashtag:\((.*?)\)', r'#\b\1\b', query)
    return query

def process_query(query):
    query = convert_to_regex(query)
    query = handle_near_x(query)
    query = handle_wildcards_and_questions(query)
    query = handle_mentions_hashtags(query)
    return query

# Apply regex to rows
pattern = process_query(query)
print(f"Generated regex pattern: {pattern}\n")

for row in rows:
    if re.search(pattern, row, re.IGNORECASE):  # Added re.IGNORECASE for case-insensitive matching
        print(f"Match found: {row}")
    else:
        print(f"No match: {row}")


Generated regex pattern: (?:(BBBeauty BalBlemish Balbeauty-balblemish-bal) NEAR/2 (?:cream|creams))

No match: I love chocolate and vanilla ice cream Beauty Balm.
No match: Blue and green are my favorite colors.
No match: Taste is subjective, but I prefer optimize over optimise.
No match: Mentioning @doveuk here, and #blue for hashtags.


In [2]:
import re

# Sample data (rows)
rows = [
    "I love chocolate and vanilla ice cream Beauty Balm.",
    "Blue and green are my favorite colors.",
    "Taste is subjective, but I prefer optimize over optimise.",
    "Mentioning @doveuk here, and #blue for hashtags.",
]

# Example query
query = """(BB OR "Beauty Balm" OR "Blemish Balm" OR "beauty-balm" OR "blemish-balm") NEAR/2 (cream OR creams)"""

def handle_or(query):
    return query.replace(" OR ", "|")

def handle_and(query):
    return query.replace(" AND ", ".*")

def handle_not(query):
    return re.sub(r'NOT\s+(\w+)', r'^(?!.*\b\1\b)', query)

def handle_exact_phrases(query):
    return re.sub(r'"(.*?)"', r'\b\1\b', query)

def handle_parentheses(query):
    # Recursively handle nested queries inside parentheses
    while '(' in query:
        query = re.sub(r'\(([^()]+)\)', lambda m: f'(?:{process_query(m.group(1))})', query)
    return query

def handle_near_x(query):
    near_pattern = re.compile(r'(\w+|"[^"]*")\s+NEAR/(\d+)\s+(\w+|"[^"]*")')
    while near_pattern.search(query):
        match = near_pattern.search(query)
        term1, distance, term2 = match.groups()
        term1 = term1.strip('"')
        term2 = term2.strip('"')
        pattern = r'\b{}\b(?:\W+\w+){{0,{}}}\W+\b{}\b'.format(term1, int(distance), term2)
        query = near_pattern.sub(pattern, query, 1)
    return query

def handle_wildcards(query):
    return re.sub(r'(\w+)\*', r'\1.*', query)

def handle_question_marks(query):
    return re.sub(r'(\w+)\?', lambda m: f'{m.group(1)}.', query)

def handle_mentions(query):
    return re.sub(r'at_mention:\((.*?)\)', r'@\b\1\b', query)

def handle_hashtags(query):
    return re.sub(r'hashtag:\((.*?)\)', r'#\b\1\b', query)

def process_query(query):
    query = handle_or(query)
    print(query)
    query = handle_and(query)
    print(query)
    query = handle_not(query)
    print(query)
    query = handle_exact_phrases(query)
    print(query)
    #query = handle_parentheses(query)
    query = handle_near_x(query)
    print(query)
    query = handle_wildcards(query)
    print(query)
    query = handle_question_marks(query)
    print(query)
    query = handle_mentions(query)
    print(query)
    query = handle_hashtags(query)
    print(query)
    return query

# Apply regex to rows
pattern = process_query(query)
print(f"Generated regex pattern: {pattern}\n")

for row in rows:
    if re.search(pattern, row, re.IGNORECASE):  # Added re.IGNORECASE for case-insensitive matching
        print(f"Match found: {row}")
    else:
        print(f"No match: {row}")


(BB|"Beauty Balm"|"Blemish Balm"|"beauty-balm"|"blemish-balm") NEAR/2 (cream|creams)
(BB|"Beauty Balm"|"Blemish Balm"|"beauty-balm"|"blemish-balm") NEAR/2 (cream|creams)
(BB|"Beauty Balm"|"Blemish Balm"|"beauty-balm"|"blemish-balm") NEAR/2 (cream|creams)
(BBBeauty BalBlemish Balbeauty-balblemish-bal) NEAR/2 (cream|creams)
(BBBeauty BalBlemish Balbeauty-balblemish-bal) NEAR/2 (cream|creams)
(BBBeauty BalBlemish Balbeauty-balblemish-bal) NEAR/2 (cream|creams)
(BBBeauty BalBlemish Balbeauty-balblemish-bal) NEAR/2 (cream|creams)
(BBBeauty BalBlemish Balbeauty-balblemish-bal) NEAR/2 (cream|creams)
(BBBeauty BalBlemish Balbeauty-balblemish-bal) NEAR/2 (cream|creams)
Generated regex pattern: (BBBeauty BalBlemish Balbeauty-balblemish-bal) NEAR/2 (cream|creams)

No match: I love chocolate and vanilla ice cream Beauty Balm.
No match: Blue and green are my favorite colors.
No match: Taste is subjective, but I prefer optimize over optimise.
No match: Mentioning @doveuk here, and #blue for hashtags

In [1]:
query = """(BB OR "Beauty Balm" OR "Blemish Balm" OR "beauty-balm" OR "blemish-balm") NEAR/2 (cream OR creams)"""

while '(' in query:
    print('----')
    query = re.sub(r'\(([^()]+)\)', lambda m: f'(?:{process_query(m.group(1))})', query)



----


NameError: name 're' is not defined

NEAR/x (blue NEAR/3 green):
Regex Pattern: \bblue\b(?:\W+\w+){0,3}\W+\bgreen\b

In [14]:
import re

# Example query
query = """(BB OR "Beauty Balm" OR "Blemish Balm" OR "beauty-balm" OR "blemish-balm") NEAR/2 (cream OR creams)"""

def contains_near_operator(query):
    # Regular expression to match the NEAR/x pattern
    near_pattern = re.compile(r'\bNEAR/\d+\b')

    # Search for the pattern in the query
    match = near_pattern.search(query)

    # Return True if the pattern is found, otherwise False
    return match is not None

# Check if the NEAR/x operator exists in the query
if contains_near_operator(query):
    print("The query contains the NEAR/x operator.")
else:
    print("The query does not contain the NEAR/x operator.")


The query contains the NEAR/x operator.


In [17]:
near_strings = re.findall(r'(\(?([^\)\(]*?)\)?(?:\s)((?:NEAR\/)([0-9]+))(?:\s)\(?(.*?)\))', query,
                              flags=re.UNICODE)

In [23]:
near_strings
from nltk import word_tokenize



In [29]:
def convert_to_python_operators(query):
    """
    Takes the query with symbol operators and replace with python equivalent
    so can be evaluated by eval() function
    :param query: The input query
    :return query: The query with operators replaced with Python operators (and, not, or) so that Python can evaluate
     the query
    """
    query = re.sub(r'\!\!', 'not', query, flags=re.UNICODE)
    query = re.sub(r'\&\&', 'and', query, flags=re.UNICODE)
    query = re.sub(r'\|\|', 'or', query, flags=re.UNICODE)

    return query


quoted_word_pattern = re.compile(u'"(.*?)"', flags=re.UNICODE)
question_mark_pattern = re.compile(r'\w+?\.[a-z]*', flags=re.UNICODE)
def parse_near(parsed_query, input_text):
    """
    Takes a NEAR statement, evaluates it and returns the full query with the parsed NEAR statement.
    :param parsed_query: The query associated with a topic
    :param input_text: The input text
    :return parsed_query: The output query once it has been parsed with the NEAR operator
    """
    parsed_query = parsed_query.lower()
    parsed_query = parsed_query.replace('?', '.')

    # Find all the near statements in the query
    # \(?([^\)\(]*?)\)?(?:\s)((?:near\/)([0-9]+))(?:\s)\(?(.*?)\)  (\(\(?(.*?)\)?(?:\s)((?:near\/)([0-9]+))(?:\s)\(?(.*?)\)?\))
    near_strings = re.findall(r'(\(?([^\)\(]*?)\)?(?:\s)((?:near\/)([0-9]+))(?:\s)\(?(.*?)\))', parsed_query,
                              flags=re.UNICODE)

    for nstring in near_strings:
        # Strip whitespace
        words = [word.strip() for word in nstring]

        # Get the number after NEAR/ in the query
        sepdist = words[3]

        # Get the words in the LHS of the query
        lhs = words[1]

        # Get the words in the RHS of the query
        rhs = words[4]

        keywords = ['(', ')', u'||', u'&&', u'!!']

        # Find quoted phrases
        quoted_left = quoted_word_pattern.findall(lhs)
        quoted_right = quoted_word_pattern.findall(rhs)
        n_quoted_left = []
        for w in quoted_left:
            n_quoted_left.append('"' + w + '"')
        quoted_left = n_quoted_left
        n_quoted_right = []
        for w in quoted_right:
            n_quoted_right.append('"' + w + '"')
        quoted_right = n_quoted_right
        # Extract words to match with input text before and after near and append quoted phrases
        words_left = [w for w in word_tokenize(re.sub(u'"(.*?)"', '', lhs, flags=re.UNICODE)) if
                      w.lower() not in keywords] + quoted_left
        words_right = [w for w in re.sub(u'"(.*?)"', '', rhs, flags=re.UNICODE).split() if
                       w not in keywords] + quoted_right
        
        # We sort them so quoted words come first to prevent issue where words inside quoted words aren't replaced
        words_left = sorted(words_left)
        words_right = sorted(words_right)
        
        words_near = []

        # Appending regex symbol /w to match any word character(s) before the next space
        for a in words_left:
            for b in words_right:
                if a[-1:] == '*':
                    match_a = re.escape(re.sub(r'\*', '', a, flags=re.UNICODE)) + r'\w*'
                else:
                    question_mark = question_mark_pattern.findall(a)

                    if a in question_mark:
                        match_a = a
                    else:
                        match_a = re.escape(a)

                if b[-1:] == '*':
                    match_b = re.escape(re.sub(r'\*', '', b, flags=re.UNICODE)) + r'\w*'
                else:
                    question_mark = question_mark_pattern.findall(b)

                    if b in question_mark:
                        match_b = b
                    else:
                        match_b = re.escape(b)

                match_a_nq = match_a.strip('"')
                match_b_nq = match_b.strip('"')

                # Check distance between each pair of words on each side of NEAR is less than required
                m = re.search(r'\b(?:' + match_a_nq + r'\W+(?:\w+\W+){0,' + re.escape(
                    sepdist) + r'}?' + match_b_nq + r'|' + match_b_nq + r'\W+(?:\w+\W+){0,' + re.escape(
                    sepdist) + r'}?' + match_a_nq + r')\b', input_text, flags=re.UNICODE)

                if m:
                    x = True

                else:
                    x = False

                words_near.append((a, b, x))

        eval_left = lhs.lower()

        # for each word in query on lhs, check it satisfies condition on rhs
        for word_left in words_left:

            match_dict = {wn[1]: wn[2] for wn in words_near if wn[0] == word_left}

            eval_right = rhs

            for key, value in match_dict.items():
                if key[0] == '"':
                    # We match direct on key as we add in the quotes in quoted_right
                    eval_right = re.sub(key, str(value), eval_right, flags=re.UNICODE)
                else:
                    eval_right = re.sub(r'\b' + key + r'\b', str(value), eval_right, flags=re.UNICODE)

            # Replace asterisks so that each side can be evaluated
            eval_left = re.sub(r'\*', '', eval_left, flags=re.UNICODE)
            eval_right = re.sub(r'\*', '', eval_right, flags=re.UNICODE)
            eval_left = convert_to_python_operators(eval_left)
            eval_right = convert_to_python_operators(eval_right)
            eval_right = eval_right.replace('"', '')


            if word_left[0] == '"':
                # We match direct on key as we add in the quotes in quoted_left
                eval_left = re.sub(word_left, str(eval(eval_right)), eval_left, flags=re.UNICODE)
            else:
                eval_left = re.sub(r'\b' + word_left + r'\b', str(eval(eval_right)), eval_left, flags=re.UNICODE)

        eval_left = eval_left.replace('"', '')

        final_eval = eval(eval_left)

        parsed_query = parsed_query.replace(nstring[0], str(final_eval))

    return parsed_query

In [30]:
parsed_query = '(apple NEAR/2 orange)'
input_text = 'I ate an apple and an orange yesterday.'

result = parse_near(parsed_query, input_text)
print(result)

True


In [28]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KRILLIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [31]:
import re
import nltk

# Ensure NLTK's tokenizer is ready
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Sample data (rows)
rows = [
    "I love chocolate and vanilla ice cream Beauty Balm.",
    "Blue and green are my favorite colors.",
    "Taste is subjective, but I prefer optimize over optimise.",
    "Mentioning @doveuk here, and #blue for hashtags.",
]

# Example query
query = """(BB OR "Beauty Balm" OR "Blemish Balm" OR "beauty-balm" OR "blemish-balm") NEAR/2 (cream OR creams)"""

def handle_or(query):
    return query.replace(" OR ", "|")

def handle_and(query):
    return query.replace(" AND ", ".*")

def handle_not(query):
    return re.sub(r'NOT\s+(\w+)', r'^(?!.*\b\1\b)', query)

def handle_exact_phrases(query):
    return re.sub(r'"(.*?)"', r'\b\1\b', query)

def handle_parentheses(query):
    while '(' in query:
        query = re.sub(r'\(([^()]+)\)', lambda m: f'(?:{process_query(m.group(1))})', query)
    return query

def handle_wildcards(query):
    return re.sub(r'(\w+)\*', r'\1.*', query)

def handle_question_marks(query):
    return re.sub(r'(\w+)\?', lambda m: f'{m.group(1)}.', query)

def handle_mentions(query):
    return re.sub(r'at_mention:\((.*?)\)', r'@\b\1\b', query)

def handle_hashtags(query):
    return re.sub(r'hashtag:\((.*?)\)', r'#\b\1\b', query)

def convert_to_python_operators(query):
    query = re.sub(r'\!\!', 'not', query, flags=re.UNICODE)
    query = re.sub(r'\&\&', 'and', query, flags=re.UNICODE)
    query = re.sub(r'\|\|', 'or', query, flags=re.UNICODE)
    return query

quoted_word_pattern = re.compile(u'"(.*?)"', flags=re.UNICODE)
question_mark_pattern = re.compile(r'\w+?\.[a-z]*', flags=re.UNICODE)

def parse_near(parsed_query, input_text):
    parsed_query = parsed_query.lower().replace('?', '.')
    near_strings = re.findall(r'(\(?([^\)\(]*?)\)?(?:\s)((?:near\/)([0-9]+))(?:\s)\(?(.*?)\))', parsed_query,
                              flags=re.UNICODE)
    for nstring in near_strings:
        words = [word.strip() for word in nstring]
        sepdist = words[3]
        lhs = words[1]
        rhs = words[4]
        keywords = ['(', ')', u'||', u'&&', u'!!']

        quoted_left = quoted_word_pattern.findall(lhs)
        quoted_right = quoted_word_pattern.findall(rhs)
        n_quoted_left = ['"' + w + '"' for w in quoted_left]
        n_quoted_right = ['"' + w + '"' for w in quoted_right]
        quoted_left = n_quoted_left
        quoted_right = n_quoted_right

        words_left = [w for w in word_tokenize(re.sub(u'"(.*?)"', '', lhs, flags=re.UNICODE)) if
                      w.lower() not in keywords] + quoted_left
        words_right = [w for w in re.sub(u'"(.*?)"', '', rhs, flags=re.UNICODE).split() if
                       w not in keywords] + quoted_right

        words_left = sorted(words_left)
        words_right = sorted(words_right)
        words_near = []

        for a in words_left:
            for b in words_right:
                match_a = re.escape(re.sub(r'\*', '', a, flags=re.UNICODE)) + r'\w*' if a[-1:] == '*' else re.escape(a)
                match_b = re.escape(re.sub(r'\*', '', b, flags=re.UNICODE)) + r'\w*' if b[-1:] == '*' else re.escape(b)
                match_a_nq = match_a.strip('"')
                match_b_nq = match_b.strip('"')

                m = re.search(r'\b(?:' + match_a_nq + r'\W+(?:\w+\W+){0,' + re.escape(sepdist) + r'}?' + match_b_nq +
                              r'|' + match_b_nq + r'\W+(?:\w+\W+){0,' + re.escape(sepdist) + r'}?' + match_a_nq + r')\b',
                              input_text, flags=re.UNICODE)
                x = True if m else False
                words_near.append((a, b, x))

        eval_left = lhs.lower()

        for word_left in words_left:
            match_dict = {wn[1]: wn[2] for wn in words_near if wn[0] == word_left}
            eval_right = rhs

            for key, value in match_dict.items():
                eval_right = re.sub(key, str(value), eval_right, flags=re.UNICODE) if key[0] == '"' else \
                            re.sub(r'\b' + key + r'\b', str(value), eval_right, flags=re.UNICODE)

            eval_left = re.sub(r'\*', '', eval_left, flags=re.UNICODE)
            eval_right = re.sub(r'\*', '', eval_right, flags=re.UNICODE)
            eval_left = convert_to_python_operators(eval_left)
            eval_right = convert_to_python_operators(eval_right)
            eval_right = eval_right.replace('"', '')

            eval_left = re.sub(word_left, str(eval(eval_right)), eval_left, flags=re.UNICODE) if word_left[0] == '"' else \
                        re.sub(r'\b' + word_left + r'\b', str(eval(eval_right)), eval_left, flags=re.UNICODE)

        eval_left = eval_left.replace('"', '')
        final_eval = eval(eval_left)
        parsed_query = parsed_query.replace(nstring[0], str(final_eval))

    return parsed_query

def process_query(query, text):
    query = handle_or(query)
    query = handle_and(query)
    query = handle_not(query)
    query = handle_exact_phrases(query)
    #query = handle_parentheses(query)
    query = parse_near(query, text)  # Using parse_near here
    query = handle_wildcards(query)
    query = handle_question_marks(query)
    query = handle_mentions(query)
    query = handle_hashtags(query)
    return query

# Apply regex to rows
for row in rows:
    pattern = process_query(query, row)
    print(f"Generated regex pattern: {pattern}\n")
    if re.search(pattern, row, re.IGNORECASE):  # Added re.IGNORECASE for case-insensitive matching
        print(f"Match found: {row}")
    else:
        print(f"No match: {row}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KRILLIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NameError: name 'Falses' is not defined