In [143]:
import re
import numpy as np
def matches_pattern(string, pattern):
    """
    Report whether `pattern` matches `string` from its first to its last character.

    Args:
        string (str): Candidate text to check.
        pattern (str): Regular expression source.

    Returns:
        bool: True when the whole of `string` matches `pattern`, False otherwise.
    """
    # fullmatch only produces a match object when the complete string is
    # consumed by the pattern; otherwise it gives back None.
    whole_match = re.fullmatch(pattern, string)
    return whole_match is not None

def char_to_vector(ch: str) -> list:
    """
    Encode a single character as a 0/1 class-membership vector.

    The vector order is:
        [lowercase, uppercase, word, digit, special, whitespace]

    Args:
        ch (str): A string of exactly one character.

    Returns:
        list: Six ints (0 or 1), one per class in the order above.

    Raises:
        ValueError: If `ch` is not exactly one character long.
    """
    # Check that the input is a single character
    if len(ch) != 1:
        raise ValueError("Input must be a single character string")

    is_lowercase = ch.islower()        # Lowercase letter
    is_uppercase = ch.isupper()        # Uppercase letter
    is_digit = ch.isdigit()            # Numeric (digit)
    is_whitespace = (ch == " ")        # Only the plain space counts here

    # Word character that is not a digit. Besides letters this covers "_" and
    # alphanumeric characters that are neither letters nor digits (e.g. a
    # vulgar fraction): the regex class \w matches all of them, while \W
    # (used for the "special" class by map_index_to_psuedo_regex) does not.
    is_word = (ch.isalnum() or ch == "_") and not is_digit

    # Special: whatever is left over. Defining it as the complement guarantees
    # that every character belongs to at least one class, so no runtime
    # assertion is needed.
    is_special = not (is_word or is_digit or is_whitespace)

    flags = [is_lowercase, is_uppercase, is_word, is_digit, is_special, is_whitespace]
    return [int(flag) for flag in flags]

def string_to_matrix(x : str) -> list:
    """
    Encode a string as a list of per-character vectors.

    Args:
        x (str): The text to encode.

    Returns:
        list: One char_to_vector result per character of `x`, in order.
    """
    return [char_to_vector(symbol) for symbol in x]

def pad_strings(list1, list2):
    """
    Right-pad every string in both lists with spaces to one common width.

    The width is the length of the longest string found in either list.

    Args:
        list1 (list): First list of strings.
        list2 (list): Second list of strings.

    Returns:
        tuple: (padded_list1, padded_list2) as new lists. When both inputs
        are empty the original list objects are returned untouched.
    """
    everything = list1 + list2
    if len(everything) == 0:
        # Nothing to measure, so there is nothing to pad.
        return list1, list2

    width = len(max(everything, key=len))

    def _pad_all(strings):
        # str.ljust appends spaces on the right up to `width`.
        return [text.ljust(width) for text in strings]

    return _pad_all(list1), _pad_all(list2)

def check_columns(matrix):
    """
    Flag, for each column of `matrix`, whether every entry equals 1.

    Args:
        matrix (list): Rows of values; columns are formed with zip, so they
            stop at the shortest row.

    Returns:
        list: 1 for a column made only of ones, 0 otherwise. An empty
        matrix yields an empty list.
    """
    if not matrix:
        return []

    flags = []
    for column in zip(*matrix):
        column_is_all_ones = True
        for value in column:
            if value != 1:
                column_is_all_ones = False
                break
        flags.append(1 if column_is_all_ones else 0)
    return flags

def update_matrix_xor(matrix1, matrix2):
    """
    Clear, in place, every cell of `matrix1` that is 1 in both matrices.

    Despite the name this is not a true XOR: cells that are 1 only in
    `matrix2` are left at their `matrix1` value.

    Args:
        matrix1: 2-D indexable of numbers; modified in place.
        matrix2: 2-D indexable with (assumed) the same dimensions.

    Returns:
        The same `matrix1` object after modification.
    """
    row_count = len(matrix1)
    if row_count == 0:
        return matrix1

    # Column count is taken from the first row, as both matrices are assumed
    # to be rectangular and of equal shape.
    col_count = len(matrix1[0])
    for r in range(row_count):
        target_row = matrix1[r]
        for c in range(col_count):
            if target_row[c] == 1 and matrix2[r][c] == 1:
                target_row[c] = 0
    return matrix1

def map_index_to_psuedo_regex(x):
    """
    Translate a class index or a literal character into a regex fragment.

    Args:
        x: Either a class index (0 lowercase, 1 uppercase, 2 word, 3 digit,
            4 special, 5 whitespace, -1 "anything") or a string that must be
            matched literally.

    Returns:
        str: The regex fragment for `x`.

    Raises:
        ValueError: If `x` is neither a string nor a known class index.
    """
    # Literal text: escape only what the regex engine treats as special.
    # Blindly prefixing a backslash would turn e.g. "d" into \d (any digit)
    # or "b" into \b (word boundary) and silently change the meaning.
    if isinstance(x, str):
        return re.escape(x)

    # Raw strings keep the backslashes intact without relying on Python
    # passing through unknown escape sequences.
    class_patterns = {
        0: "[a-z]",
        1: "[A-Z]",
        2: r"\w",
        3: r"\d",
        4: r"\W",
        5: r"\s",
        -1: ".",
    }
    try:
        return class_patterns[x]
    except (KeyError, TypeError):
        # KeyError: unknown index; TypeError: unhashable argument.
        raise ValueError("index is larger than expected") from None

def return_min_index(lst):
    """
    Return the position of the first 1 in `lst`, or -1 when there is none.

    Args:
        lst (list): Sequence exposing an `index` method.

    Returns:
        int: Index of the first element equal to 1, or -1 if no element is.
    """
    try:
        return lst.index(1)
    except ValueError:
        # list.index signals "not found" with ValueError; anything else is a
        # genuine error and should not be hidden.
        return -1

def find_common_chars(strings):
    """
    Find the positions at which every string carries the same character.

    Only positions present in all strings (i.e. up to the shortest length)
    are considered.

    Args:
        strings (list): Strings to compare.

    Returns:
        dict: Maps position -> shared character. Empty when `strings` is empty.
    """
    if not strings:
        return {}

    # zip stops at the shortest string, so each column holds one character
    # from every string; the first entry serves as the reference.
    return {
        position: column[0]
        for position, column in enumerate(zip(*strings))
        if all(symbol == column[0] for symbol in column)
    }


In [120]:
def test_generated_regex(valid_strings, invalid_strings, out_func):
    """
    Tester function that uses the provided output function (out_func) to generate a regex pattern.
    It then verifies that all valid strings match the regex fully and that all invalid strings do not.
    
    Args:
        valid_strings (list): List of strings expected to match the regex.
        invalid_strings (list): List of strings that should not match the regex.
        out_func (function): A function that takes (valid_strings, invalid_strings) as parameters
                             and returns a regex string.
    
    Returns:
        bool: True if the regex passes all tests, False otherwise.
    """
    # Generate the Gree Expression (a regex pattern, per the challenge)
    pattern = out_func(valid_strings, invalid_strings)
    print(f"Generated Regex Pattern: {pattern}\n")
    
    passed = True

    # Test valid strings
    for s in valid_strings:
        if not matches_pattern(s, pattern):
            print(f"FAIL: Valid string '{s}' did not match!")
            passed = False
        else:
            print(f"PASS: Valid string '{s}' matched.")

    # Test invalid strings
    for s in invalid_strings:
        if matches_pattern(s, pattern):
            print(f"FAIL: Invalid string '{s}' mistakenly matched!")
            passed = False
        else:
            print(f"PASS: Invalid string '{s}' did not match.")

    if len(pattern) >= 20:
        print(f"Length of pattern exceed 20: {len(pattern)}")
        passed = False
    return passed

def keep_first_non_neg1(lst):
    """
    Keep only the first element that differs from -1; replace the rest by -1.

    Args:
        lst: Iterable of values.

    Returns:
        list: Same length as the input, containing at most one non -1 entry
        (the first one encountered), at its original position.
    """
    remaining = iter(lst)
    result = []

    # Copy the leading -1 run and stop right after the first real value.
    for item in remaining:
        if item != -1:
            result.append(item)
            break
        result.append(-1)

    # Everything after the kept value is blanked out.
    result.extend(-1 for _ in remaining)
    return result

In [32]:
from pprint import pprint

In [147]:
def answer(valid_strings, invalid_strings):
    """
    Build a regex meant to accept the valid strings and reject the invalid ones.

    All strings are space-padded to a common length and every character is
    encoded as a class-membership vector. For each position, the classes
    shared by all valid strings and seen in no invalid string are kept and the
    lowest such class index is chosen. Where all valid strings share the same
    literal character and no invalid string has that character at the same
    position, the literal replaces the class. Only the first constrained
    position is kept; every other position becomes ".", and the pattern is
    wrapped as "^" ... ".*$".

    Args:
        valid_strings (list): Strings the pattern should match; must not be empty.
        invalid_strings (list): Strings the pattern should not match; may be empty.

    Returns:
        str: The generated regular expression.

    Raises:
        ValueError: If `valid_strings` is empty.
    """
    if not valid_strings:
        # Without examples there is nothing to derive a pattern from; fail
        # with a clear message instead of a misleading length assertion.
        raise ValueError("valid_strings must contain at least one string")

    valid_strings, invalid_strings = pad_strings(valid_strings, invalid_strings)
    print(valid_strings)  # debug trace of the padded valid strings

    valid_strings_matrix_list = [string_to_matrix(s) for s in valid_strings]
    invalid_strings_matrix_list = [string_to_matrix(s) for s in invalid_strings]

    assert len({len(s) for s in valid_strings}) == 1, "Not all valid strings have the same length"
    # "<= 1" so that an empty invalid list is accepted.
    assert len({len(s) for s in invalid_strings}) <= 1, "Not all invalid strings have the same length"
    assert len(valid_strings_matrix_list) == len(valid_strings)

    # Per position: classes that every valid string has.
    valid_string_col_vec = np.all(np.array(valid_strings_matrix_list), axis = 0).astype(int)

    if invalid_strings_matrix_list:
        # Per position: classes that at least one invalid string has; those
        # are removed from the valid set because they cannot discriminate.
        invalid_string_col_vec = np.any(np.array(invalid_strings_matrix_list), axis = 0).astype(int)
        valid_string_col_vec = update_matrix_xor(valid_string_col_vec, invalid_string_col_vec)

    valid_string_col_min_index = [return_min_index(list(i)) for i in valid_string_col_vec]
    common_char_dict = find_common_chars(valid_strings)

    for key, value in common_char_dict.items():
        literal_is_discriminating = True
        for invalid in invalid_strings:
            if value == invalid[key]:
                print("invalid all char")
                literal_is_discriminating = False
        if literal_is_discriminating:
            valid_string_col_min_index[key] = value

    valid_string_col_min_index = keep_first_non_neg1(valid_string_col_min_index)
    return "^" + "".join([map_index_to_psuedo_regex(i) for i in valid_string_col_min_index]) + ".*$"

In [157]:
# Smoke test: run answer() on a small example and check the generated
# pattern against both lists via test_generated_regex.
test_generated_regex( ["abc2", "bbb1", 'cde2'],  ["abcf", "bb"], answer)

['abc2', 'bbb1', 'cde2']
Generated Regex Pattern: ^...\d.*$

PASS: Valid string 'abc2' matched.
PASS: Valid string 'bbb1' matched.
PASS: Valid string 'cde2' matched.
PASS: Invalid string 'abcf' did not match.
PASS: Invalid string 'bb' did not match.


True

In [77]:
# NOTE(review): the output recorded below (a printed matrix and a pattern
# without the trailing ".*") does not match what the current answer() prints
# and returns; it appears to come from an earlier revision (execution count
# 77 vs. 147 for the definition) - re-run to refresh.
answer(["abc@gmail.com", "as2@gmail.co"], ["bb@ff"])

[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 1]
 [0 0 0 0 0]
 [1 0 1 0 0]
 [1 0 1 0 0]
 [1 0 1 0 0]
 [1 0 1 0 0]
 [0 0 0 0 0]
 [1 0 1 0 0]
 [1 0 1 0 0]
 [0 0 0 0 0]]


'^.....[a-z][a-z][a-z][a-z].[a-z][a-z].$'