In [1]:
import pandas as pd
import re
import csv
import numpy as np
import regex

## Import Data

In [2]:
# elm classes
# Forward slashes are used instead of '.\...': the backslash form relies on
# the invalid escape sequence '\e' and only resolves on Windows.
ec = pd.read_csv('./elm_classes.csv')

# elm instances (file is read as ISO-8859-1)
ei = pd.read_csv('./elm_instances_html.csv', encoding="ISO-8859-1")

### wrangle data

In [3]:
# keep only the identifier/pattern and identifier/subsequence column pairs,
# renaming them to the snake_case names used in the rest of the notebook
elm_patterns = ec[['ELMIdentifier', 'Regex']].rename(
    columns={'ELMIdentifier': 'elm_id', 'Regex': 'regex_pattern'}
)
elm_sequences = ei[['ELM identifier', 'Subsequence']].rename(
    columns={'ELM identifier': 'elm_id', 'Subsequence': 'subsequence'}
)

In [4]:
# outer join on elm_id keeps ids that appear in only one of the two frames
elm = pd.merge(elm_sequences, elm_patterns, how='outer', on='elm_id')

In [5]:
len(elm.index)

3538

In [6]:
# summarise the merged frame: column dtypes and non-null counts per column
elm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3538 entries, 0 to 3537
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   elm_id         3538 non-null   object
 1   subsequence    3523 non-null   object
 2   regex_pattern  3525 non-null   object
dtypes: object(3)
memory usage: 110.6+ KB


In [7]:
elm[['subsequence', 'regex_pattern']].head(5)

Unnamed: 0,subsequence,regex_pattern
0,LHCMTGVDCTDGTRQKAAAL,[DSTE][^P][^DEWHFYC]D[GSAN]
1,LPRPQSPSDLDSLDGRSLND,[DSTE][^P][^DEWHFYC]D[GSAN]
2,PQSPSDLDSLDGRSLNDDGS,[DSTE][^P][^DEWHFYC]D[GSAN]
3,VPPSVSKDVPDGPLLREETY,[DSTE][^P][^DEWHFYC]D[GSAN]
4,PPKFTSQDSPDGQYENSEGG,[DSTE][^P][^DEWHFYC]D[GSAN]


In [8]:
# discard every row that has a missing value in any column
elm = elm.dropna(axis=0, how='any')

In [9]:
def frame_match(df):
    """Return the text matched by a row's regex pattern inside its subsequence.

    df is a row-like object exposing `regex_pattern` and `subsequence`
    attributes. The first match found by a search over the subsequence is
    returned as a string; when the pattern does not match, an empty string
    is returned instead.
    """
    compiled = re.compile(df.regex_pattern)
    found = compiled.search(df.subsequence)

    # no match is reported as an empty string rather than None
    if found is None:
        return ''
    return found.group(0)


In [10]:
# add matches to dataframe: one frame_match result per row, stored as a new column
elm = elm.assign(pattern_match=elm.apply(frame_match, axis=1))

In [11]:
# setup a sample: 25 rows drawn with a fixed random_state so the same rows are selected on every run
elm_sample = elm.sample(n=25, random_state=1)

These patterns are unique, so they won't have a subsequence to match; however, they're still good to test catch conditions on, and to test the final program against cases where no subsequence is found.

## regex subpattern identification

In [12]:
def reduce_expression(input_expression, input_subsequence):
    """First function in process. Handles:
     - Unnested alternations (i.e. where pattern has two major alternation options)
     - Slicing parent, string down
     Abstract: This function handles the major iteration through the input pattern
               by passing the parent pattern out to other functions, then slicing matches
               so the next section can be checked.

     Side effects: sets the module-level `indexer` to the start offset of the
     match inside input_subsequence, and reads the module-level `printest`
     flag to decide whether trace output is printed. Returns None.

     Raises ValueError when input_expression does not match
     input_subsequence, or when the remaining pattern starts with something
     that is neither a group, a set nor a literal (such a pattern can never
     be consumed, so the loop would otherwise never terminate)."""
    # global subsequence position
    global indexer

    # match pattern, return string for routine
    matched_sequence_object = re.search(input_expression, input_subsequence)
    if matched_sequence_object is None:
        raise ValueError(
            'Expression {!r} does not match subsequence {!r}'.format(
                input_expression, input_subsequence))
    matched_sequence = matched_sequence_object.group(0)

    indexer = matched_sequence_object.start()

    # script-testing
    if printest is True:
        print('input expression: {}\ninput subsequence: {}\nmatched string: {}\nadjustment: {}'.format(input_expression, input_subsequence, matched_sequence, indexer))

    # assign variables within function
    pattern = input_expression
    sequence = matched_sequence

    # token recognisers (raw strings: the backslashes belong to the regex)
    is_group = r'\^*\((?:[^()]+|(?R))*+\)({.+?})*\$*'
    is_set = r'\^*\[.+?\]({.+?})*\$*'
    is_lit = r'\^*[.\w]({.+?})*\$*'
    has_alt = r'\|'

    # Has alternation?
    if regex.search(has_alt, pattern) is not None:
        # Is the alternation unnested?
        pattern = is_alternation_nested(pattern, sequence)

    # cycle through objects; every handler slices its token off the front
    while pattern != '':

        # script-testing
        if printest is True:
            print('  main feed pattern: {}\n  main feed sequence: {}'.format(pattern, sequence))

        group_match = regex.match(is_group, pattern)

        # Is group?
        if group_match:
            # more than one '(' inside the matched group means it is nested
            if group_match.group(0).count('(') > 1:
                pattern, sequence = nested_group(pattern, sequence)
            else:
                pattern, sequence = single_group(pattern, sequence)

        # is set?
        elif regex.match(is_set, pattern):
            pattern, sequence = single_set(pattern, sequence)

        # is lit?
        elif regex.match(is_lit, pattern):
            pattern, sequence = single_literal(pattern, sequence)

        # nothing recognised: the pattern would stay unchanged forever
        else:
            raise ValueError(
                'Unsupported construct at start of pattern: {!r}'.format(pattern))
        
def is_alternation_nested(pattern, sequence):
    """Resolve an alternation in `pattern` when it is not nested inside a group.

    When the alternation is judged to be unnested, the pattern is handed to
    process_alternation and its result is returned; otherwise the pattern is
    returned unchanged.
    """
    # need better definition here; need to be able to catch when alternation is nested and return false

    # '(' ... ')' | '(' ... ')': a '|' sitting between two parenthesised groups
    alternation_not_nested = r'\((?:[^()]+|(?R))*+\)[|]\((?:[^()]+|(?R))*+\)'
    # a '(' followed by non-')' characters and then '|'
    alternation_nested = r'\([^)]+?\|'

    # No '|' follows an opening parenthesis. A pattern without any '(' can
    # never match alternation_nested, so it is handled here as well.
    if regex.search(alternation_nested, pattern) is None:
        return process_alternation(pattern, sequence)

    # processes alternation because parenths are balanced
    if regex.search(alternation_not_nested, pattern) is not None:
        return process_alternation(pattern, sequence)

    return pattern

def nested_group(pattern, sequence):
    """Consume a leading group of `pattern` that contains further groups.

    The group at the start of `pattern` is isolated, its first and last
    characters are removed with strip_nest, any alternation inside it is
    passed to is_alternation_nested, and what remains is handed piece by
    piece to nested_group / single_group / single_set / single_literal.

    Returns a (pattern, sequence) tuple: `pattern` with the leading group
    sliced off, and `sequence` as left by the handlers.

    Raises ValueError when `pattern` does not start with a group, or when
    the inner pattern reaches a construct that none of the handlers
    recognises (the loop could otherwise never terminate).
    """
    # script-testing
    if printest is True:
        print('  nested group pattern: {}\n  nested group sequence: {}'.format(pattern, sequence))

    # token recognisers (raw strings: the backslashes belong to the regex)
    is_group = r'\^*\((?:[^()]+|(?R))*+\)({.+?})*\$*'
    is_set = r'\^*\[.+?\]({.+?})*\$*'
    is_lit = r'\^*[.\w]({.+?})*\$*'
    has_alt = r'\|'

    # isolate the group at the start of the pattern
    nested_pattern_match = regex.match(is_group, pattern)
    if nested_pattern_match is None:
        raise ValueError('Pattern does not start with a group: {!r}'.format(pattern))

    # drop the first and last character of the matched group text
    nested_pattern_inner = strip_nest(nested_pattern_match.group(0))

    # Has alternation?
    if regex.search(has_alt, nested_pattern_inner) is not None:
        # Is the alternation unnested?
        nested_pattern_inner = is_alternation_nested(nested_pattern_inner, sequence)

    # cycle through objects
    while nested_pattern_inner != '':

        # Nested group?
        if nested_pattern_inner.count('(') > 1:
            nested_pattern_inner, sequence = nested_group(nested_pattern_inner, sequence)

        elif regex.match(is_group, nested_pattern_inner):
            nested_pattern_inner, sequence = single_group(nested_pattern_inner, sequence)

        elif regex.match(is_set, nested_pattern_inner):
            nested_pattern_inner, sequence = single_set(nested_pattern_inner, sequence)

        elif regex.match(is_lit, nested_pattern_inner):
            nested_pattern_inner, sequence = single_literal(nested_pattern_inner, sequence)

        # nothing recognised: the inner pattern would stay unchanged forever
        else:
            raise ValueError(
                'Unsupported construct inside nested group: {!r}'.format(nested_pattern_inner))

    # edit import pattern, sequence
    pattern = slice_string(nested_pattern_match.group(0), pattern)

    return pattern, sequence

def single_group(pattern, sequence):
    """Consume a leading group of `pattern` that contains no further groups.

    The group at the start of `pattern` is isolated, its first and last
    characters are removed with strip_nest, and the remaining inner pattern
    is handed piece by piece to single_set / single_literal.

    Returns a (pattern, sequence) tuple: `pattern` with the leading group
    sliced off, and `sequence` as left by the handlers.

    Raises ValueError when `pattern` does not start with a group, or when
    the inner pattern reaches a construct that is neither a set nor a
    literal (the loop could otherwise never terminate).
    """
    # script-testing
    if printest is True:
        print('  single group pattern: {}\n  single group sequence: {}'.format(pattern, sequence))

    # token recognisers (raw strings: the backslashes belong to the regex)
    is_group = r'\^*\((?:[^()]+|(?R))*+\)({.+?})*\$*'
    is_set = r'\^*\[.+?\]({.+?})*\$*'
    is_lit = r'\^*[.\w]({.+?})*\$*'

    # isolate the group at the start of the pattern
    group_pattern_match = regex.match(is_group, pattern)
    if group_pattern_match is None:
        raise ValueError('Pattern does not start with a group: {!r}'.format(pattern))

    # drop the first and last character of the matched group text
    group_pattern_inner = strip_nest(group_pattern_match.group(0))

    # script-testing (reports the inner pattern, as the label says)
    if printest is True:
        print('  group pattern inner: {}\n  group sequence: {}'.format(group_pattern_inner, sequence))

    while group_pattern_inner != '':

        # is set?
        if regex.match(is_set, group_pattern_inner):
            group_pattern_inner, sequence = single_set(group_pattern_inner, sequence)

        # is lit?
        elif regex.match(is_lit, group_pattern_inner):
            group_pattern_inner, sequence = single_literal(group_pattern_inner, sequence)

        # nothing recognised: the inner pattern would stay unchanged forever
        else:
            raise ValueError(
                'Unsupported construct inside group: {!r}'.format(group_pattern_inner))

    # edit import pattern, sequence
    pattern = slice_string(group_pattern_match.group(0), pattern)

    return pattern, sequence

def single_set(pattern, sequence):
    
    # script-testing
    if printest is True:
        print('  set pattern: {}\n  set sequence: {}'.format(pattern, sequence))
    
    # patterns
    is_set = '\^*\[.+?\]({.+?})*\$*'

    # is set?
    if regex.match(is_set, pattern):
        set_pattern = regex.match(is_set, pattern).group(0) # collect set, send to outputter
        pattern, sequence = process_pattern(set_pattern, pattern, sequence)

    return pattern, sequence

def single_literal(pattern, sequence):
    
    # script-testing
    if printest is True:
        print('  literal pattern: {}\n  literal sequence: {}'.format(pattern, sequence))
    
    # patterns
    is_lit = '\^*[.\w]({.+?})*\$*'
 
    # is lit?
    if regex.match(is_lit, pattern):
        lit_pattern = regex.match(is_lit, pattern).group(0) # collect literal, send to outputter
        pattern, sequence = process_pattern(lit_pattern, pattern, sequence)
    
    return pattern, sequence

def process_alternation(pattern, sequence):
    """Splits input pattern on | then evaluates every option against the sequence.
       Returns the option with the greatest match length; on a tie the later
       option wins. When no option matches, the input pattern is returned
       unchanged."""

    # splits the pattern alternation into sub patterns
    alternatives = pattern.split('|')

    # longest match length seen so far
    pattern_match_length = 0

    # evaluate sub
    for alt in alternatives:
        m = regex.search(alt, sequence)
        if m is None:
            continue

        alt_match_length = len(m.group(0))

        # script-testing
        if printest is True:
            print('    alternation eval:\n    alternative pattern: {}\n    alt match string: {}\n    alt match length: {}'.format(alt, m.group(0), len(m.group(0))))

        # '>=' so that a later alternative of equal length replaces an earlier one
        if alt_match_length >= pattern_match_length:
            pattern_match_length = alt_match_length    # set length OF MATCH for evaluation
            pattern = alt                              # set return pattern

    # script-testing
    if printest is True:
        print('    alternation selection:\n    {}'.format( pattern))

    return pattern

def strip_nest(pattern):
    """Return `pattern` without its first and last character.

    No check is made that those characters are parentheses; whatever sits
    at either end is removed. Strings shorter than two characters yield ''.
    """
    return pattern[1:-1]

def process_pattern(sub, pattern, sequence):
    """Report one sub-pattern with its position, then consume it.

    sub      -- the sub-pattern (set or literal) to locate
    pattern  -- the pattern `sub` was taken from
    sequence -- the not-yet-consumed part of the matched sequence

    Prints a tab-separated line `sub (start, end)` and advances the
    module-level `indexer` to `end`. The positions include the offset at
    which `sub` was found inside `sequence`, so they stay aligned with the
    text even when the match does not begin at the first remaining character.

    Returns a (pattern, sequence) tuple: `pattern` after slicing off `sub`,
    and `sequence` after dropping everything up to the end of the match.

    Raises ValueError when `sub` does not match anywhere in `sequence`.
    """
    global indexer

    # script-testing
    if printest is True:
        print('  process sub: {}\n  process pattern: {}\n  process sequence: {}'.format(sub, pattern, sequence))

    # need to use re.search() here as match will not always be at start of string
    match = re.search(sub, sequence)
    if match is None:
        raise ValueError(
            'Sub-pattern {!r} does not match remaining sequence {!r}'.format(sub, sequence))
    match_string = match.group(0)

    # Increment index: skip whatever precedes the match, then the match itself
    start = indexer + match.start()
    end = start + len(match_string)
    indexer = end

    # script-testing
    if printest is True:
        print('  sub: {}\n  match string: {}\n  index: ({}, {})'.format(sub, match_string, start, end))

    # Print out
    print('\t{}\t({}, {})'.format(sub, start, end))

    # Slice pattern
    pattern = slice_string(sub, pattern)

    # Slice sequence at the end of this match. Slicing by position rather than
    # by the matched text avoids cutting at an earlier, unrelated occurrence
    # of the same characters.
    sequence = sequence[match.end():]

    # script-testing
    if printest is True:
        print('\n  pattern slice: {}\n  sequence slice: {}\n'.format(pattern, sequence))

    return pattern, sequence

def slice_string(pattern, string):
    """Return the part of `string` that follows the first occurrence of `pattern`.

    `pattern` is treated as literal text (it is escaped before being used
    as a split expression). An IndexError is raised when `pattern` does not
    occur in `string`, because the split then yields a single piece.
    """
    # escape so regex metacharacters in the token are matched literally
    literal = re.escape(pattern)

    # split once; piece 0 precedes the token, piece 1 is everything after it
    pieces = re.split(literal, string, maxsplit=1)
    return pieces[1]

def err_catch(re, su):
    """Run reduce_expression on one pattern/subsequence pair without aborting a batch.

    re -- regular-expression pattern string (this parameter name shadows the
          `re` module inside this function only)
    su -- subsequence the pattern is applied to

    Any ordinary error raised while processing is reported with a
    'Could not process' line and swallowed, so the function returns None
    either way. KeyboardInterrupt and SystemExit are deliberately not
    caught, so a stuck run can still be interrupted.
    """
    try:
        reduce_expression(re, su)
    except Exception:
        print('Could not process: {}'.format(su))
    



In [13]:
# Call Function

# printest is the module-level switch the parsing functions check before printing their trace output
printest = False

# run the parser on every sampled row; err_catch returns nothing, so the resulting Series holds only None
elm_sample.apply(lambda row: err_catch(row['regex_pattern'], row['subsequence']), axis=1)

	[RK]	(4, 5)
	.{2,4}	(5, 9)
	[LIVP]	(9, 10)
	.	(10, 11)
	P	(11, 12)
	[LIV]	(12, 13)
	.	(13, 14)
	[LIVMF]	(14, 15)
	.	(17, 18)
	[ILM]	(18, 19)
	R$	(19, 20)
	.	(5, 6)
	R	(6, 7)
	.	(7, 8)
	.	(8, 9)
	L	(9, 10)
	.	(10, 11)
	.	(11, 12)
	[LIVM]	(12, 13)
	.	(13, 14)
	.	(1, 2)
	.	(2, 3)
	.	(3, 4)
	.	(4, 5)
	[LIFVYMTE]	(5, 6)
	[ASGC]	(6, 7)
	[^P]{2}	(7, 9)
	L	(9, 10)
	[^P]{2}	(10, 12)
	[IVMTL]	(12, 13)
	[GACS]	(13, 14)
	[D]	(14, 15)
	[^P]	(15, 16)
	[FVLMI]	(16, 17)
	.	(17, 18)
	[LMFYWIC]	(7, 8)
	.	(8, 9)
	.	(9, 10)
	I	(10, 11)
	.	(11, 12)
	E	(12, 13)
	Y	(7, 8)
	[^EPILVFYW]	(8, 9)
	[^HDEW]	(9, 10)
	[PLIV]	(10, 11)
	[^DEW]	(11, 12)
	R	(7, 8)
	V	(8, 9)
	.	(9, 10)
	P	(10, 11)
	.	(11, 12)
	[^P]	(6, 7)
	L	(7, 8)
	[^P]	(8, 9)
	[^P]	(9, 10)
	L	(10, 11)
	L	(11, 12)
	[^P]	(12, 13)
	H	(6, 7)
	.	(7, 8)
	[KR]	(8, 9)
	.	(9, 10)
	.	(10, 11)
	[ST]	(11, 12)
	[^P]	(12, 13)
	R	(7, 8)
	.	(8, 9)
	[LI]	(9, 10)
	.	(10, 11)
	[EDQ]	(11, 12)
	.	(7, 8)
	.	(8, 9)
	.	(9, 10)
	[ST]	(10, 11)
	P	(11, 12)
	.	(12, 13)
	[STCA]	(6

364     None
136     None
99      None
954     None
550     None
2005    None
3248    None
1609    None
2694    None
3504    None
650     None
2396    None
2886    None
2611    None
3293    None
2343    None
1093    None
1276    None
2717    None
3350    None
1089    None
2451    None
984     None
2700    None
318     None
dtype: object