Skip to content

Commit

Permalink
exclude box drawing (TUI) from matches, see #22
Browse files Browse the repository at this point in the history
  • Loading branch information
laktak committed Nov 4, 2017
1 parent 8b8c83d commit b3f9592
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 13 deletions.
35 changes: 22 additions & 13 deletions extrakto.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,16 @@

RE_URL_OR_PATH = RE_PATH + "|" + RE_URL

RE_WORD = r'[^][(){} \t\n\r]+'
# "words" consist of any characters except the following:
# - the brackets [](){}
# - the Unicode range U+2500-U+27BF, which covers:
#   - Box Drawing
#   - Block Elements
#   - Geometric Shapes
#   - Miscellaneous Symbols
#   - Dingbats
# - whitespace ( \t\n\r)
RE_WORD = '[^][(){}\u2500-\u27BF \\t\\n\\r]+'

# regular expression to exclude transfer speeds like 5k/s or m/s, and page counters like 1/2
RE_SPEED = r'[kmgKMG]/s$|^\d+/\d+$'
Expand All @@ -33,7 +42,7 @@ def get_args():
help='extract url tokens')

parser.add_argument('-w', '--words', action='store_true',
help='extract word tokens')
help='extract "word" tokens')

parser.add_argument('-r', '--reverse', action='store_true',
help='reverse output')
Expand All @@ -46,7 +55,7 @@ def get_args():
return args


def process_urls_and_paths(find, text, ml):
def process_urls_and_paths(find, text, min_length):
res = list()

for m in re.finditer(find, "\n" + text, flags=re.I):
Expand All @@ -58,29 +67,29 @@ def process_urls_and_paths(find, text, ml):

# exclude transfer speeds like 5k/s or m/s, and page 1/2
if not re.search(RE_SPEED, item, re.I):
if len(item) > ml:
if len(item) >= min_length:
res.append(item)
return res


def get_urls(text, ml=0):
return process_urls_and_paths(RE_URL, text, ml)
def get_urls(text, min_length=0):
return process_urls_and_paths(RE_URL, text, min_length)


def get_paths(text, ml=0):
return process_urls_and_paths(RE_PATH, text, ml)
def get_paths(text, min_length=0):
return process_urls_and_paths(RE_PATH, text, min_length)


def get_urls_or_paths(text, ml=0):
return process_urls_and_paths(RE_URL_OR_PATH, text, ml)
def get_urls_or_paths(text, min_length=0):
return process_urls_and_paths(RE_URL_OR_PATH, text, min_length)


def get_words(text, ml):
def get_words(text, min_length):
words = []

for m in re.finditer(RE_WORD, "\n" + text):
for m in re.finditer(RE_WORD, text):
item = m.group().strip(',:;()[]{}<>\'"|').rstrip('.')
if len(item) > ml:
if len(item) >= min_length:
words.append(item)

return words
Expand Down
10 changes: 10 additions & 0 deletions tests/test_get_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@


class TestGetWords(unittest.TestCase):

def test_skip_dot_last_word_in_sentence(self):
text = "Hello world. Extrakto is an awesome plugin."
words = [
Expand All @@ -16,6 +17,15 @@ def test_skip_dot_last_word_in_sentence(self):
result = get_words(text, 4)
self.assertEquals(words, result)

def test_box_drawing(self):
    """Box-drawing characters must split words rather than join them (see #22)."""
    # U+2502 (│) sits directly between the two words, with no whitespace.
    text = "last│first"
    expected = ["last", "first"]

    result = get_words(text, 4)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(expected, result)

def test_match_hidden_files(self):
text = "one /home/user/.hidden.txt two .hidden.txt three ./.hidden.txt four ../.hidden.txt"
words = [
Expand Down

0 comments on commit b3f9592

Please sign in to comment.