In [1]:
import re

filepath = '.data//deep_learning_corpus.txt'

In [2]:
with open(filepath, 'r') as fp:
    corpus_raw = fp.read().split('\n\n\n\n\n')

In [3]:
NQGENTEXT = 50
NPUZZLES = 28

corpus_raw = [corpus_raw for i, corpus_raw in enumerate(corpus_raw) if i==0 or i > NQGENTEXT+NPUZZLES]
print([len(text) for text in corpus_raw])

[8580, 142103, 161644, 112244, 356004, 72331, 55831, 133866, 224178, 24666, 25730, 23198, 12600, 36138, 28372, 121395, 116671, 37869, 134623, 65586, 86287, 15098, 53625, 31043, 64047, 33813, 52926, 51036, 132454, 42954, 6595, 45631, 37612, 137919, 88632, 63357, 55796, 53555, 55225, 37161, 30340, 35584, 55219, 33986, 46811, 174003, 55948, 69210, 39471, 49741, 63500, 72326, 38951, 104703, 80274, 26067, 32557, 10647, 63174, 47609, 84564, 25389, 90939, 51121, 44802, 221984, 184533, 46509, 30136, 117365, 89576, 143112, 120265, 85859, 155781, 140351, 119843, 187337, 45901, 100552, 229087, 157390, 80684, 79563, 152513, 90716, 71792, 109505, 34535, 59793, 128758, 180649, 36933, 68840, 211959, 4327, 89975, 46650, 57728, 63410, 56381, 159421, 11892, 33254, 23265, 58108, 30365, 144549, 94635, 101996, 22309, 68146, 6690, 71501, 21555, 85089, 91820, 20667, 18871, 34720, 37056, 48373, 88992, 73279, 38978, 85864, 125887, 81737, 21088, 39133, 85673, 157178, 36378, 78638, 189287, 41406, 113477, 37691, 

In [10]:
def remove_section(section_name, text):
    text = re.sub(fr'=+{section_name}=+\n.*(?:=|$)', r'¬', text, flags=re.DOTALL)
    text = re.sub(r'¬$', '', text)

    return text

def clean_wiki_article(raw_text):
    text = raw_text

    regex = re.compile(r'\n\n')
    end_header = regex.search(text).start() + 2
    text = text[end_header:]

    # tables
    text = re.sub(r'\{\|.+?\|\}', '', text, flags=re.DOTALL)
    
    # quotes
    text = re.sub(r'\{\{quote \| (.+?)\}\}\n', r'\1\n', text)
    text = re.sub(r'<blockquote.*?>(.*?)</blockquote>', r'\1\n', text, flags=re.DOTALL)

    # images
    text = re.sub(r'\[\[File:.*\n', '', text)
    
    # random tags
    tags = ['poem', 'gallery', 'ref']
    for tag in tags:
        text = re.sub(fr'<{tag}.*?>.*?</{tag}>', '', text, flags=re.DOTALL)

    # special links
    text = re.sub(r'\{\{[^}]+\}\}', '', text)
    
    # wiki links
    text = re.sub(r'\[\[(?:[^|\]]+\|)?([^\]]+)\]\]', r'\1', text)

    # comments
    text = re.sub(r'<!--[^-]*-->', '', text)

    # math
    text = re.sub(r'(?::: )?<math>.*?</math>', '', text)

    # '''Artistotle'''
    text = re.sub(r"'{2,}([- \w]+)'{2,}", r'\1', text)

    # remove sections
    for section in ['References', 'Sources', 'Citations', 'See also', 'Notes', 'External links', 'Further reading']:
        text = remove_section(section, text)

    # title
    text = re.sub(r'\s*=+(.*?)=+', r'\1:', text)

    replace_chars = {
        r'\&nbsp;?': ' ',
    }
    for c, r in replace_chars.items():
        text = re.sub(c, r, text)

    return text


In [12]:
corpus = corpus_raw[0]
corpus += '\n'

for text_raw in corpus_raw[1:]:
    corpus += clean_wiki_article(text_raw)
    corpus += '\n'

In [13]:
with open('deep_learning_corpus_clean.txt', 'w') as fp:
    fp.write(corpus)

## Progress

In [None]:
# https://stackoverflow.com/questions/26385984/recursive-pattern-in-regex
# https://stackoverflow.com/questions/59437266/regex-pattern-recursively-in-python

import regex

b = corpus_raw[1]
print(b[(pos:=re.search('School of Aristotle', b).start())-200:pos+200])
regex.findall("\[\[((?>[^\[\]]+|(?R))*)\]\]", b)
b1 = regex.sub(r'((?!^)\[\[(?:[^\[\]]*|(?R))+\]\])', '\1', b)
print(b1[(pos:=re.search('School of Aristotle', b1).start())-200:pos+200])

def chars_at_level(s):
    out = ['[']
    nesting_level = 0

    for c in s:
        if c == '[':
            nesting_level += 1
        elif c == ']':
            nesting_level -= 1
        elif nesting_level == 1:
            out.append(c)

    out.append(']')
    return ''.join(out)
text = chars_at_level(text)

def parse(string):
    while True:
        # output = re.sub(r'(?<!^)\[\[([^\[\]]*)\]\](?!$)', '', string)
        output = re.sub(r'(\[\[[^\[]*)\[\[[^\]]+\]\]([^\]]*\]\])', r'\1 AA \2', string)
        if output == string:
            break
        string = output
    return output



b = corpus_raw[1]
print(b[(pos:=re.search('School of Aristotle', b).start())-200:pos+200])
b1 = parse(b)
print(b1[(pos:=re.search('School of Aristotle', b1).start())-200:pos+200])

# re.sub(r'(\[\[[^\[]*)\[\[[^\]]+\]\]([^\]]*\]\])', r'\1 AA \2', text)

simply "The Philosopher". His ethics, though always influential, gained renewed interest with the modern advent of [[virtue ethics]].

==Life==
[[File:20160518 092 mieza nympheum.jpg | thumb | left | School of Aristotle in [[Mieza, Macedonia|Mieza]], [[Macedonia (Greece)|Macedonia, Greece]]]]

In general, the details of Aristotle's life are not well-established. The biographies written in ancient 


AttributeError: 'NoneType' object has no attribute 'start'