In [40]:
import wikipediaapi
import re

wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)

In [41]:
import json
from pprint import pprint

with open('articles.json') as f:
    data = json.load(f)

In [42]:
articles = data['articles']
articles

['Donald_Trump',
 'United_States',
 'Bitcoin',
 'Queen_Victoria',
 'Elon_Musk',
 'Facebook',
 'Barack_Obama',
 'YouTube',
 'Aristotle',
 'Statistics',
 'Chemistry',
 'Monkey',
 'Chess',
 'Python_(programming_language)',
 'Comedy',
 'Trial',
 'Executive_(government)',
 'State',
 'President_of_the_United_States',
 'George_H._W._Bush',
 'Politician',
 'Head_of_government',
 'Leo_Varadkar',
 'President_of_Ireland',
 'Veto',
 'China',
 'Confucianism',
 'Zhou_Dynasty',
 'Metropolitan_area',
 'Urban_area',
 'Word',
 'Religion',
 'Spirituality',
 'Sarah_Bernhardt',
 'Christopher_Columbus',
 'Yuri_Gagarin',
 'Archimedes',
 'Nicolaus_Copernicus',
 'Marie_Curie',
 'Galileo_Galilei',
 'Karl_Marx',
 'Friedrich_Nietzsche',
 'Alexander_the_Great',
 'Augustus',
 'Napoleon',
 'Gautama_Buddha',
 'Jesus',
 'Knowledge',
 'God',
 'Soul',
 'Judaism',
 'Family',
 'Politics',
 'Money',
 'International_Red_Cross_and_Red_Crescent_Movement',
 'United_Nations',
 'Military',
 'War',
 'Writing',
 'Literature',
 'As

In [43]:
def save_sections(sections, lst, level=0):
        for s in sections:
            skip = ["References", "Sources", "Further reading", "External links", "See also", "Other websites"]
            if not s.title in skip:
                lst.append(s.text.strip())
                save_sections(s.sections, lst, level + 1)

In [44]:
def get_simplified_page(wiki):
    return wiki.langlinks['simple']

In [45]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [46]:
all_complex = []
all_simple = []
for article in articles:
    
    # scrape wikipedia page
    page = wiki.page(article)
    complex_sections = []
    save_sections(page.sections, complex_sections)
    
    simple_sections = []
    save_sections(get_simplified_page(page).sections, simple_sections)
    
    def process_sentences(sections):
        sections = ". ".join(sections) \
            .replace('\n', ' ')
                    
        remove_re = r'[!@#$%^&*()<>"{}|,+0-9~;`]'
        sentences = re.sub(remove_re, '', sections)
        sentences = sentences.split(' ')
        
        to_filter = ['', ',', '', 'U', 'S', 's', "-Y"]
        sentences = filter(lambda x: x not in to_filter, sentences)
        sentences = filter(lemmatizer.lemmatize, sentences)
        sentences = map(lambda x: x.strip(), sentences)
        sentences = " ".join(list(sentences)).split(". ")
        sentences = filter(lambda x: x not in to_filter, sentences)
        return list(sentences)
    
    complex_sentences = process_sentences(complex_sections)
    #print(complex_sentences)

    simple_sentences = process_sentences(simple_sections)
    
    all_complex.extend(complex_sentences[:len(simple_sentences)])
    all_simple.extend(simple_sentences)
#    print(complex_sentences)
    print("Finished: {}".format(article))

Finished: Donald_Trump
Finished: United_States
Finished: Bitcoin
Finished: Queen_Victoria
Finished: Elon_Musk
Finished: Facebook
Finished: Barack_Obama
Finished: YouTube
Finished: Aristotle
Finished: Statistics
Finished: Chemistry
Finished: Monkey
Finished: Chess
Finished: Python_(programming_language)
Finished: Comedy
Finished: Trial
Finished: Executive_(government)
Finished: State
Finished: President_of_the_United_States
Finished: George_H._W._Bush
Finished: Politician
Finished: Head_of_government
Finished: Leo_Varadkar
Finished: President_of_Ireland
Finished: Veto
Finished: China
Finished: Confucianism
Finished: Zhou_Dynasty
Finished: Metropolitan_area
Finished: Urban_area
Finished: Word
Finished: Religion
Finished: Spirituality
Finished: Sarah_Bernhardt
Finished: Christopher_Columbus
Finished: Yuri_Gagarin
Finished: Archimedes
Finished: Nicolaus_Copernicus
Finished: Marie_Curie
Finished: Galileo_Galilei
Finished: Karl_Marx
Finished: Friedrich_Nietzsche
Finished: Alexander_the_Great

In [47]:
len(all_complex)

6509

In [48]:
len(all_simple)

6567

In [50]:
with open('generated_data/complex_sentences.txt', 'w') as file:
    for i in all_complex[:len(all_simple)]:
        file.write(i.lower() + '\n')
with open('generated_data/simple_sentences.txt', 'w') as file:
    for i in all_simple:
        file.write(i.lower() + '\n')

In [52]:
with open('generated_data/all_sentences.csv', 'w') as csv:
    for i in all_complex[:len(all_simple)]:
        csv.write("{},{}".format(i.lower(), "complex\n"))
    for i in all_simple:
        csv.write("{},{}".format(i.lower(), "simple\n"))