In [2]:
import wikipediaapi
import pandas as pd
import wikipedia
wikipedia.set_lang('en') # setting wikipedia language
# from nltk.tokenize import word_tokenize
import sys
from time import sleep
import re
import regex

## Data acquisition

In [6]:
def fetch_category_members(category_members, level=0, max_level=1):
    """
    Gather the article pages of a category, descending into its subcategories.
    Arguments:
    category_members - mapping whose values are the members of a category
                       (each member exposes .ns and, for categories, .categorymembers)
    level - depth of category_members relative to the starting category
    max_level - subcategories are only expanded while level < max_level
    Returns:
    a list with the member objects that live in the article namespace (ns == 0),
    in traversal order; the same article may appear more than once when it is
    reachable through several subcategories
    """
    collected = []
    for member in category_members.values():
        # Plain articles are kept as they are.
        if member.ns == 0:
            collected.append(member)
            continue
        # Depth budget exhausted: do not open any further subcategory.
        if level >= max_level:
            continue
        if member.ns == wikipediaapi.Namespace.CATEGORY:
            collected.extend(
                fetch_category_members(member.categorymembers, level=level + 1, max_level=max_level)
            )
    return collected

In [7]:
def get_words(article_names):
    """
    Download the plain-text content of every article in the given list.
    No tokenization is performed: each row holds the full text of one page.
    Arguments:
    article_names - list of article objects; only their .title attribute is used
    Returns:
    words_df - dataframe with a single 'biography' column, indexed by the position
               of the article in article_names; articles whose page cannot be
               resolved are skipped (their position is simply absent from the index)
    """
    total = len(article_names)
    positions = []
    contents = []
    for i, article in enumerate(article_names):
        # Reset on every iteration so a failed lookup can never reuse the page
        # fetched for the previous article.
        page = None
        try:
            page = wikipedia.page(article.title)
        except wikipedia.DisambiguationError as e:
            # Fall back to the first non-empty suggestion; wikipedia.page expects
            # a single title, not the whole list of options.
            options = [option for option in e.options if option != ""]
            if options:
                try:
                    page = wikipedia.page(options[0])
                except (wikipedia.DisambiguationError, wikipedia.PageError):
                    pass
        except wikipedia.PageError:
            pass

        if page is not None:
            positions.append(i)
            contents.append(page.content)

        # Progress bar: 20 slots plus a percentage. Integer arithmetic avoids the
        # float round-off that could leave the bar just short of 100%.
        done = i + 1
        sys.stdout.write('\r')
        sys.stdout.write("[%-20s] %d%%" % ('=' * (20 * done // total), 100 * done // total))
        sys.stdout.flush()
        # Pause between requests to stay gentle with the Wikipedia API.
        sleep(0.25)

    # Build the frame once instead of growing it row by row.
    words_df = pd.DataFrame({'biography': pd.Series(contents, index=positions, dtype=object)})
    return words_df

In [9]:
# Client for the English-language Wikipedia.
wiki_wiki = wikipediaapi.Wikipedia('en')
# Root category whose members are harvested.
people_pages = wiki_wiki.page("Category:People from Venice")
# Collect the articles of the category, expanding subcategories while level < 7.
people_articles = fetch_category_members(
    people_pages.categorymembers,
    level=0,
    max_level=7,
)

In [10]:
# Download the text of every collected article. This is network-bound and
# get_words sleeps 0.25 s per article, so the cell is slow for large categories.
venetian_biographies = get_words(people_articles)

[=                   ] 9%



  lis = BeautifulSoup(html).find_all('li')




In [15]:
# Quick visual check of the downloaded biographies (one article text per row).
venetian_biographies

Unnamed: 0,biography
0,The Mayor of Venice is an elected politician w...
1,Rabbi Samuel ben Abraham Aboab (Hebrew: רבי שמ...
2,"Afrob (born Robert Zemichiel on August 1, 1977..."
3,Giovanni Battista Agnello (fl. 1560–1577) was ...
4,Domenico Alberti (c. 1710 – 14 October 1740 or...
...,...
5394,"Villa Pisani at Stra refers to the monumental,..."
5395,The Sagredo were an aristocratic Venetian fami...
5396,Caterina Sagredo Barbarigo (14 July 1715 – 11 ...
5397,Giovanni Francesco Sagredo (1571– 5 March 1620...


## Cleaning

In [47]:
venetian_biographies = (
    venetian_biographies
    .drop_duplicates()          # keep a single copy of each identical row
    .reset_index(drop=True)     # renumber the rows 0..n-1 after the removal
)

In [49]:
# Strip markup left over in the article text:
#   * section headings wrapped in equal signs, like "== Overview =="
#   * anything between <> signs
# regex=True is passed explicitly because the pattern is a regular expression;
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave the text unchanged.
venetian_biographies = venetian_biographies.biography.str.replace(
    r"<(.*?)>|=+ (.*?) =+", "", regex=True
).to_frame()

In [51]:
# compression_opts = dict(method='zip',
#                         archive_name='venetian_bios.csv')  
# venetian_biographies.to_csv('venetian_bios.csv', index=False) 

In [272]:
# venetian_biographies = pd.read_csv('venetian_bios.csv')

In [269]:
def insert_sep(s):
    """
    Insert a " [SEP] " marker after the first sentence of a biography.
    Arguments:
    s - the text of one biography
    Returns:
    the text with characters outside the Common/Latin scripts removed, newlines
    turned into spaces, and the first space following the first sentence-ending
    word replaced by " [SEP] "; when no space follows that word the text is
    returned without a marker
    """
    # Keep only characters from the Common and Latin Unicode scripts.
    s = regex.sub(r'[^(\p{Common}|\p{Latin})]', u'', s)
    # Sentinel: guarantees the search below always finds a match, even for
    # text that contains no sentence-ending period. It is cut off again (the
    # last 4 characters) before returning.
    s = s + "end."
    s = s.replace("\n", " ")
    # First word of at least two characters followed by a period that is not
    # inside parentheses (single letters such as initials are ignored).
    # The position of the match itself is used: looking the matched text up
    # with str.index could land on an earlier, identical occurrence inside
    # parentheses and put the marker in the wrong place. re.search also stops
    # at the first match instead of scanning the whole text.
    first_end = re.search(r"\w{2,}?\.(?!(?:[^(]*\([^)]*\))*[^()]*\))", s)
    i = first_end.start()
    return s[:i] + s[i:-4].replace(" ", " [SEP] ", 1)

In [273]:
# Apply insert_sep to every biography; the result is a Series of marked-up
# texts (one per row), not a DataFrame.
df = venetian_biographies.biography.apply(insert_sep)

In [280]:
# Persist the marked-up biographies without the index column.
# NOTE(review): df is a Series; the default of `header` for Series.to_csv has
# presumably changed between pandas versions — consider passing it explicitly.
df.to_csv('venetian_bios_2.csv', index=False) 

  """Entry point for launching an IPython kernel.
