In [1]:
import wikipediaapi
import pandas as pd
import wikipedia
wikipedia.set_lang('en') # setting wikipedia language
# from nltk.tokenize import word_tokenize
import sys
from time import sleep
import re
import regex

## Data acquisition

In [7]:
def fetch_category_members(category_members, level=0, max_level=1):
    """
    Collect the article pages of a category, descending into its subcategories.

    Arguments:
    category_members - mapping whose values are the members of a category
    level - depth of the category currently being walked
    max_level - subcategories are only followed while level < max_level
    Returns:
    collected - list of the member objects whose namespace is 0 (treated as articles)
    """
    collected = []
    for member in category_members.values():
        if member.ns == 0:
            # The member object itself is kept, not just its title.
            collected.append(member)
            continue
        if level >= max_level:
            # Depth budget exhausted: do not open any further subcategory.
            continue
        if member.ns == wikipediaapi.Namespace.CATEGORY:
            collected.extend(
                fetch_category_members(member.categorymembers, level=level + 1, max_level=max_level)
            )
    return collected

In [8]:
def _fetch_page(title):
    """Return the wikipedia page for `title`, or None when it cannot be resolved."""
    try:
        return wikipedia.page(title)
    except wikipedia.DisambiguationError as e:
        # Ambiguous title: retry once with the first non-empty suggestion.
        options = [option for option in e.options if option != ""]
    except wikipedia.PageError:
        return None
    if not options:
        return None
    try:
        return wikipedia.page(options[0])
    except (wikipedia.DisambiguationError, wikipedia.PageError):
        return None


def get_words(article_names, delay=0.25):
    """
    Function that downloads and returns the full text of the given list of articles
    Arguments:
    article_names - list of article objects; only their `.title` attribute is read
    delay - seconds to pause after each article (defaults to the original 0.25)
    Returns:
    words_df - dataframe with one 'biography' column holding the text of every
               article that could be resolved; unresolved articles are skipped
    """
    total = len(article_names)
    contents = []
    for i, article in enumerate(article_names):
        page = _fetch_page(article.title)
        # Skip unresolved titles instead of re-using the previous article's page
        # (or crashing on an unbound name when the very first lookup fails).
        if page is not None:
            contents.append(page.content)
        done = i + 1
        # Redraw a 20-character progress bar in place.
        sys.stdout.write('\r')
        sys.stdout.write("[%-20s] %d%%" % ('=' * (20 * done // total), 100 * done // total))
        sys.stdout.flush()
        sleep(delay)
    # Build the frame once rather than growing it row by row.
    return pd.DataFrame(contents, columns=['biography'])

In [9]:
# Client for the English-language Wikipedia.
wiki_wiki = wikipediaapi.Wikipedia('en')

# Collect the members of the category of interest; subcategories are followed
# while the current depth is below max_level.
people_pages = wiki_wiki.page("Category:People from Venice")
people_articles = fetch_category_members(
    people_pages.categorymembers,
    level=0,
    max_level=7,
)

In [11]:
# Download the text of every collected article into a one-column ('biography') dataframe.
# Slow: get_words issues at least one page request and then pauses for each article.
venetian_biographies = get_words(people_articles)

[=                   ] 9%



  lis = BeautifulSoup(html).find_all('li')




In [12]:
# Drop repeated biographies and renumber the remaining rows from 0.
venetian_biographies = venetian_biographies.drop_duplicates(ignore_index=True)

In [13]:
# Persist the de-duplicated raw biographies; index=False keeps the row index out of the file.
venetian_biographies.to_csv('venetian_bios_full.csv', index=False) 

In [116]:
# Spot-check: read the saved file back and display the biography stored at row 394.
pd.read_csv('venetian_bios_full.csv')['biography'][394]

'Matteo Ponzone (17th century) was an Italian painter of the Baroque period, active between 1630 and 1700 mainly in Venice. He was a pupil of Santo Peranda.\nSeveral of his works are in the churches and public buildings of Venice, particularly in San Giorgio Maggiore, and in the church of the "Padri Croceferi".\n\n\n== Life ==\n\nAccording to several sources, Ponzone was born in Venice, identified as «Mathi et Simon fiol de noble Patron Claudio Bolzon et Agnesina Negro equal in Madonna» born in the parish of Saint Moses November 9, 1583. Some other sources reported his date of birth approximately in 1586 in Rab, in the far north of Dalmatia, that time owned by Republic of Venice.\nMatteo Ponzone operated mainly in Venice, unless an interim period of ten years spent in Dalmatia, leaving their works in various locations of the coast.Ponzone was young student of Jacopo Palma the Younger, and was related to the painter Sante Peranda, who was probably one of his teachers and whose influence

## Cleaning

In [44]:
# Repeats the de-duplication done in the data-acquisition section; this changes nothing
# when that cell has already run on the same dataframe.
venetian_biographies = venetian_biographies.drop_duplicates().reset_index(drop=True)

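Truncate each biography at the first section heading that does not belong to the life story, removing that heading and everything after it: References, Sources, Notes, External links, Works (and headings such as "Major works"), Further readings, Gallery, or Exhibitions.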

Biography 394 is a good example: it contains a 'Major Works' section.

In [133]:
# Keep only the text that precedes the first "appendix" heading (References, Works, ...).
# - re.IGNORECASE: heading capitalisation varies ("Major Works" vs "Major works").
# - "readings?": Wikipedia's usual heading is the singular "Further reading".
# - maxsplit=1: only the part before the first matching heading is used.
venetian_biographies.biography = venetian_biographies.biography.apply(
    lambda x: re.split(
        r"=+ (References|Sources|Notes|External links|Works|Further readings?|Gallery|Exhibitions|\w+ works.*?) =+",
        x,
        maxsplit=1,
        flags=re.IGNORECASE,
    )[0]
)

In [138]:
# Remove anything between equal signs like "== Overview =="
# Remove anything between <> signs
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by
# default, in which case nothing would be removed.
venetian_biographies = venetian_biographies.biography.str.replace(
    r"<(.*?)>|=+ (.*?) =+", "", regex=True
).to_frame()

Dates: build a second dataset that keeps only the biographies that never mention a year from 1900 onwards.

In [155]:
# Keep the rows whose text contains no digit run of the form 19dd, 29dd or 2ddd.
# The match is a plain substring search, so such a run inside a longer number also counts.
venetian_biographies_old_dates = venetian_biographies.loc[
    lambda frame: ~frame["biography"].str.contains(r"[1-2][9][0-9][0-9]|[2][0-9]{3}")
]

In [51]:
# compression_opts = dict(method='zip',
#                         archive_name='venetian_bios.csv')  
# venetian_biographies.to_csv('venetian_bios.csv', index=False) 

In [272]:
# venetian_biographies = pd.read_csv('venetian_bios.csv')

In [156]:
def insert_sep(s):
    """
    Insert a " [SEP] " marker after the first sentence of a biography.

    The text is first stripped of every character outside the Common and Latin
    Unicode scripts and its newlines are turned into spaces. The first sentence
    is taken to end at the first word of at least two characters that is
    directly followed by a period and is not inside parentheses.

    Arguments:
    s - the biography text
    Returns:
    the cleaned text with " [SEP] " in place of the first space after that word;
    the cleaned text unchanged when no space follows it
    """
    s = regex.sub(r'[^(\p{Common}|\p{Latin})]', u'', s)
    # Sentinel sentence end, so the search below always finds a match; it is
    # cut off again by the [:-4] slice in the return statement.
    s = s + "end."
    s = s.replace("\n", " ")
    first_stop = re.search(r"\w{2,}?\.(?!(?:[^(]*\([^)]*\))*[^()]*\))", s)
    # Use the match's own offset. Looking the matched text up with s.index()
    # could land on an earlier occurrence of the same token, e.g. one inside
    # parentheses, and put the marker in the wrong place.
    i = first_stop.start()
    return s[:i] + s[i:-4].replace(" ", " [SEP] ", 1)

In [157]:
# Mark the first sentence of every biography with [SEP].
# Note: the name is rebound from a DataFrame to the resulting Series.
venetian_biographies = venetian_biographies.biography.apply(insert_sep)

In [158]:
# Same [SEP] marking for the date-filtered subset.
# Note: the name is rebound from a DataFrame to the resulting Series.
venetian_biographies_old_dates = venetian_biographies_old_dates.biography.apply(insert_sep)

In [280]:
# df.to_csv('venetian_bios_2.csv', index=False) 

  """Entry point for launching an IPython kernel.


In [165]:
# Save the [SEP]-marked biographies; to_frame() turns the Series back into a one-column table.
venetian_biographies.to_frame().to_csv('bios_all_dates.csv', index=False) 

In [166]:
# Save the [SEP]-marked, date-filtered biographies; to_frame() turns the Series back into a one-column table.
venetian_biographies_old_dates.to_frame().to_csv('bios_old_dates.csv', index=False) 

In [3]:
# Read the date-filtered file back and display it as a sanity check.
pd.read_csv('bios_old_dates.csv')

Unnamed: 0,biography
0,Rabbi Samuel ben Abraham Aboab (Hebrew: ; ...
1,Giovanni Battista Agnello (fl. 1560–1577) was ...
2,Domenico Alberti (c. 1710 – 14 October 1740 or...
3,Pietro Cesare Alberti (1608–1655) — later Pete...
4,Tomaso Giovanni Albinoni (8 June 1671 – 17 Jan...
...,...
1159,Manon Balletti (1740–1776) was the daughter of...
1160,"Gaetano Casanova (2 April 1697, Parma – 18 Dec..."
1161,"Lochac, Locach or Locat is a country far south..."
1162,Marco Polo is a lunar impact crater that is lo...
