In [72]:
import wikipediaapi
import pandas as pd
import wikipedia
wikipedia.set_lang('en') # setting wikipedia language
from nltk.tokenize import word_tokenize
import sys
from time import sleep

In [26]:
def fetch_category_members(category_members, level=0, max_level=1):
    """
    Collect the article pages of a category, optionally descending into subcategories.

    Arguments:
    category_members - mapping whose values are the members of a category (only
                       ``.values()`` is used); every member must expose ``ns`` and,
                       for subcategories, ``categorymembers``
    level - depth of ``category_members`` relative to the starting category
    max_level - subcategories are expanded only while ``level < max_level``
    Returns:
    article_names - list of the member objects with ``ns == 0`` (the page objects
                    themselves, not title strings), in traversal order. A page that
                    is reachable through several visited categories is listed once
                    per occurrence.
    """
    article_names = []
    for member in category_members.values():
        if member.ns == 0:
            article_names.append(member)
        elif level < max_level and member.ns == wikipediaapi.Namespace.CATEGORY:
            # extend() appends in place instead of rebuilding the whole list per subcategory
            article_names.extend(
                fetch_category_members(member.categorymembers, level=level + 1, max_level=max_level)
            )
    return article_names

In [65]:
def _resolve_page(title):
    """Return the ``wikipedia`` page for ``title``, or None when it cannot be resolved."""
    try:
        return wikipedia.page(title)
    except wikipedia.DisambiguationError as err:
        # Fall back to the first non-empty candidate listed by the disambiguation error.
        options = [option for option in err.options if option != ""]
        if not options:
            return None
        try:
            return wikipedia.page(options[0])
        except (wikipedia.DisambiguationError, wikipedia.PageError):
            return None
    except wikipedia.PageError:
        return None


def _clean_tokens(text):
    """Tokenize ``text``, lower-case the tokens and drop one-character and all-digit ones."""
    tokens = (token.lower() for token in word_tokenize(text))
    return [token for token in tokens if len(token) > 1 and not token.isdigit()]


def get_words(article_names, delay=0.25):
    """
    Function that tokenizes and returns all words in the given list of articles

    Arguments:
    article_names - list of article objects; only their ``title`` attribute is read
    delay - seconds to sleep after each article (defaults to the previous fixed 0.25)
    Returns:
    words_df - dataframe with the columns 'article' (the title) and 'words' (list of
               lower-cased tokens, without one-character and all-digit tokens).
               The index is the article's position in ``article_names``; articles
               whose page cannot be resolved are skipped, so their position is
               absent from the index.
    """
    total = len(article_names)
    rows = []
    positions = []
    for i, article in enumerate(article_names):
        page = _resolve_page(article.title)
        # Skip unresolved articles instead of reusing the page of a previous iteration.
        if page is not None:
            rows.append((article.title, _clean_tokens(page.content)))
            positions.append(i)
        done = i + 1
        # Integer arithmetic so the bar reaches exactly 20 marks / 100% on the last item.
        sys.stdout.write('\r')
        sys.stdout.write("[%-20s] %d%%" % ('=' * (20 * done // total), 100 * done // total))
        sys.stdout.flush()
        sleep(delay)
    # Build the frame once rather than growing it row by row.
    return pd.DataFrame(rows, columns=['article', 'words'], index=positions)

In [47]:
# Client for the English-language Wikipedia
wiki_wiki = wikipediaapi.Wikipedia('en')
# Category of interest: walk it and its subcategories (up to 7 levels deep) for articles
people_pages = wiki_wiki.page("Category:People from Venice")
people_articles = fetch_category_members(
    people_pages.categorymembers,
    max_level=7,
)

In [49]:
# Number of entries collected by fetch_category_members for the category above.
len(people_articles)

5399

In [64]:
# Spot-check: look up the second collected entry by its title through the `wikipedia` package.
wikipedia.page(people_articles[1].title)

<WikipediaPage 'Samuel Aboab'>

In [50]:
# Echo the collected entries.
# NOTE(review): this displays the entire list; a slice such as people_articles[:10] would keep the output short.
people_articles

[Mayor of Venice (id: ??, ns: 0),
 Samuel Aboab (id: ??, ns: 0),
 Afrob (id: ??, ns: 0),
 Giovanni Battista Agnello (id: ??, ns: 0),
 Domenico Alberti (id: ??, ns: 0),
 Pietro Cesare Alberti (id: ??, ns: 0),
 Tomaso Albinoni (id: ??, ns: 0),
 Pietro Alcionio (id: ??, ns: 0),
 Innocente Alessandri (id: ??, ns: 0),
 Pope Alexander VIII (id: ??, ns: 0),
 Francesco Algarotti (id: ??, ns: 0),
 David ben Solomon Altaras (id: ??, ns: 0),
 Maria Luisa Altieri Biagi (id: ??, ns: 0),
 Angelo Maria Amorevoli (id: ??, ns: 0),
 Marcantonio Amulio (id: ??, ns: 0),
 Giulia Andreani (id: ??, ns: 0),
 Andrew III of Hungary (id: ??, ns: 0),
 Attilio Anelli-Monti (id: ??, ns: 0),
 Giuseppe Angeli (id: ??, ns: 0),
 Edi Angelillo (id: ??, ns: 0),
 Baldassare d'Anna (id: ??, ns: 0),
 Domiziano Arcangeli (id: ??, ns: 0),
 Paolo Aretino (id: ??, ns: 0),
 Marc-René de Voyer de Paulmy d'Argenson (1652–1721) (id: ??, ns: 0),
 Roberto Assagioli (id: ??, ns: 0),
 Giovanna Baccelli (id: ??, ns: 0),
 Giacomo Badoaro

In [73]:
# NOTE(review): `take_words` is not defined in any visible cell of this notebook — presumably an
# earlier name/variant of `get_words`; confirm. Unless it is defined elsewhere, this call raises
# NameError on a fresh kernel.
take_words(people_articles)



Unnamed: 0,article,words
0,Mayor of Venice,"[the, mayor, of, venice, is, an, elected, poli..."
1,Samuel Aboab,"[rabbi, samuel, ben, abraham, aboab, hebrew, ר..."


In [None]:
# venetians = get_words(people_articles)

In [75]:
# venetians.to_pickle("./venetians.pkl")

In [76]:
# Load the cached word table; rebuild and cache it when the pickle is missing so the
# notebook also runs from a fresh checkout. The rebuild fetches every article over the
# network and therefore takes a long time.
# NOTE: only unpickle files you produced yourself - unpickling can execute arbitrary code.
try:
    venetians = pd.read_pickle("./venetians.pkl")
except FileNotFoundError:
    venetians = get_words(people_articles)
    venetians.to_pickle("./venetians.pkl")
venetians

Unnamed: 0,article,words
0,Mayor of Venice,"[the, mayor, of, venice, is, an, elected, poli..."
1,Samuel Aboab,"[rabbi, samuel, ben, abraham, aboab, hebrew, ר..."
2,Afrob,"[afrob, born, robert, zemichiel, on, august, i..."
3,Giovanni Battista Agnello,"[giovanni, battista, agnello, fl, 1560–1577, w..."
4,Domenico Alberti,"[domenico, alberti, c., october, or, was, an, ..."
...,...,...
5394,"Villa Pisani, Stra","[villa, pisani, at, stra, refers, to, the, mon..."
5395,Sagredo family,"[the, sagredo, were, an, aristocratic, venetia..."
5396,Caterina Sagredo Barbarigo,"[caterina, sagredo, barbarigo, july, february,..."
5397,Giovanni Francesco Sagredo,"[giovanni, francesco, sagredo, 1571–, march, w..."
