In [15]:
import re
import urllib.request
from bs4 import BeautifulSoup
from pandas import DataFrame
import pandas as pd
import datetime
import seaborn as sns

oldArmenianUrl = 'https://en.wiktionary.org/wiki/Category:Old_Armenian_lemmas'

In [38]:
def getSoupFromUrl(url):
    with urllib.request.urlopen(url) as response:
        html = response.read()
        html = html.decode('utf-8')
    return BeautifulSoup(html)

def getAllPageUrls(url0, verbose=False):
    urls = [url0]
    hasNext = True
    
    while hasNext:
        nextUrl = urls[-1]
        
        if verbose:
            print(len(urls), nextUrl)
                
        bs = getSoupFromUrl(nextUrl)
        
        nextPageLinks = bs.find_all('a', text='next page')
        if len(nextPageLinks) == 0:
            hasNext = False
        else:
            nextUrl = 'https://en.wiktionary.org/' + nextPageLinks[0].get('href')
            urls.append(nextUrl)
    
    return urls

def getCategoryEntries(url):
    bs = getSoupFromUrl(url)
    return DataFrame([{'name': li.text, 'url': li.a.get('href')} 
     for li in bs.find_all('div', class_='mw-content-ltr')[-1].find_all('li')])[['name','url']]

In [39]:
oldArmenianUrls = getAllPageUrls(oldArmenianUrl, verbose=True)

1 https://en.wiktionary.org/wiki/Category:Old_Armenian_lemmas
2 https://en.wiktionary.org//w/index.php?title=Category:Old_Armenian_lemmas&pagefrom=%D4%B1%D5%83%D4%BB%D5%92%D5%86%0A%D5%A1%D5%B3%D5%AB%D6%82%D5%B6#mw-pages
3 https://en.wiktionary.org//w/index.php?title=Category:Old_Armenian_lemmas&pagefrom=%D4%B1%D5%86%D5%80%D4%B5%D4%B4%D4%B5%D4%B4%D5%88%D5%92%D4%B9%D4%BB%D5%92%D5%86%0A%D5%A1%D5%B6%D5%B0%D5%A5%D5%A4%D5%A5%D5%A4%D5%B8%D6%82%D5%A9%D5%AB%D6%82%D5%B6#mw-pages
4 https://en.wiktionary.org//w/index.php?title=Category:Old_Armenian_lemmas&pagefrom=%D4%B1%D5%8D%D5%88%D5%92%D4%B5%D4%B1%D5%85%0A%D5%A1%D5%BD%D5%B8%D6%82%D5%A5%D5%A1%D5%B5#mw-pages
5 https://en.wiktionary.org//w/index.php?title=Category:Old_Armenian_lemmas&pagefrom=%D4%B1%D5%90%D5%8F%D4%B1%D4%BD%D5%88%D5%92%D5%90%D4%B1%D4%BF%0A%D5%A1%D6%80%D5%BF%D5%A1%D5%AD%D5%B8%D6%82%D6%80%D5%A1%D5%AF#mw-pages
6 https://en.wiktionary.org//w/index.php?title=Category:Old_Armenian_lemmas&pagefrom=%D4%B2%D4%B1%D5%86%D4%B5%D5%84%0A%D5%A2%D

In [63]:
testUrl = oldArmenianUrls[20]
bs = getSoupFromUrl(testUrl)
print(testUrl)

https://en.wiktionary.org//w/index.php?title=Category:Old_Armenian_lemmas&pagefrom=%D5%84%D4%BB%D4%B1%D5%87%D4%B1%D4%B2%D4%B1%D4%B9%0A%D5%B4%D5%AB%D5%A1%D5%B7%D5%A1%D5%A2%D5%A1%D5%A9#mw-pages


In [66]:
entries = []

for url in oldArmenianUrls:
    print(len(entries), end=' ' if (len(entries)+1) % 10 != 0 else '\n')
    df = getCategoryEntries(url)
    entries.append(df)
    
OldArmenian = pd.concat(entries)
OldArmenian

0 1 2 3 4 5 6 7 8 9
10 11 12 13 14 15 16 17 18 19
20 21 22 23 24 25 26 27 28 29
30 

Unnamed: 0,name,url
0,-ա-,/wiki/-%D5%A1-
1,-աբար,/wiki/-%D5%A1%D5%A2%D5%A1%D6%80
2,Աբգար,/wiki/%D4%B1%D5%A2%D5%A3%D5%A1%D6%80
3,աբեթ,/wiki/%D5%A1%D5%A2%D5%A5%D5%A9
4,աբեղայ,/wiki/%D5%A1%D5%A2%D5%A5%D5%B2%D5%A1%D5%B5
...,...,...
109,օրիորդ,/wiki/%D6%85%D6%80%D5%AB%D5%B8%D6%80%D5%A4
110,օրհն,/wiki/%D6%85%D6%80%D5%B0%D5%B6
111,օրհնեմ,/wiki/%D6%85%D6%80%D5%B0%D5%B6%D5%A5%D5%B4
112,-օք,/wiki/-%D6%85%D6%84


In [67]:
OldArmenian.to_csv('data/old_armenian_entries.csv')

In [69]:
OldArmenian.sample(30)

Unnamed: 0,name,url
118,-աւք,/wiki/-%D5%A1%D6%82%D6%84
19,երէ,/wiki/%D5%A5%D6%80%D5%A7
158,աղիճ,/wiki/%D5%A1%D5%B2%D5%AB%D5%B3
22,նէր,/wiki/%D5%B6%D5%A7%D6%80
174,աղուեսենի,/wiki/%D5%A1%D5%B2%D5%B8%D6%82%D5%A5%D5%BD%D5%...
103,եղունգն,/wiki/%D5%A5%D5%B2%D5%B8%D6%82%D5%B6%D5%A3%D5%B6
48,գրեանք,/wiki/%D5%A3%D6%80%D5%A5%D5%A1%D5%B6%D6%84
21,մայր,/wiki/%D5%B4%D5%A1%D5%B5%D6%80
171,Խարբերդ,/wiki/%D4%BD%D5%A1%D6%80%D5%A2%D5%A5%D6%80%D5%A4
42,մանրագոր,/wiki/%D5%B4%D5%A1%D5%B6%D6%80%D5%A1%D5%A3%D5%...
