In [20]:
import os
import pickle
import re
import sys
from glob import glob

import dask
import dask.dataframe as ddf
import mwparserfromhell
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# get articles in selected categories

In [2]:
def wikipediaCategoryArticleNames(category):
    """Return the article titles listed on an English Wikipedia category page.

    Downloads ``https://en.wikipedia.org/wiki/Category:<category>``, finds the
    ``span#Pages_in_category`` heading and collects the ``title`` attribute of
    the links inside the first ``div`` that follows it. Only this single page
    is requested.

    Parameters
    ----------
    category : str
        Category name as used in the URL, e.g. ``'Tai_chi'``.

    Returns
    -------
    set of str
        Link titles that contain none of ``:``, ``?`` or ``//``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page has no ``span#Pages_in_category`` element.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    categoryPage = requests.get('https://en.wikipedia.org/wiki/Category:' + category,
                                timeout=30)
    # Fail on 4xx/5xx instead of parsing an error page.
    categoryPage.raise_for_status()
    cpxml = BeautifulSoup(categoryPage.text, 'html')
    headings = cpxml.select("span#Pages_in_category")
    if not headings:
        raise IndexError('no "Pages in category" section found for category '
                         + repr(category))
    page_list = headings[0].find_next("div")
    # Titles (not hrefs) are collected because they are later matched against
    # the processed wiki dump.
    article_titles = set()
    for a in page_list.find_all('a'):
        # Keep only anchors whose sole attributes are href and title.
        if not (a.has_attr('href') and a.has_attr('title') and len(a.attrs) == 2):
            continue
        ref = a['title']
        if ':' in ref or '?' in ref or '//' in ref:
            continue
        article_titles.add(ref)
    return article_titles

In [3]:
# Wikipedia category names (URL form) whose member articles are collected.
tcmCategories = {
    'Traditional_Chinese_medicine',
    'Acupuncture',
    'Baguazhang',
    'Chinese_medical_texts',
    'Plants_used_in_traditional_Chinese_medicine',
    'Fungi_used_in_traditional_Chinese_medicine',
    'Qigong',
    'Tai_chi',
    # 'Traditional_chinese_medical_pills',  # currently disabled
    'Wu_Xing',
}

In [4]:
# Scrape each selected category once: category name -> set of article titles.
tcmArticles = dict(
    (category_name, wikipediaCategoryArticleNames(category_name))
    for category_name in tcmCategories
)

In [5]:
# Flatten the per-category title sets into one de-duplicated list.
# A single set union replaces the quadratic sum(lists, []) chain, and sorting
# makes the order (and therefore the file pickled later) reproducible
# between kernel sessions.
titles = sorted(set().union(*tcmArticles.values()))

# get main article
Look up the most current content for each article, then search it for {{Main|xx}} markup.

In [6]:
# Load the pre-computed lookup used below as title_lookup[title] -> folder name.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open('../../intermediate-result/title-look-up.pickle', 'rb') as handle:
    title_lookup = pickle.load(handle)

In [7]:
folder_info = {} # folder name -> list of article titles (filled by add_to_folder_info)
articles_not_in_wiki_dump = [] # titles absent from title_lookup, i.e. not found in the wiki dump; may need to be fetched through the API instead

In [8]:
def add_to_folder_info(folder,title):
    """Record ``title`` under ``folder`` in the module-level ``folder_info`` dict.

    The folder's list is created on first use; titles are appended in call
    order and duplicates are not filtered.
    """
    folder_info.setdefault(folder, []).append(title)

In [9]:
def get_main_articles(title):
    """Return article titles referenced by ``{{Main|...}}`` templates of ``title``.

    The article text is read from the ``page.content`` parquet data of the
    folder given by ``title_lookup[title]`` and parsed with mwparserfromhell.

    Parameters
    ----------
    title : str
        Article title to inspect.

    Returns
    -------
    list of str
        Stripped template targets that are neither in the module-level
        ``titles`` list nor already collected, in order of appearance.
        Empty when ``title`` is not in ``title_lookup``; in that case the
        title is also appended to ``articles_not_in_wiki_dump``.
    """
    if title not in title_lookup:
        articles_not_in_wiki_dump.append(title)
        return []
    added_pages = []
    folder = title_lookup[title]
    # NOTE(review): this materialises the whole folder's page.content for
    # every call; presumably each title maps to a single row — confirm.
    article_content = ddf.read_parquet('../../../mount-files/all-data-extracted-page-info/' + folder +'/page.content').compute().loc[title]
    wikicode = mwparserfromhell.parse(article_content)
    for template in wikicode.filter_templates():
        # Template names may carry surrounding whitespace or a newline
        # ("{{Main |...}}"), so strip before comparing.
        if str(template.name).strip().lower() != 'main':
            continue
        for param in template.params:
            # Named parameters such as l1=Label are display options, not page
            # titles; explicitly numbered ones (1=Page) are still targets.
            if param.showkey and not str(param.name).strip().isdigit():
                continue
            target = str(param.value).strip()
            # Compare the stripped form so whitespace variants do not slip
            # past the duplicate check.
            if target and target not in titles and target not in added_pages:
                added_pages.append(target)
    return added_pages

In [10]:
# Accumulators for the {{Main|...}} expansion pass.
added_pages = []  # every extra title found through a Main template
added_info = {}  # source article title -> list of extra titles it contributed

In [11]:
# Collect Main-template targets for every seed article; articles that yield
# nothing are not recorded in added_info.
for title in titles:
    added = get_main_articles(title)
    if added:
        added_pages.extend(added)
        added_info[title] = added

In [12]:
# The same target can be referenced from several articles; keep one copy.
added_pages = [*set(added_pages)]

In [13]:
# Append only targets that are not in `titles` yet, so re-running this cell
# does not add the same pages a second time; sorted for a reproducible order.
titles.extend(sorted(set(added_pages) - set(titles)))

In [17]:
# Persist the final title list. Despite the .txt extension the file is opened
# in binary mode and holds pickle data, so read it back with pickle.load.
with open("../../intermediate-result/TCM/tcmTitles.txt", "wb") as fp:   #Pickling
    pickle.dump(titles, fp)

# Extract Article Content

In [23]:
# Root directory of the column-wise enwiki data that the cells below glob.
path = '../../../enwiki-columns/'

In [24]:
# Every entry directly under `path` whose name contains ".bz2".
all_folders = glob(path + '*.bz2*')

In [26]:
# Entries of the first folder that belong to the "bz20--2499" chunk; used only
# to discover which fields exist. Raises IndexError if all_folders is empty.
sub_files = glob(all_folders[0] + '/' + '*bz20--2499*')

In [27]:
# Field name = the part of each entry name that follows 'parquet_dir_'.
fields = [i.split('parquet_dir_')[1] for i in sub_files]

In [None]:
# page.title and revision.fileindex are written separately for every article
# by extractTCM.get_title_info, so exclude them from the generic field list.
# Filtering (instead of list.remove) keeps this cell safe to re-run: a second
# list.remove would raise ValueError because the entry is already gone.
fields = [field for field in fields
          if field not in ('page.title', 'revision.fileindex')]

In [None]:
# Output root; results are written to final_path/<field>/<article title>/.
final_path = '../../tcm-columns-add-main/'

In [None]:
# Make sure every field has an output directory. exist_ok=True replaces the
# separate exists() check, which could race with another process creating
# the same directory.
for i in fields:
    field_path = final_path + i
    os.makedirs(field_path, exist_ok=True)

In [None]:
# for each article, get the folder name
# then extract the info for all fields

In [None]:
# Reduce file reading time: titles stored in the same folder are grouped so that each folder is read only once.

In [None]:
# folder name -> list of article titles to extract from that folder
folder_title_info = {}

In [None]:
def update_folder_title_info(title,folder):
    """Append ``title`` to the list kept for ``folder`` in ``folder_title_info``.

    A new list is started the first time a folder is seen. Note the argument
    order: title first, folder second.
    """
    folder_title_info.setdefault(folder, []).append(title)

In [None]:
# Group the titles by the folder that holds them; titles missing from
# title_lookup are skipped.
for title in titles:
    if title not in title_lookup:
        continue
    folder = title_lookup[title]
    update_folder_title_info(title, folder)

In [None]:
def find_bznumber(text):
    """Return the first ``bz<digits>--<digits>`` token found in ``text``.

    The token must be delimited by word boundaries, e.g. ``'bz20--2499'`` in
    ``'dump.xml.bz20--2499.parquet_dir_page.title'``.

    Parameters
    ----------
    text : str
        Typically a file or directory name.

    Returns
    -------
    str
        The matched token, including the leading ``bz``.

    Raises
    ------
    ValueError
        If ``text`` contains no such token.
    """
    match = re.search(r'(\bbz(\d+--\d+)\b)',text)
    if match is None:
        # Without this guard the failure is an AttributeError on None.
        raise ValueError('no bz<start>--<end> token found in %r' % (text,))
    return match.group(0)

In [None]:
class extractTCM(object):
    """Extract the data of selected articles from one dump folder.

    For every title in ``kept_titles`` the rows of ``page.title``,
    ``revision.fileindex`` and each entry of the module-level ``fields`` list
    are written as parquet under ``final_path/<field>/<title>/``.

    Relies on the module-level names ``path``, ``final_path``, ``fields`` and
    ``find_bznumber``.

    Parameters
    ----------
    folder : str
        Name of the folder below ``path`` to read from.
    kept_titles : list of str
        Titles to extract from that folder.
    """

    # File-range tokens start at multiples of this value ("bz20--2499",
    # "bz22500--...", ...).
    CHUNK_SIZE = 2500

    def __init__(self, folder, kept_titles):
        self.folder = folder
        self.kept_titles = kept_titles

    def get_page_info(self):
        """Load titles and file indexes of the folder and keep the wanted rows.

        Sets ``page_title``, ``file_index`` (both computed frames),
        ``unique_files`` (one range token per page.title file) and
        ``page_info`` (the index-aligned join of both frames restricted to
        ``kept_titles``).
        """
        all_titles_files = glob(path + self.folder + '/' + '*page.title*')
        self.page_title = ddf.concat(
            [ddf.read_parquet(file) for file in all_titles_files]).compute()

        self.unique_files = list(map(find_bznumber, all_titles_files))

        file_index_files = glob(path + self.folder + '/' + '*revision.fileindex*')
        self.file_index = ddf.concat(
            [ddf.read_parquet(file) for file in file_index_files]).compute()

        page_info = self.page_title.join(self.file_index)
        self.page_info = page_info[page_info['page.title'].isin(self.kept_titles)]

    def find_file_number(self, min_ind, max_ind):
        """Return the range tokens of the files covering ``min_ind``..``max_ind``.

        Both bounds are rounded down to a multiple of ``CHUNK_SIZE``; for each
        multiple in between, the first entry of ``unique_files`` starting with
        ``'bz2<multiple>--'`` is returned.

        Raises
        ------
        IndexError
            If no entry of ``unique_files`` has the required prefix.
        """
        step = self.CHUNK_SIZE
        first_start = int(min_ind / step) * step
        last_start = int(max_ind / step) * step
        res = []
        for start in range(first_start, last_start + 1, step):
            prefix = 'bz2' + str(start) + '--'
            matches = [i for i in self.unique_files if i.startswith(prefix)]
            if not matches:
                raise IndexError('no file range token starts with ' + repr(prefix))
            res.append(matches[0])
        return res

    def get_kept_file_index(self):
        """Build ``kept_page_info``: per title the min/max fileindex and its files.

        The result is indexed by ``page.title`` with columns ``min``, ``max``
        and ``file_index`` (list of range tokens from ``find_file_number``).
        """
        self.kept_page_info = self.page_info.groupby('page.title').agg(
            {'revision.fileindex': ['min', 'max']})
        self.kept_page_info.columns = self.kept_page_info.columns.droplevel(0)
        # The former max_ind/max_file lookup was never used and could raise
        # IndexError when no file name ended with the largest index.
        self.kept_page_info['file_index'] = self.kept_page_info.apply(
            lambda d: self.find_file_number(d['min'], d['max']), axis=1)

    def extract_other_fields(self, kept_index, field, fileindex, title):
        """Write the rows of one field for one article to parquet.

        Parameters
        ----------
        kept_index : list
            Index labels of the rows to keep.
        field : str
            Field name; entries are selected when their path contains it.
        fileindex : list of str
            Range tokens of the files to read.
        title : str
            Article title, used as the output sub-directory name.
        """
        all_folders = []
        for i in fileindex:
            all_folders += glob(path + self.folder + '/' + '*' + i + '*')
        # NOTE(review): substring match — a field whose name is contained in
        # another field's name (or in the folder path) would also be picked up.
        all_files = [s for s in all_folders if field in s]
        df = ddf.concat([ddf.read_parquet(file) for file in all_files]).compute()
        df = df[df.index.isin(kept_index)]
        dask_df = ddf.from_pandas(df, chunksize=1000)
        fin_path = final_path + field + '/' + title + '/'
        dask_df.to_parquet(fin_path)

    def get_title_info(self, title):
        """Write page.title, revision.fileindex and all other fields for ``title``."""
        curr_page_info = self.page_info[self.page_info['page.title'] == title]
        # get and save title info
        curr_page_title_df = ddf.from_pandas(curr_page_info[['page.title']], chunksize=1000)
        curr_page_title_df.to_parquet(final_path + 'page.title/' + title + '/')
        # get and save fileindex info
        curr_fileindex_df = ddf.from_pandas(curr_page_info[['revision.fileindex']], chunksize=1000)
        curr_fileindex_df.to_parquet(final_path + 'revision.fileindex/' + title + '/')
        curr_kept_index = list(curr_page_info.index)
        curr_fileindex_list = self.kept_page_info.loc[title]['file_index']
        for i in fields:
            self.extract_other_fields(curr_kept_index, i, curr_fileindex_list, title)

    def run(self):
        """Load the folder metadata, then extract every title in ``kept_titles``."""
        self.get_page_info()
        self.get_kept_file_index()
        for t in self.kept_titles:
            self.get_title_info(t)


In [None]:
# One extractor per folder, so each folder's metadata is loaded only once.
for folder_name, folder_titles in folder_title_info.items():
    extractor = extractTCM(folder_name, folder_titles)
    extractor.run()