In [None]:
import os

import requests
from bs4 import BeautifulSoup

In [None]:
def get_pageviews(year, month, day):
    """Yield the download URL of every pageview dump published for one day.

    The monthly index page on dumps.wikimedia.org is fetched and scanned for
    links whose target contains ``pageviews-<year><month><day>``.

    Parameters
    ----------
    year, month, day : str
        Date components, formatted exactly as they appear in the dump file
        names (e.g. ``'2016'``, ``'01'``, ``'01'``).

    Yields
    ------
    str
        Absolute URL of one matching dump file.

    Raises
    ------
    requests.HTTPError
        If the index page cannot be retrieved (for example a month that does
        not exist), instead of silently yielding nothing.
    """
    pageviews_url = 'https://dumps.wikimedia.org/other/pageviews/{0}/{0}-{1}/'.format(year, month)
    response = requests.get(pageviews_url)
    response.raise_for_status()
    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed and emits a warning about it.
    soup = BeautifulSoup(response.text, 'html.parser')
    wanted = 'pageviews-{0}{1}{2}'.format(year, month, day)
    # href=True skips anchors without an href attribute, which would otherwise
    # raise KeyError on a['href'].
    for a in soup.find_all('a', href=True):
        if wanted in a['href']:
            yield pageviews_url + a['href']
            
def write_file(url):
    """Stream the resource at ``url`` into ``data/<last path segment of url>``.

    Parameters
    ----------
    url : str
        Address of the file to download; the text after the final ``/`` is
        used as the local file name.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so that an error page is
        never saved to disk in place of the requested file.
    """
    local_filename = url.split("/")[-1]
    # Create the target directory on first use so a fresh checkout works.
    if not os.path.isdir('data'):
        os.makedirs('data')
    req = requests.get(url, stream=True)
    try:
        req.raise_for_status()
        with open('data/' + local_filename, 'wb') as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)
    finally:
        # A streamed response keeps its connection open until it is closed.
        req.close()

In [None]:
# Download every dump published for the selected day.
# NOTE(review): `year`, `month` and `day` are not defined in any visible cell —
# confirm they are set before this cell runs on a fresh kernel.
for url in get_pageviews(year, month, day):
    write_file(url)

In [None]:
# Smoke test: collect the dump URLs for 2016-01-01 and download only the first.
all_pageviews = list(get_pageviews('2016', '01', '01'))
write_file(all_pageviews[0])

In [None]:
# Local directory holding the downloaded dumps; os.path.join keeps the path
# separator platform independent.
LOCAL_DATA_PATH = os.path.join(os.getcwd(), "pageviews-gz")

def mv_files(filename, hdfs_dir, hdfs_conn):
    """Upload one local dump into its own HDFS directory.

    A directory named after the file (without its ``.gz`` extension) is
    created below ``hdfs_dir`` and the file from ``LOCAL_DATA_PATH`` is put
    inside it.

    Parameters
    ----------
    filename : str
        Name of a file located in ``LOCAL_DATA_PATH``.
    hdfs_dir : str
        Parent directory on HDFS; a trailing ``/`` is optional.
    hdfs_conn : object
        Connection exposing ``mkdir(path)`` and ``put(target, source)``.

    Returns
    -------
    str
        The HDFS directory that was created for this file.
    """
    # Drop the extension only when it is really there; slicing blindly would
    # eat three characters of any other name.
    stem = filename[:-3] if filename.endswith('.gz') else filename
    # HDFS paths always use "/". Add the separator when the caller left it
    # off, otherwise the new directory would be glued onto the parent's name.
    if hdfs_dir and not hdfs_dir.endswith('/'):
        hdfs_dir = hdfs_dir + '/'
    dir_name = hdfs_dir + stem
    hdfs_conn.mkdir(dir_name)
    filepathtarget = '/'.join([dir_name, filename])
    # put() receives the HDFS target first and the local source second.
    hdfs_conn.put(filepathtarget, os.path.join(LOCAL_DATA_PATH, filename))
    return dir_name

In [None]:
def extract_datetime(filename):
    """Split a ``<prefix>-<date>-<time>`` name into its date and hour parts.

    The name must contain exactly two ``-`` separators, otherwise the
    unpacking raises ``ValueError``.

    Returns
    -------
    tuple of str
        ``(year, month, day, hour)``: the first four, next two and last two
        characters of the date part, and the first two of the time part.
    """
    _prefix, stamp_date, stamp_time = filename.split("-")
    return stamp_date[:4], stamp_date[4:6], stamp_date[-2:], stamp_time[:2]

In [None]:
def to_pd_dt(filename):
    """Parse a ``pageviews-YYYYMMDD-HH0000`` name into a pandas timestamp.

    The minutes and seconds are matched as the literal text ``0000``, so only
    names stamped on the full hour are accepted.
    """
    stamp_format = 'pageviews-%Y%m%d-%H0000'
    return pd.to_datetime(filename, format=stamp_format)

In [None]:
# Column layout of one line in a pageviews dump file.
FILE_SCHEMA = ibis.schema([
    ('project_name', 'string'),
    ('page_name', 'string'),
    ('n_views', 'int64'),
    ('n_bytes', 'int64'),
])

def gz_2_data_insert(data_dir, ibis_conn, db_name):
    """Load the dump stored in ``data_dir`` into the ``wiki_pageviews`` table.

    The space-delimited files under ``data_dir`` are read through
    ``ibis_conn.delimited_file`` using ``FILE_SCHEMA``. Four columns
    (``year``, ``month``, ``day``, ``hour``) are added, holding the strings
    that ``extract_datetime`` parses from the last component of ``data_dir``.
    The rows are then inserted into ``wiki_pageviews`` in ``db_name``, or the
    table is created from them when it does not exist yet.

    Parameters
    ----------
    data_dir : str
        HDFS directory whose last path component is a
        ``<prefix>-<date>-<time>`` name.
    ibis_conn : object
        ibis client used for reading, inserting and creating tables.
    db_name : str
        Target database; obtained through ``safe_get_db``.
    """
    target_table = 'wiki_pageviews'

    raw_rows = ibis_conn.delimited_file(hdfs_dir=data_dir,
                                        schema=FILE_SCHEMA,
                                        delimiter=' ')
    dir_basename = data_dir.split("/")[-1]
    year, month, day, hour = extract_datetime(dir_basename)
    # Tag every row with the timestamp encoded in the directory name.
    rows_with_time = raw_rows.mutate(year=year, month=month, day=day, hour=hour)

    working_db = safe_get_db(ibis_conn, db_name)
    if target_table not in working_db.tables:
        ibis_conn.create_table('wiki_pageviews', obj=rows_with_time,
                               database=db_name)
    else:
        ibis_conn.insert('wiki_pageviews', rows_with_time, database=db_name)

In [None]:
def safe_get_db(ibis_conn, db_name):
    """Return the database ``db_name``, creating it first when it is missing.

    Parameters
    ----------
    ibis_conn : object
        Client exposing ``exists_database``, ``create_database`` and
        ``database``.
    db_name : str
        Name of the database to fetch.

    Returns
    -------
    object
        Whatever ``ibis_conn.database(db_name)`` returns.
    """
    already_there = ibis_conn.exists_database(db_name)
    if not already_there:
        ibis_conn.create_database(db_name)
    return ibis_conn.database(db_name)

In [None]:
# Keep only real dump archives: os.listdir also returns stray entries (hidden
# files, checkpoints) whose names extract_datetime cannot parse. Sorting gives
# a deterministic load order instead of the arbitrary order of os.listdir.
LOCAL_FILES = sorted(name for name in os.listdir(LOCAL_DATA_PATH)
                     if name.startswith('pageviews-') and name.endswith('.gz'))

# NOTE(review): `hdfs_dir`, `hdfs_conn`, `ibis_conn` and `db_name` are not
# defined in any visible cell — confirm they are created before this cell runs.
hdfs_gz_dirs = [mv_files(filename, hdfs_dir, hdfs_conn) for filename in LOCAL_FILES]
for data_dir in hdfs_gz_dirs:
    gz_2_data_insert(data_dir, ibis_conn, db_name)