In [22]:
from __future__ import print_function

import pickle
from datetime import datetime
import glob

from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import simplejson
import dateutil.parser

In [23]:
def load_germanwings():
    """Unpickle and return the object stored in ``../data/germanwings.pkl``.

    The path is relative to the notebook's working directory.

    NOTE: a later cell defines another ``load_germanwings`` that reads
    ``GERMANWINGS_PICKLE`` instead; whichever cell ran last wins, so
    re-running cells out of order changes what this name returns.

    NOTE: ``pickle.load`` can execute arbitrary code from the file; only
    point this at a pickle you produced yourself.
    """
    pickle_path = "../data/germanwings.pkl"
    with open(pickle_path, "rb") as handle:
        contents = pickle.load(handle)
    return contents

# Loaded once at cell execution; `loader` tests each record's short URL for
# membership in this object.
german_wings = load_germanwings()

In [24]:
def loader(filespec):
    """Yield one record per log line whose short URL is in ``german_wings``.

    Every file matching ``filespec`` is read as newline-delimited JSON and
    its name is printed as progress output.

    Args:
        filespec: glob pattern selecting the log files to read.

    Yields:
        dict with the keys ``short_url`` (from ``"g"``), ``country`` (from
        ``"c"``, default ``""``), ``timestamp`` (naive UTC ``datetime`` built
        from the epoch value in ``"t"``) and ``timezone`` (from ``"tz"``,
        default ``""``).

    Raises:
        KeyError: if a record that passes the filter has no ``"t"`` field.
    """
    # glob order is filesystem-dependent; sort so re-runs produce the same
    # row order.
    for filename in sorted(glob.glob(filespec)):
        print(filename)
        with open(filename) as f:
            for line in f:
                if not line.strip():
                    # A blank line is not a JSON document; skip it instead
                    # of letting the parser raise.
                    continue
                d = simplejson.loads(line)
                short_url = d.get("g")
                # Filter before building the record: unwanted lines
                # (including ones with no "g" key at all) never need their
                # timestamp parsed or their "t" field present.
                if short_url not in german_wings:
                    continue
                yield {
                    "short_url": short_url,
                    "country": d.get("c", ""),
                    "timestamp": datetime.utcfromtimestamp(d["t"]),
                    "timezone": d.get("tz", ""),
                }

In [33]:
from csv import DictReader, register_dialect

def load_remapped_timezones():
    """Return a dict mapping each timezone name to its ``offset1`` value.

    Reads ``../data/timezone-map.csv`` (relative to the working directory)
    as a CSV without a header row; the columns are labelled ``timezone``,
    ``offset1`` and ``offset2`` and only the first two are used. Values are
    kept as the strings found in the file. Prints a progress message.
    """
    print("Loading timezones")
    column_names = ["timezone", "offset1", "offset2"]
    offsets = {}
    with open("../data/timezone-map.csv") as csv_file:
        for record in DictReader(csv_file, fieldnames=column_names):
            # Later rows overwrite earlier ones that share a timezone name.
            offsets[record["timezone"]] = record["offset1"]
    return offsets


def load_remapped_country_codes():
    """Return a dict mapping each country code to its country name.

    Reads ``../data/country-code-lookup.csv`` (relative to the working
    directory) as a tab-separated file whose header row supplies the
    ``Code`` and ``Country name`` columns. Prints a progress message.

    Side effect: (re-)registers a csv dialect named ``'tabs'`` with a tab
    delimiter.

    Raises:
        KeyError: if the header row lacks ``Code`` or ``Country name``.
    """
    print("Loading country codes")
    # register_dialect() returns None, so its result is not worth binding;
    # the call is kept only for the named-dialect registration.
    register_dialect('tabs', delimiter='\t')
    with open("../data/country-code-lookup.csv") as f:
        reader = DictReader(f, dialect='tabs')
        return {row["Code"]: row["Country name"] for row in reader}


# Lookup dicts built once when this cell runs; save_germanwings() maps the
# `timezone` and `country` columns through their `.get` methods.
remapped_timezone = load_remapped_timezones()
remapped_country_code = load_remapped_country_codes()


Loading timezones
Loading country codes


In [77]:
# Path (relative to the working directory) of the pickled DataFrame that
# save_germanwings() writes and load_germanwings() reads back.
GERMANWINGS_PICKLE = "germanwings-hist.pkl"


def save_germanwings():
    """Build the click DataFrame from the raw logs and pickle it.

    Rows come from ``loader("../data-capstone/*.log")``. Five derived
    columns are added: ``timezone_offset`` and ``country_name`` (looked up
    in ``remapped_timezone`` / ``remapped_country_code``; unknown keys map
    to ``None``) and ``day``, ``hour``, ``minute`` taken from ``timestamp``.
    The frame is written to ``GERMANWINGS_PICKLE``; nothing is returned.
    """
    clicks = DataFrame(loader("../data-capstone/*.log"))

    # Mapping a column through a dict lookup:
    # http://stackoverflow.com/questions/24216425/adding-a-new-pandas-column-with-mapped-value-from-a-dictionary
    clicks["timezone_offset"] = clicks["timezone"].map(remapped_timezone.get)
    clicks["country_name"] = clicks["country"].map(remapped_country_code.get)

    # Pulling components out of a datetime column:
    # http://stackoverflow.com/questions/25146121/extracting-just-month-and-year-from-pandas-datetime-column-python
    moment = clicks["timestamp"].dt
    clicks["day"] = moment.day
    clicks["hour"] = moment.hour
    clicks["minute"] = moment.minute

    clicks.to_pickle(GERMANWINGS_PICKLE)

def load_germanwings():
    """Return the object pickled at ``GERMANWINGS_PICKLE``.

    ``save_germanwings()`` writes a DataFrame to that path, so this is its
    read-back counterpart.

    NOTE: this reuses the name of the earlier ``load_germanwings`` that
    reads ``../data/germanwings.pkl``; once this cell has run, that earlier
    definition is shadowed.
    """
    pickle_path = GERMANWINGS_PICKLE
    return pd.read_pickle(pickle_path)

In [78]:
def save_as_loadable_JSON(df, json_filename, column_remapping):
    """Write ``df`` to ``json_filename`` as one JSON array of row objects.

    Args:
        df: DataFrame to export; it is not modified.
        json_filename: path of the file to create or overwrite.
        column_remapping: ``{old_name: new_name}`` dict handed to
            ``DataFrame.rename(columns=...)`` before export, so the JSON
            keys use the new names.

    Returns:
        None.
    """
    # to_dict(orient="records") keeps every column's own dtype; building the
    # dicts from iterrows() would coerce each row to a single dtype (e.g.
    # ints become floats next to a float column).
    records = df.rename(columns=column_remapping).to_dict(orient="records")
    # Serialize before opening the target so a failure here does not leave
    # an empty or truncated file behind.
    payload = simplejson.dumps(records)
    with open(json_filename, "w") as f:
        f.write(payload)

In [117]:
# Grouping keys for the histogram exports: the three time-bucket columns,
# prefixed by either the country name or the timezone offset.
TIME_COLUMNS = ["day", "hour", "minute"]
COUNTRY_DF_COLUMNS = ["country_name"] + TIME_COLUMNS  # used by country_selection_df
TIMEZONE_COLUMNS = ["timezone_offset"] + TIME_COLUMNS  # used by make_timezone_offset


def country_selection_df(df):
    """Count rows per (country_name, day, hour, minute) bucket.

    Groups ``df`` by ``COUNTRY_DF_COLUMNS`` and counts the non-null
    ``timezone`` values in each group.

    Returns:
        A new DataFrame with the grouping columns followed by a ``count``
        column, on a fresh integer index.
    """
    per_bucket = df.groupby(COUNTRY_DF_COLUMNS)["timezone"].count()
    return per_bucket.rename("count").reset_index()

def make_US_CA(df):
    """Return per-minute click counts for rows whose ``country`` is US or CA.

    The filtered rows are summarised by ``country_selection_df``.
    """
    north_america = df.query('country == "US" or country == "CA"')
    return country_selection_df(north_america)

# Export the US/CA histogram; `country_name` is renamed to `key` in the JSON.
# NOTE(review): `df` is not assigned in any visible cell -- presumably it was
# set with `df = load_germanwings()` elsewhere; confirm so this cell survives
# Restart & Run All.
df_US_CA = make_US_CA(df)
save_as_loadable_JSON(df_US_CA, "urlhist_US_CA.json", {'country_name': 'key'})

  country_name  day  hour  minute  count
0       Canada   25     0       0    206
1       Canada   25     0       1    184
2       Canada   25     0       2    221
3       Canada   25     0       3    219
4       Canada   25     0       4    203


In [120]:
def make_US_DE_ES_IT(df):
    """Return per-minute click counts for US, DE, ES, IT and FR rows.

    NOTE: despite the function name, the filter also keeps ``FR``.
    The filtered rows are summarised by ``country_selection_df``.
    """
    selected = df.query('country == "US" or country == "DE" or country == "ES" or country == "IT" or country == "FR"')
    return country_selection_df(selected)

# Export the multi-country histogram; `country_name` is renamed to `key`.
# NOTE(review): `df` is not assigned in any visible cell -- confirm where it
# comes from so this cell survives Restart & Run All.
df_US_DE_ES_IT = make_US_DE_ES_IT(df)
save_as_loadable_JSON(df_US_DE_ES_IT, "urlhist_US_DE_ES_IT.json", {'country_name': 'key'})

In [119]:
def make_timezone_offset(df):
    """Count rows per (timezone_offset, day, hour, minute) bucket.

    Groups ``df`` by ``TIMEZONE_COLUMNS`` and counts the non-null
    ``timezone`` values in each group.

    Returns:
        A new DataFrame with the grouping columns followed by a ``count``
        column, on a fresh integer index.
    """
    per_bucket = df.groupby(TIMEZONE_COLUMNS)["timezone"].count()
    return per_bucket.rename("count").reset_index()

# Export the per-timezone-offset histogram; `timezone_offset` is renamed to
# `key` in the JSON.
# NOTE(review): `df` is not assigned in any visible cell -- confirm where it
# comes from so this cell survives Restart & Run All.
df_timezone = make_timezone_offset(df)
save_as_loadable_JSON(df_timezone, "urlhist_timezone_offset.json", {'timezone_offset': 'key'})