### Geocoding locations and localizing time
以下のカーネルを参考にしている。  
https://www.kaggle.com/xavierbourretsicotte/localizing-utc-time-eda-and-walkthrough

In [1]:
import os
import numpy as np
import pandas as pd
import time
import warnings
import json
import feather
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

import googlemaps
import pytz as pytz
from datetime import datetime
from tqdm import tqdm
from pandas.io.json import json_normalize
from pycountry_convert import ( map_countries, country_name_to_country_alpha3,country_name_to_country_alpha2)

### Example API call

In [2]:
#Setting up API key
# The key is read from the GOOGLE_MAPS_API_KEY environment variable so that the
# credential never lives in the notebook source (a missing variable fails fast
# with a KeyError). A key that was ever committed in a notebook should be
# treated as leaked and revoked.
gmaps = googlemaps.Client(key=os.environ['GOOGLE_MAPS_API_KEY'])

###EXAMPLE
# Geocoding an address (API CALL)
geocode_result = gmaps.geocode('1600 Amphitheatre Parkway, Mountain View, CA')

#Extracting timezone from latitude and longitude (API CALL)
# The location of the first geocoding match is passed to the timezone endpoint.
timez = gmaps.timezone(location = geocode_result[0]['geometry']['location'])
timez

{'dstOffset': 3600,
 'rawOffset': -28800,
 'status': 'OK',
 'timeZoneId': 'America/Los_Angeles',
 'timeZoneName': 'Pacific Daylight Time'}

### Preparing data and search term

In [3]:
#Utility function
def remove_missing_vals(x):
    """Return '' when x is one of the known placeholder strings, otherwise x unchanged."""
    placeholders = ('(not set)', 'not available in demo dataset', 'unknown.unknown')
    return '' if x in placeholders else x

In [4]:
# Load the interim train/test frames and attach a datetime version of the
# epoch-seconds 'visitStartTime' column to each of them.
train = feather.read_dataframe('../data/interim/train.ftr').assign(
    visitStartTime_datetime=lambda frame: pd.to_datetime(frame['visitStartTime'], unit='s'))
test = feather.read_dataframe('../data/interim/test.ftr').assign(
    visitStartTime_datetime=lambda frame: pd.to_datetime(frame['visitStartTime'], unit='s'))
train.shape, test.shape

((903653, 56), (804684, 54))

In [5]:
#Stack train and test rows into one frame
total = pd.concat([train, test], axis=0, sort=False)

#Geographical columns are the ones whose name contains "geoNetwork"
geoNetwork_columns = [name for name in train.columns if "geoNetwork" in name]

#Blank out "(not set)" and the other placeholder values, cell by cell
all_geo = total[geoNetwork_columns].applymap(remove_missing_vals)

#Join city, region and country with single spaces and keep each distinct string once
all_city_region_country = (
    all_geo['geoNetwork.city'] + ' ' + all_geo['geoNetwork.region'] + ' ' + all_geo['geoNetwork.country']
).unique()
print('Number of unique values:', all_city_region_country.shape)

Number of unique values: (2149,)


In [6]:
[term for term in all_city_region_country if "not" in term]

[]

In [7]:
[term for term in all_city_region_country if "N/A" in term]

[]

In [8]:
[term for term in all_city_region_country if "unknown" in term]

[]

In [9]:
pd.isnull(all_city_region_country).sum()

0

### API calls

In [10]:
#Collect one frame per search term; they are concatenated once after the loop
#instead of growing geocode_df row by row (which is quadratic).
geocode_frames = []
#(search_term, reason) for every term that produced no usable result
failed_terms = []

#Note this can be slow (30 - 45 mins for 3k lines)
for search_term in all_city_region_country:
    try:
        # Geocoding an address
        geocode_result = gmaps.geocode(search_term)
        if not geocode_result:
            failed_terms.append((search_term, 'no geocoding result'))
            continue

        #Extracting timezone from latitude and longitude
        timezone_result = gmaps.timezone(location=geocode_result[0]['geometry']['location'])

        #Normalize the result so that we can work with it as a df
        temp_df = json_normalize(geocode_result)
        temp_df['search_term'] = search_term
        # Every row of this term gets the time zone of the first match.
        temp_df['timeZoneId'] = timezone_result['timeZoneId']
    except Exception as exc:
        # Best effort: keep going, but remember what failed and why.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        failed_terms.append((search_term, repr(exc)))
        continue

    geocode_frames.append(temp_df)

    # for debug: report terms that did not yield exactly one match
    if temp_df.shape[0] != 1:
        print(search_term)

if geocode_frames:
    geocode_df = pd.concat(geocode_frames, ignore_index=True, sort=False)
else:
    # pd.concat rejects an empty list; fall back to an empty frame.
    geocode_df = pd.DataFrame()

print('Search terms skipped:', len(failed_terms))

Montreal Quebec United States
La Victoria Lima Region Argentina
San Jose California Taiwan
Amsterdam North Holland United States
Sydney New South Wales United States
Mountain View California Taiwan
Mountain View California Canada
Zurich Zurich Ireland
Dnipro Dnipropetrovsk Oblast United States
Shinjuku Tokyo Taiwan
Ningbo Zhejiang United States
London England Japan
Salem Virginia Costa Rica
Ningbo Zhejiang Japan
San Jose California Canada
Jakarta Jakarta Venezuela
Santiago de Surco Cusco Peru
Mountain View California Puerto Rico
Singapore  Ireland
Mountain View California Philippines
San Jose California Iraq
Sunnyvale California Germany
San Francisco California Slovakia
Santa Clara California Canada
Buenos Aires Buenos Aires United States
Cork County Cork Ireland
Warsaw Masovian Voivodeship United Kingdom
London England Netherlands
Sunnyvale California Canada
Mountain View California Colombia
Mountain View California Switzerland
Warsaw Masovian Voivodeship United States
Kitchener Ontar

In [11]:
# Shape of the geocoding table, then the number of missing values per column
print(geocode_df.shape)
geocode_df.isna().sum()

(2319, 20)


address_components                    0
formatted_address                     0
geometry.bounds.northeast.lat       733
geometry.bounds.northeast.lng       733
geometry.bounds.southwest.lat       733
geometry.bounds.southwest.lng       733
geometry.location.lat                 0
geometry.location.lng                 0
geometry.location_type                0
geometry.viewport.northeast.lat       0
geometry.viewport.northeast.lng       0
geometry.viewport.southwest.lat       0
geometry.viewport.southwest.lng       0
place_id                              0
types                                 0
search_term                           0
timeZoneId                            0
plus_code.compound_code            1629
plus_code.global_code              1629
partial_match                      2120
dtype: int64

In [74]:
# temp_df.shape[0] is not always 1. Investigate that case with one such search term.
# Note: this cell rebinds search_term, geocode_result and temp_df.
search_term = "Sunnyvale California Philippines"
geocode_result = gmaps.geocode(search_term)
temp_df = json_normalize(geocode_result)
temp_df

Unnamed: 0,address_components,formatted_address,geometry.location.lat,geometry.location.lng,geometry.location_type,geometry.viewport.northeast.lat,geometry.viewport.northeast.lng,geometry.viewport.southwest.lat,geometry.viewport.southwest.lng,place_id,plus_code.compound_code,plus_code.global_code,types
0,"[{'long_name': '905', 'short_name': '905', 'ty...","905 E Duane Ave, Sunnyvale, CA 94085, USA",37.388948,-122.004635,ROOFTOP,37.390297,-122.003286,37.387599,-122.005984,ChIJZc5kLju2j4ARpFeOuiLEfuU,"9XQW+H4 Sunnyvale, California, United States",849V9XQW+H4,"[establishment, food, point_of_interest, resta..."
1,"[{'long_name': '621', 'short_name': '621', 'ty...","621 Caliente Dr, Sunnyvale, CA 94085, USA",37.391365,-122.013304,ROOFTOP,37.392714,-122.011955,37.390016,-122.014653,ChIJ7xuuUja2j4ARW-6_MBecO88,"9XRP+GM Sunnyvale, California, United States",849V9XRP+GM,"[establishment, food, point_of_interest, resta..."


In [12]:
# for confirm
# Loads a previously saved pickle so its shape can be compared with geocode_df.
# NOTE(review): presumably this file is the output of the reference kernel — confirm its origin.
# pickle.load can execute arbitrary code, so only load files from a trusted source.
import pickle
with open('../data/interim/geocodes_timezones.pkl', mode='rb') as f:
    kernel_result = pickle.load(f)
kernel_result.shape

(2244, 20)

In [70]:
# Persist the geocoding results so the slow API loop does not have to be repeated.
with open('../data/interim/geocode_df.pickle', mode='wb') as f:
    pickle.dump(geocode_df, f)
# Read the file back and show its shape as a round-trip check.
with open('../data/interim/geocode_df.pickle', mode='rb') as f:
    geocode_df_r = pickle.load(f)
geocode_df_r.shape

(2319, 20)

### Utility functions

In [23]:
def time_zone_converter(x):
    """Return the first time zone name pytz lists for a country code.

    Parameters
    ----------
    x : value handed to ``pytz.country_timezones`` (pytz documents this as an
        ISO 3166 country code).

    Returns
    -------
    The first entry of the list pytz returns, or ``np.nan`` when the lookup
    cannot be resolved: a non-string value such as NaN (AttributeError), a
    code pytz does not know (KeyError), or an empty list (IndexError).
    """
    try:
        return pytz.country_timezones(x)[0]
    except (AttributeError, KeyError, IndexError):
        return np.nan
    
def time_localizer(s):
    """Convert a naive UTC time to the given time zone.

    Parameters
    ----------
    s : sequence or Series in the format [time, zone]; the first element is a
        naive datetime interpreted as UTC, the second a time zone name.

    Returns
    -------
    The time zone aware datetime in the requested zone, or ``np.nan`` when the
    conversion fails (for example an unknown zone name or a missing value).
    """
    try:
        # Unpack by position. Iterating works for lists, tuples and for
        # label-indexed rows alike, whereas s[0] / s[1] on a label-indexed
        # Series depends on deprecated positional fallback.
        utc_time, zone_name = tuple(s)[:2]
        tz = pytz.timezone(zone_name)
        return pytz.utc.localize(utc_time, is_dst=None).astimezone(tz)
    except Exception:
        # Best effort per row; KeyboardInterrupt / SystemExit still propagate.
        return np.nan
    
def map_timezone(x):
    """Look x up in the module-level timezone_dict, defaulting to 'UTC' for unknown keys."""
    return timezone_dict.get(x, 'UTC')

### Using the time zone information with pytz to localize time

In [24]:
#Set global variable, needed for map_timezone function
global timezone_dict
timezone_dict = dict(zip(geocode_df['search_term'], geocode_df['timeZoneId']))


def _attach_local_time(frame):
    """Add '_search_term', '_timeZoneId' and '_local_time' columns to frame in place."""
    # Foreign key '_search_term': cleaned city, region and country joined by single spaces
    city, region, country = (
        frame[col].map(remove_missing_vals)
        for col in ('geoNetwork.city', 'geoNetwork.region', 'geoNetwork.country')
    )
    frame['_search_term'] = city + ' ' + region + ' ' + country

    # Time zone id for each search term ('UTC' when the term is not in timezone_dict)
    frame['_timeZoneId'] = frame['_search_term'].map(map_timezone)

    # Time zone aware local time, stored as its string representation
    frame['_local_time'] = frame[['visitStartTime_datetime', '_timeZoneId']].apply(time_localizer, axis = 1).astype(str)


_attach_local_time(train)
_attach_local_time(test)

In [52]:
# Keep only the first 19 characters of each local-time string, then parse the
# result into a datetime column.
train['_local_time'] = pd.to_datetime(train['_local_time'].str.slice(stop=19))
test['_local_time'] = pd.to_datetime(test['_local_time'].str.slice(stop=19))

In [54]:
train.loc[train['geoNetwork.city'] == "Shinjuku", ['visitStartTime_datetime', '_local_time']].head()

Unnamed: 0,visitStartTime_datetime,_local_time
7178,2017-03-13 05:04:44,2017-03-13 14:04:44
7226,2017-03-13 00:09:58,2017-03-13 09:09:58
7593,2017-03-12 10:58:59,2017-03-12 19:58:59
7648,2017-03-13 06:25:08,2017-03-13 15:25:08
7666,2017-03-12 17:46:12,2017-03-13 02:46:12


In [57]:
train.loc[train['geoNetwork.city'] == "Shinjuku", '_local_time'].head().dt.hour

7178    14
7226     9
7593    19
7648    15
7666     2
Name: _local_time, dtype: int64

In [58]:
#Creating a df with visitstarttime as the index
sub_cols = ['fullVisitorId', 'sessionId', 'visitId','visitStartTime_datetime', 
             '_local_time', '_timeZoneId']

train[sub_cols].isnull().sum(), test[sub_cols].isnull().sum()

(fullVisitorId              0
 sessionId                  0
 visitId                    0
 visitStartTime_datetime    0
 _local_time                0
 _timeZoneId                0
 dtype: int64, fullVisitorId              0
 sessionId                  0
 visitId                    0
 visitStartTime_datetime    0
 _local_time                0
 _timeZoneId                0
 dtype: int64)

### Save train and test files

In [62]:
# Remove the intermediate helper columns in place; '_local_time' is kept.
train.drop(columns=["visitStartTime_datetime", "_search_term", "_timeZoneId"], inplace=True)
test.drop(columns=["visitStartTime_datetime", "_search_term", "_timeZoneId"], inplace=True)

In [66]:
# Write both frames, now carrying the '_local_time' column, to new feather files
# next to the interim inputs.
feather.write_dataframe(train, "../data/interim/train_with_localtime.ftr")
feather.write_dataframe(test, "../data/interim/test_with_localtime.ftr")

In [68]:
# dtypes of the raw epoch column and the derived local time, then a preview
print(train.dtypes[['visitStartTime', '_local_time']])
train.head()[['visitStartTime', '_local_time']]

visitStartTime             int64
_local_time       datetime64[ns]
dtype: object


Unnamed: 0,visitStartTime,_local_time
0,1472830385,2016-09-02 18:33:05
1,1472880147,2016-09-03 14:52:27
2,1472865386,2016-09-03 03:16:26
3,1472881213,2016-09-03 12:40:13
4,1472822600,2016-09-02 14:23:20


In [73]:
train.columns[~train.columns.isin(test.columns)].tolist()

['totals.transactionRevenue', 'trafficSource.campaignCode']

### Plotting UTC and local hour of the day

In [None]:
#Localize hour time
# This has to happen before the subset below because sub_cols lists
# '_local_hourofday'. The hour is taken with the .dt accessor; pd.to_datetime
# accepts '_local_time' both as a datetime column and in its earlier string form.
train['_local_hourofday'] = pd.to_datetime(train['_local_time']).dt.hour
test['_local_hourofday'] = pd.to_datetime(test['_local_time']).dt.hour

#Creating a df with visitstarttime as the index
sub_cols = ['fullVisitorId', 'sessionId', 'visitId','visitStartTime_datetime',
             '_local_time', '_timeZoneId', '_local_hourofday']


def _hourly_frame(frame):
    """Return a copy of the sub_cols present in frame, indexed by the UTC visit start time.

    Columns that an earlier cell dropped are skipped. 'visitStartTime_datetime'
    is rebuilt from the epoch-seconds 'visitStartTime' column when it is
    missing. A '_utc_hourofday' column is added from the index.
    """
    present = [col for col in sub_cols if col in frame.columns]
    frame_ts = frame[present].copy()
    if 'visitStartTime_datetime' not in frame_ts.columns:
        frame_ts['visitStartTime_datetime'] = pd.to_datetime(frame['visitStartTime'], unit='s')
    frame_ts.index = frame_ts['visitStartTime_datetime']
    frame_ts['_utc_hourofday'] = frame_ts.index.hour
    return frame_ts


train_ts = _hourly_frame(train)
test_ts = _hourly_frame(test)

#### Sessions per hour of day (UTC vs Local time)

In [None]:
# Session counts per hour of day, for UTC and local time, in both sets
df1 = train_ts.groupby('_utc_hourofday')['sessionId'].count()
df2 = train_ts.groupby('_local_hourofday')['sessionId'].count()
df3 = test_ts.groupby('_utc_hourofday')['sessionId'].count()
df4 = test_ts.groupby('_local_hourofday')['sessionId'].count()

# One (counts, bar colour, title) entry per panel, in subplot order
panels = [
    (df1, 'darkblue', 'Sessions per hour of day (Training UTC)'),
    (df2, 'darkblue', 'Sessions per hour of day (Training Local)'),
    (df3, 'darkred', 'Sessions per hour of day (Test UTC)'),
    (df4, 'darkred', 'Sessions per hour of day (Test Local)'),
]

plt.figure(figsize = (15,15))
for position, (counts, bar_color, panel_title) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    sns.barplot(x = counts.index, y = counts.values, color = bar_color, alpha = .6)
    plt.title(panel_title)

plt.show()