In [21]:
from elasticsearch import Elasticsearch
import json
import requests

import datetime
import time

import numpy as np
from numpy.random import randn
from random import shuffle
import pandas as pd
from scipy import stats
import matplotlib as mpl

import matplotlib.pyplot as plt
import seaborn as sns

import math

from sklearn.ensemble import RandomForestClassifier

import plotly.plotly as py
from plotly.graph_objs import *

import eslogin
import settings

%matplotlib inline
mpl.style.use('ggplot')

pd.options.display.max_rows = 10
pd.options.display.max_columns = 30

#Download data

In [2]:
datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

'2015-06-19 09:42:00'

In [3]:
es = Elasticsearch([{'host': eslogin.host, 'port': eslogin.port}],http_auth=(eslogin.user,eslogin.password))

In [4]:
es.count(index="events-*", body={'query': {'bool': {'must':[{'match': { 'et' : 'AD_CLICK' }}], 'must_not':[{'match': { 'fr' : 'true' }}]}}})['count']

11019

In [5]:
clicks = es.search(index="events-*", body={'query': {'bool': {'must':[{'match': { 'et' : 'AD_CLICK' }}], 'must_not':[{'match': { 'fr' : 'true' }}]}}}, size = 11019)

In [6]:
df_clicks = pd.DataFrame([datum['_source'] for datum in clicks['hits']['hits']])

In [7]:
df_clicks.shape

(11019, 46)

In [8]:
df_clicks.to_csv("../data/ad_clicks_150619.tab",sep='\t',encoding='utf-8')

In [2]:
df_clicks = pd.read_csv("../data/ad_clicks_150619.tab",sep='\t',index_col=0,encoding='utf-8')

In [3]:
clicks_uniq = df_clicks.loc[np.where(np.unique(df_clicks['created_at']))[0], :]

In [4]:
clicks_uniq.shape

(11018, 46)

In [11]:
es.count(index="events-*", body={'query': {'bool': {'must':[{'match': { 'et' : 'AD_SHOW' }}], 'must_not':[{'match': { 'fr' : 'true' }}]}}})['count']

5460523

In [12]:
randomInd = np.random.choice(range(5460523), size=20000, replace=False)

In [13]:
shows = []

In [20]:
for i in randomInd:
    event = es.search(index="events-*", body={'query': {'bool': {'must':[{'match': { 'et' : 'AD_SHOW' }}], 'must_not':[{'match': { 'fr' : 'true' }}]}}}, from_ = i, size = 1)
    shows = shows + event['hits']['hits']

In [22]:
df_shows = pd.DataFrame([datum['_source'] for datum in shows])
df_shows.to_csv("../data/ad_shows_150619.tab",sep='\t',encoding='utf-8')

In [6]:
df_shows = pd.read_csv("../data/ad_shows_150619.tab",sep='\t',index_col=0,encoding='utf-8',low_memory=False)

In [8]:
shows_uniq = df_shows.loc[np.where(np.unique(df_shows['created_at']))[0], :]
shows_uniq.shape

(19973, 34)

#Prepare for the data set

In [10]:
duplicate_id = set(shows_uniq['id']) - set(clicks_uniq['id'])
shows_uniq_only = shows_uniq.loc[shows_uniq['id'].apply(lambda x: x in duplicate_id), :]

In [11]:
clicks_uniq['click'] = 1
shows_uniq_only['click'] = 0

In [12]:
data = pd.concat([clicks_uniq[['ai','br','co','cty','rgn','plfm','pv','dt','sdpt','isp',
                               'ernc','dp','vw_drtn','vw_perc','cr','created_at','et','click']],
                  shows_uniq_only[['ai','br','co','cty','rgn','plfm','pv','dt','sdpt','isp',
                                   'ernc','dp','vw_drtn','vw_perc','cr','created_at','et','click']]],
                 axis=0)

In [13]:
def get_date(timestring):
    mytime = timestring.split('.')[0]
    return datetime.datetime.strptime(mytime, '%Y-%m-%dT%X')
data['created_at'] = data['created_at'].apply(get_date)

In [16]:
data['br_short'] = data['br'].apply(lambda x:str(x).split()[0])

In [17]:
ad_type = pd.read_csv("../data/zenit_creatives_parsed.tab",sep='\t',encoding='utf-8')

In [18]:
def get_ad_type(cr_id):
    if cr_id > 0 and cr_id in ad_type['cr']:
        return ad_type['ad_type'][ad_type['cr'].apply(lambda x: x == cr_id)].values[0]
    else:
        return np.NaN

In [19]:
data = pd.merge(data, ad_type, left_on ='cr', right_on ='cr', how='left')

In [78]:
len(np.unique(data['cty'].values))

3828

In [87]:
from geopy import geocoders
g = geocoders.GoogleV3()
place, (lat, lng) = g.geocode('Eastbourne')

In [124]:
g2 = geocoders.GoogleV3(settings.google_api_key)
g2.timezone((lat, lng))

<DstTzInfo 'Europe/London' LMT-1 day, 23:59:00 STD>

In [128]:
def get_loc(city):
    try:
        #g = geocoders.GoogleV3()
        place, (lat, lng) = g.geocode(city)
        
        return (place, lat, lng)
    except:
        return np.NaN

In [148]:
get_loc('Pasadena')

nan

In [121]:
city_loc = {}
for city in cities:
    city_loc[city] = get_loc(city)

In [122]:
json.dump(city_loc, open("../data/city_location_150623.txt",'w'))

^Most of them are NA... query too often?

In [144]:
import time
city_loc = {}
for city in cities:
    city_loc[city] = get_loc(city)
    time.sleep(2)

In [145]:
json.dump(city_loc, open("../data/city_location_150624.txt",'w'))

^Looks much better this time!

In [146]:
data.to_csv("../data/data_150624.tab",sep='\t',encoding='utf-8')

In [67]:
def utc_to_local(time, city):
    try:
        # get the local time zone
        g = geocoders.GoogleV3()
        place, (lat, lng) = g.geocode(city)
        timezone = g.timezone((lat, lng))

        tz = pytz.timezone(timezone.zone)
    
        # convert UTC to local
        utc = time.replace(tzinfo=pytz.utc)
        local = tz.normalize(utc.astimezone(tz))
    
        return local
        
    except:
        return np.NaN

#Learning!

Convert categorical features to binary (dummy) variables

In [149]:
data2 = pd.concat([data[['click']],pd.get_dummies(data['br_short'], dummy_na=True, prefix='br'),
                   pd.get_dummies(data['co'], dummy_na=True, prefix='co'),
                   pd.get_dummies(data['cty'].apply(cty_parser), dummy_na=True, prefix='cty'),
                   pd.get_dummies(data['rgn'], dummy_na=True, prefix='rgn'),
                   pd.get_dummies(data['plfm'], dummy_na=True, prefix='plfm'),
                   pd.get_dummies(data['pv'], dummy_na=True, prefix='pv'),
                   pd.get_dummies(data['dt'], dummy_na=True, prefix='dt'),
                   pd.get_dummies(data['isp'], dummy_na=True, prefix='isp'),
                   pd.get_dummies(data['ad_type'], dummy_na=True, prefix='ad_type'),
                   data[['ai','cr','sdpt','ernc','dp']]],axis=1)

Shuffle indices for 10-fold cross validation

In [152]:
indices = data2.index.values
indices

array([    0,     1,     2, ..., 30909, 30910, 30911])

In [153]:
shuffle(indices)

In [221]:
len(np.unique(indices))

30912

In [157]:
test_size = len(indices)/10

In [158]:
test_ind = indices[:test_size]
train_ind = indices[test_size:]

In [163]:
train_set = data2.loc[train_ind,:]
test_set = data2.loc[test_ind,:]

In [169]:
train_set.dtypes.value_counts()

float64    5969
int64         4
dtype: int64

In [176]:
import collections
print [item for item, count in collections.Counter(train_set.columns.values).items() if count > 1]

['br_nan']


There are two 'br_nan', I need to keep only one - indicating the combining of two

In [209]:
train_set_2 = train_set.drop(['br_nan'], axis=1)

In [227]:
train_set_2.loc[:,'br_nan'] = 0 * train_set_2.shape[0]
for i in train_set_2.index:
    train_set_2.loc[i,'br_nan'] = train_set.ix[:,12][i] + train_set.ix[:,13][i]

In [226]:
train_set.ix[:,12]

20004    0
11087    0
9111     0
...
12349    0
1059     0
1958     0
Name: br_nan, Length: 27821, dtype: float64

In [228]:
forest = RandomForestClassifier(n_estimators = 100)

In [230]:
print [item for item, count in collections.Counter(train_set_2.columns.values).items() if count > 1]

[]


In [231]:
train_set_2.dtypes.value_counts()

float64    5968
int64         4
dtype: int64

The error is extremely wierd...

Try just use the old data:

In [233]:
clicks_uniq = pd.read_csv("../data/clicks_uniq_150610.tab",sep='\t',index_col=0,encoding='utf-8')
adShows_uniq = pd.read_csv("../data/adShows_uniq_150610.tab",sep='\t',index_col=0,encoding='utf-8')
duplicate_id = set(adShows_uniq['id']) - set(clicks_uniq['id'])
adShows_uniq_only = adShows_uniq.loc[adShows_uniq['id'].apply(lambda x: x in duplicate_id), :]
data = pd.concat([clicks_uniq[['ai','br','co','cty','rgn','plfm','pv','dt','sdpt','isp','ernc','dp','vw_drtn',
                               'vw_perc','cr','created_at','et']],
                  adShows_uniq_only[['ai','br','co','cty','rgn','plfm','pv','dt','sdpt','isp','ernc','dp',
                                     'vw_drtn','vw_perc','cr','created_at','et']]], axis=0)
def get_date(timestring):
    mytime = timestring.split('.')[0]
    return datetime.strptime(mytime, '%Y-%m-%dT%X')
data['created_at'] = data['created_at'].apply(get_date)
data2 = data[data['created_at'].apply(lambda x:(x.month == 5 or x.month == 6))]
data2['br_short'] = data2['br'].apply(lambda x:x.split()[0])
data2['click'] = data2['et'].apply(lambda x:x == 'AD_CLICK')
data2 = pd.merge(data2, ad_type, left_on ='cr', right_on ='cr', how='left')
data2['time'] = data2['created_at'].apply(lambda x:x.hour)

In [248]:
features = ['click','ai','cr','br_short','co','cty','rgn','plfm','pv','dt',
            'sdpt','isp','ernc','dp','ad_type','time']
data3 = data2.loc[:,features]
data3['click'].value_counts()

False    13345
True      5384
dtype: int64

In [238]:
data3.columns

Index([u'click', u'ai', u'cr', u'br_short', u'co', u'cty', u'rgn', u'plfm', u'pv', u'dt', u'sdpt', u'isp', u'ernc', u'dp', u'ad_type', u'time'], dtype='object')

In [249]:
data3 = pd.concat([data3[['click']],pd.get_dummies(data3['br_short'], dummy_na=True, prefix='br'),
                   pd.get_dummies(data3['co'], dummy_na=True, prefix='co'),
                   pd.get_dummies(data3['cty'].apply(cty_parser), dummy_na=True, prefix='cty'),
                   pd.get_dummies(data3['rgn'], dummy_na=True, prefix='rgn'),
                   pd.get_dummies(data3['plfm'], dummy_na=True, prefix='plfm'),
                   pd.get_dummies(data3['pv'], dummy_na=True, prefix='pv'),
                   pd.get_dummies(data3['dt'], dummy_na=True, prefix='dt'),
                   pd.get_dummies(data3['isp'], dummy_na=True, prefix='isp'),
                   pd.get_dummies(data3['ad_type'], dummy_na=True, prefix='ad_type'),
                   data3[['ai','sdpt','time','ernc','dp']]],axis=1)

Shuffle the indices

In [250]:
indices = data3.index.values
shuffle(indices)
len(np.unique(indices))

18729

In [251]:
test_size = len(indices)/10
test_size

1872

In [252]:
test_ind = indices[:test_size]
train_ind = indices[test_size:]
train_set = data3.loc[train_ind,:]
test_set = data3.loc[test_ind,:]

In [254]:
train_data = train_set.values