In [2]:
import os
import sys
import gzip
import glob
import math
import numpy as np
import pandas as pd
from time import time
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf
# matplotlib.style.use('ggplot')
%matplotlib inline

In [7]:
# Load the data sets: taxi-stand metadata and the training sample.
meta = pd.read_csv('data/metaData_taxistandsID_name_GPSlocation.csv')
train = pd.read_csv('data/train.1k.csv', quotechar='"')
# The metadata preview is printed as plain text; the training preview is the
# cell's last expression, so the notebook renders it as the cell output.
print(meta.head())
train.head()

   ID  Descricao       Latitude  Longitude
0   1       Agra  41.1771457135  -8.609670
1   2    Alameda    41.15618964  -8.591064
2   3     Aldoar  41.1705249231  -8.665876
3   4  Alfândega  41.1437639911  -8.621803
4   5      Amial  41.1835097223  -8.612726


Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372636858620000589,C,,,20000589,1372636858,A,False,"[[-8.618643,41.141412],[-8.618499,41.141376],[..."
1,1372637303620000596,B,,7.0,20000596,1372637303,A,False,"[[-8.639847,41.159826],[-8.640351,41.159871],[..."
2,1372636951620000320,C,,,20000320,1372636951,A,False,"[[-8.612964,41.140359],[-8.613378,41.14035],[-..."
3,1372636854620000520,C,,,20000520,1372636854,A,False,"[[-8.574678,41.151951],[-8.574705,41.151942],[..."
4,1372637091620000337,C,,,20000337,1372637091,A,False,"[[-8.645994,41.18049],[-8.645949,41.180517],[-..."


In [27]:
# Row counts per category for each categorical column of the training sample.
# GroupBy.size() is the built-in per-group row counter: it replaces the
# hand-rolled apply(lambda x: len(x)) and always yields a Series.
# One loop replaces three copy-pasted statements; the printed text is unchanged.
for column in ('CALL_TYPE', 'DAY_TYPE', 'MISSING_DATA'):
    print('Distribution of %s' % column)
    print(train.groupby(column).size())

Distribution of CALL_TYPE
CALL_TYPE
A            267
B            433
C            299
dtype: int64
Distribution of DAY_TYPE
DAY_TYPE
A           999
dtype: int64
Distribution of MISSING_DATA
MISSING_DATA
False           999
dtype: int64


In [54]:
import ast
from ast import literal_eval
# Take the raw POLYLINE text of one specific trip and parse it into a Python
# list of coordinate pairs.
trip_mask = train.TRIP_ID == 1372636858620000589
string = train[trip_mask].POLYLINE.values[0]
ll = list(literal_eval(string))
print(ll)
# Sanity check on the first pair: is its second component larger than its
# first one?
first_pair = ll[0]
print(first_pair[1] > first_pair[0])

[[-8.618643, 41.141412], [-8.618499, 41.141376], [-8.620326, 41.14251], [-8.622153, 41.143815], [-8.623953, 41.144373], [-8.62668, 41.144778], [-8.627373, 41.144697], [-8.630226, 41.14521], [-8.632746, 41.14692], [-8.631738, 41.148225], [-8.629938, 41.150385], [-8.62911, 41.151213], [-8.629128, 41.15124], [-8.628786, 41.152203], [-8.628687, 41.152374], [-8.628759, 41.152518], [-8.630838, 41.15268], [-8.632323, 41.153022], [-8.631144, 41.154489], [-8.630829, 41.154507], [-8.630829, 41.154516], [-8.630829, 41.154498], [-8.630838, 41.154489]]
True


In [60]:
# Show the parsed trajectory again, followed by its final coordinate pair.
print(ll)
print(ll[-1])

[[-8.618643, 41.141412], [-8.618499, 41.141376], [-8.620326, 41.14251], [-8.622153, 41.143815], [-8.623953, 41.144373], [-8.62668, 41.144778], [-8.627373, 41.144697], [-8.630226, 41.14521], [-8.632746, 41.14692], [-8.631738, 41.148225], [-8.629938, 41.150385], [-8.62911, 41.151213], [-8.629128, 41.15124], [-8.628786, 41.152203], [-8.628687, 41.152374], [-8.628759, 41.152518], [-8.630838, 41.15268], [-8.632323, 41.153022], [-8.631144, 41.154489], [-8.630829, 41.154507], [-8.630829, 41.154516], [-8.630829, 41.154498], [-8.630838, 41.154489]]
[-8.630838, 41.154489]


In [66]:
# Quadrant voting around the trip's last point.
#
# The last point of `ll` is the reference. Every earlier point votes for the
# quadrant it falls in relative to that reference; points that share either
# component with the reference lie on a dividing line and are not counted.
#
# NOTE(review): the metadata preview shows Latitude ~41.1 and Longitude ~-8.6,
# while the POLYLINE pairs look like [-8.6, 41.1]. So `lat` appears to hold the
# longitude and `lng` the latitude. The names and bindings are kept as they
# were because other cells may read them -- confirm before renaming.
lat = ll[-1][0]
lng = ll[-1][1]

# area[k] is the number of votes for quadrant k:
#   0: first < lat, second < lng      1: first < lat, second > lng
#   2: first > lat, second < lng      3: first > lat, second > lng
area = [0, 0, 0, 0]
for coo in ll[:-1]:
    if coo[0] < lat and coo[1] < lng:
        area[0] += 1
    elif coo[0] < lat and coo[1] > lng:
        area[1] += 1
    elif coo[0] > lat and coo[1] < lng:
        area[2] += 1
    elif coo[0] > lat and coo[1] > lng:
        area[3] += 1
print(area)

areaArray = np.array(area)
# Index of the quadrant with the fewest votes. argmin() returns the first
# minimum, so tied counts (e.g. all zeros for a very short trip) no longer
# raise the way int(np.where(...)[0]) did on a multi-element result.
idx = int(areaArray.argmin())
print('%s %s %s' % (idx, lat, lng))

[3, 0, 14, 3]
1 -8.630838 41.154489


In [67]:
# Load the TEST data set and preview its first rows as the cell output.
test = pd.read_csv('data/test.csv', quotechar='"')
test.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,T1,B,,15,20000542,1408039037,A,False,"[[-8.585676,41.148522],[-8.585712,41.148639],[..."
1,T2,B,,57,20000108,1408038611,A,False,"[[-8.610876,41.14557],[-8.610858,41.145579],[-..."
2,T3,B,,15,20000370,1408038568,A,False,"[[-8.585739,41.148558],[-8.58573,41.148828],[-..."
3,T4,B,,53,20000492,1408039090,A,False,"[[-8.613963,41.141169],[-8.614125,41.141124],[..."
4,T5,B,,18,20000621,1408039177,A,False,"[[-8.619903,41.148036],[-8.619894,41.148036]]"


In [72]:
# Show every test trip whose ORIGIN_CALL equals 2002.
test.loc[test['ORIGIN_CALL'] == 2002]

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
36,T37,A,2002,,20000159,1408038095,A,False,"[[-8.640594,41.160105],[-8.640648,41.160033],[..."
63,T64,A,2002,,20000662,1408038842,A,False,"[[-8.601183,41.181885],[-8.601165,41.181903],[..."
90,T92,A,2002,,20000612,1412064659,A,False,"[[-8.606961,41.145093],[-8.606673,41.145732],[..."
93,T95,A,2002,,20000071,1412065352,A,False,"[[-8.621028,41.150016],[-8.61759,41.154921],[-..."
109,T115,A,2002,,20000039,1412065352,A,False,"[[-8.641053,41.153715],[-8.641161,41.153805],[..."
111,T117,A,2002,,20000230,1412065271,A,False,"[[-8.601291,41.181768],[-8.601228,41.181822],[..."
151,T158,A,2002,,20000698,1412617360,A,False,"[[-8.620038,41.146839],[-8.619993,41.146884],[..."
160,T168,A,2002,,20000180,1412617017,A,False,"[[-8.624331,41.179554],[-8.624349,41.179554],[..."
186,T194,A,2002,,20000349,1412617104,A,False,"[[-8.608824,41.153544],[-8.60832,41.153535],[-..."
222,T230,A,2002,,20000434,1412612095,A,False,"[[-8.608734,41.153481],[-8.608581,41.153508],[..."
