In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Polygon, Point
import pyproj
import warnings
import folium
import numpy as np
warnings.filterwarnings('ignore')
%matplotlib inline


In [2]:
# Load the corner points of the study area. grid_coordinates() below reads
# rows 0..3 as NW, NE, SE, SW, and the polygon cell uses the same row order.
# `sep` is passed by keyword: pandas 2.0 made every read_csv argument after
# the path keyword-only, so the positional ',' no longer works there.
df_corners = pd.read_csv('../raw_data/fourcorners.csv', sep=',')

# The 'lon'/'lat' columns (and the search coordinates further down, e.g.
# -58.40 / -34.61) are geographic degrees, i.e. WGS84 = EPSG:4326.
# EPSG:3857 is Web-Mercator in metres and would mislabel every layer that
# reuses `epsg`.
epsg = 'epsg:4326'
geometry = [Point(xy) for xy in zip(df_corners['lon'], df_corners['lat'])]
geo_df_corners = gpd.GeoDataFrame(df_corners, crs=epsg, geometry=geometry)

In [3]:
def get_data(data_path):
    '''Read the JSON export at ``data_path`` and return its search tracks.

    The file is parsed with ``pd.read_json``; the ``search_track`` entry of
    its ``__collections__`` column is then expanded into a DataFrame with
    ``DataFrame.from_dict``.
    '''
    raw_export = pd.read_json(data_path)
    search_tracks = raw_export['__collections__']["search_track"]
    return pd.DataFrame.from_dict(search_tracks)


def preprocess_data(data):
    '''Clean the raw search-track frame produced by ``get_data``.

    Steps, in order:
      1. transpose the frame, so the fields dropped/parsed below are columns;
      2. drop ``user_longitude``, ``user_latitude`` and ``__collections__``;
      3. replace the index with a fresh 0..n-1 range (old labels discarded);
      4. remove rows whose ``search_method`` equals ``'startup'``;
      5. parse ``timestamp``, ``arrive`` and ``leave`` as UTC datetimes
         (unparseable values become ``NaT``), strip the timezone and
         truncate to whole seconds;
      6. add ``arrive_weekday`` from ``arrive`` (``Series.dt.weekday``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame as returned by ``get_data``; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame. Its index keeps the gaps left by the
        removed ``'startup'`` rows.
    '''
    data = data.T
    data = data.drop(columns=['user_longitude', 'user_latitude', '__collections__'])
    # Discard the old index outright instead of materialising it as an
    # 'index' column and dropping that column again later.
    data = data.reset_index(drop=True)
    data['search_method'] = data['search_method'].convert_dtypes()
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write into a slice (SettingWithCopyWarning).
    data = data[data.search_method != 'startup'].copy()

    for column in ('timestamp', 'arrive', 'leave'):
        parsed = pd.to_datetime(data[column], utc=True, errors='coerce')
        # Same result as formatting with "%Y-%m-%d %H:%M:%S" and re-parsing:
        # naive UTC wall-clock time with sub-second precision removed.
        data[column] = parsed.dt.tz_convert(None).dt.floor('s')

    data["arrive_weekday"] = data["arrive"].dt.weekday
    return data

def preproc(data_path):
    '''Load the export at ``data_path`` and return the preprocessed search tracks.

    Convenience wrapper: ``get_data`` followed by ``preprocess_data``.
    '''
    return preprocess_data(get_data(data_path))

# Load the raw JSON backup and run it through the preprocessing pipeline
# (get_data + preprocess_data); `data` is the working frame for the cells below.
data = preproc('../raw_data/dataBackup.json')

In [4]:
def grid_coordinates(geo_df_corners, columns, rows):
    '''Build a regular lattice of points spanning the corner bounding box.

    Parameters
    ----------
    geo_df_corners : DataFrame-like with ``lat`` and ``lon`` columns
        Corner points. Rows are read by position: row 0 is taken as the
        north-west corner, row 1 as north-east and row 3 as south-west
        (row 2, the south-east corner, is not needed).
    columns : int
        Number of evenly spaced longitudes between the NW and NE corners
        (both ends included).
    rows : int
        Number of evenly spaced latitudes between the NW and SW corners
        (both ends included).

    Returns
    -------
    dict
        ``{'longitude': [...], 'latitude': [...]}`` with ``rows * columns``
        entries each, ordered row by row (all longitudes for the first
        latitude, then the next latitude, ...).

    Notes
    -----
    Only the NW longitude/latitude, the NE longitude and the SW latitude are
    used, so the lattice is an axis-aligned rectangle even when the four
    corners are not.
    '''
    # Positional access: the corner order is what matters, not the index
    # labels, so this also works when the frame has a non-default index.
    nw_lat = geo_df_corners.lat.iloc[0]
    nw_lon = geo_df_corners.lon.iloc[0]
    ne_lon = geo_df_corners.lon.iloc[1]
    sw_lat = geo_df_corners.lat.iloc[3]

    list_columns = np.linspace(nw_lon, ne_lon, columns)
    list_rows = np.linspace(nw_lat, sw_lat, rows)

    # Row-major expansion: longitude cycles fastest, latitude slowest.
    data_coordinates = {
        'longitude': [column for _ in list_rows for column in list_columns],
        'latitude': [row for row in list_rows for _ in list_columns],
    }
    return data_coordinates

# 20 x 20 lattice (400 longitude/latitude pairs) covering the corner bounding box.
data_coordinates = grid_coordinates(geo_df_corners, columns = 20, rows = 20)


In [5]:
# One shapely Point per lattice node, built from (longitude, latitude) pairs,
# then wrapped together with the coordinate dict in a GeoDataFrame that uses
# the notebook-wide CRS.
geometry = list(map(Point, zip(data_coordinates['longitude'], data_coordinates['latitude'])))
geo_df_grid = gpd.GeoDataFrame(data_coordinates, geometry=geometry, crs=epsg)

In [6]:
# One shapely Point per search, built from (search_longitude, search_latitude),
# then the searches frame is wrapped in a GeoDataFrame with the notebook-wide CRS.
geometry = list(map(Point, zip(data['search_longitude'], data['search_latitude'])))
geo_df_serches = gpd.GeoDataFrame(data, geometry=geometry, crs=epsg)

In [7]:

# Polygon through the first four rows of df_corners, in file order,
# with each vertex given as [lon, lat].
poly = Polygon([[df_corners['lon'][i], df_corners['lat'][i]] for i in range(4)])



In [8]:
# Keep only the searches whose point lies inside the corner polygon.
# The boolean mask is applied directly, so no temporary 'wanted_searches'
# column is written to (and left behind on) the unfiltered frame, and
# .copy() makes the result independent so later cells can add or drop
# columns without writing into a slice.
data = data[geo_df_serches.within(poly)].copy()
data


Unnamed: 0,arrive,search_method,leave,search_longitude,search_latitude,uid,timestamp,arrive_weekday,geometry
0,2021-03-28 16:30:13,google,2021-03-28 17:30:13,-58.404267,-34.614640,unauthenticated,2021-03-28 16:30:39,6,POINT (-58.40427 -34.61464)
4,2021-02-28 15:56:00,google,2021-02-28 17:32:00,-58.440447,-34.620694,,2021-01-31 22:35:42,6,POINT (-58.44045 -34.62069)
5,2021-02-13 15:12:45,searchInThisArea,2021-02-13 16:12:45,-58.418288,-34.581668,e8itgepSU1YX6Q48HtrDpz76OXr1,2021-02-13 15:13:03,5,POINT (-58.41829 -34.58167)
6,2021-03-10 23:58:33,searchInThisArea,2021-03-11 00:58:33,-58.398826,-34.608031,,2021-03-10 23:58:56,2,POINT (-58.39883 -34.60803)
7,2021-03-15 09:15:55,update_time_home,2021-03-15 09:30:55,-58.410840,-34.621459,k0hSCpQxzGZp7X1L49gDgbuYOAA2,2021-03-15 09:09:55,0,POINT (-58.41084 -34.62146)
...,...,...,...,...,...,...,...,...,...
19903,2021-03-06 14:15:01,this_area,2021-03-06 15:15:01,-58.363994,-34.609831,unauthenticated,2021-03-06 14:38:48,5,POINT (-58.36399 -34.60983)
19904,2021-03-15 15:33:43,searchInThisArea,2021-03-15 16:33:43,-58.381600,-34.603700,,2021-03-15 15:33:51,0,POINT (-58.38160 -34.60370)
19908,2021-03-26 19:11:33,searchInThisArea,2021-03-26 20:11:33,-58.425900,-34.605969,MSVmzP7Wb9d8RJ7kueNFZnq9zTj1,2021-03-26 19:12:39,4,POINT (-58.42590 -34.60597)
19910,2021-01-23 04:15:18,this_area,2021-01-23 05:15:18,-58.383415,-34.591915,unauthenticated,2021-01-23 04:12:43,5,POINT (-58.38341 -34.59192)


In [10]:
# Base folium map centred at latitude -34.600, longitude -58.550
# (folium takes `location` as [lat, lon]).
map_bsas = folium.Map(location=[-34.600,-58.550])


In [12]:
# data.apply(lambda row:folium.CircleMarker(location=[row["search_latitude"], row["search_longitude"]]).add_to(map_bsas), axis=1)
# map_bsas


In [16]:
# Derive hour-of-day features and drop the user id.
# Written as a single rebinding (no in-place mutation) with
# errors='ignore' on the drop, so the cell can be re-run without raising
# KeyError once 'uid' is already gone.
data = (
    data
    .assign(
        arrive_hour=data["arrive"].dt.hour,
        search_hour=data["timestamp"].dt.hour,
    )
    .drop(columns='uid', errors='ignore')
)


Unnamed: 0,arrive,search_method,leave,search_longitude,search_latitude,timestamp,arrive_weekday,geometry,arrive_hour,search_hour
0,2021-03-28 16:30:13,google,2021-03-28 17:30:13,-58.404267,-34.61464,2021-03-28 16:30:39,6,POINT (-58.40427 -34.61464),16,16.0
4,2021-02-28 15:56:00,google,2021-02-28 17:32:00,-58.440447,-34.620694,2021-01-31 22:35:42,6,POINT (-58.44045 -34.62069),15,22.0
5,2021-02-13 15:12:45,searchInThisArea,2021-02-13 16:12:45,-58.418288,-34.581668,2021-02-13 15:13:03,5,POINT (-58.41829 -34.58167),15,15.0
6,2021-03-10 23:58:33,searchInThisArea,2021-03-11 00:58:33,-58.398826,-34.608031,2021-03-10 23:58:56,2,POINT (-58.39883 -34.60803),23,23.0
7,2021-03-15 09:15:55,update_time_home,2021-03-15 09:30:55,-58.41084,-34.621459,2021-03-15 09:09:55,0,POINT (-58.41084 -34.62146),9,9.0


In [20]:
# Number of searches per search method.
# NOTE(review): the recorded output lists camelCase and snake_case spellings
# side by side (updateTimeHome / update_time_home, updateVehicle /
# update_vehicle) — presumably the same methods logged under two names;
# confirm and normalise before using this column as a feature.
data.search_method.value_counts()

searchInThisArea    4603
google              2911
this_area           1268
updateTimeHome      1033
update_time_home     828
updateVehicle         81
update_vehicle        49
Name: search_method, dtype: Int64

In [30]:
# Searches per arrival hour: count the non-null 'arrive' values in each
# 'arrive_hour' group and expose them as a one-column frame named 'searches'.
data_grouped = (
    data
    .groupby(by=['arrive_hour'])['arrive']
    .count()
    .to_frame(name='searches')
)

## Models


In [None]:
# Model inputs (weekday, hour) and targets (search longitude / latitude).
# `data` has no 'weekday' or 'hour' columns — the columns created upstream
# are 'arrive_weekday', 'arrive_hour' and 'search_hour' — so selecting them
# directly raises KeyError. The arrival-based columns are selected and
# renamed to the short names the following cells expect.
# NOTE(review): 'arrive_hour' is used to pair with 'arrive_weekday'; switch
# to 'search_hour' if the search time was the intended feature.
data_features = (
    data[['search_longitude', 'search_latitude', 'arrive_weekday', 'arrive_hour']]
    .rename(columns={'arrive_weekday': 'weekday', 'arrive_hour': 'hour'})
)
data_features

In [None]:
from sklearn.preprocessing import RobustScaler

# Scale both features with one RobustScaler fitted on both columns at once.
# RobustScaler keeps separate statistics per column, so this gives each
# feature its own centre and scale. Fitting the same scaler on 'weekday' and
# then on 'hour' would overwrite the first fit, and 'weekday' would end up
# transformed with the statistics learned from 'hour'.
r_scaler = RobustScaler()
scaled_columns = ['weekday', 'hour']

# Work on an independent copy so the assignment below never writes into a
# slice of `data`.
data_features = data_features.copy()
data_features[scaled_columns] = r_scaler.fit_transform(data_features[scaled_columns])

data_features

In [None]:
# Feature matrix (scaled weekday and hour) and two-column target
# (search longitude and latitude).
X = data_features.loc[:, ['weekday', 'hour']]
y = data_features.loc[:, ['search_longitude', 'search_latitude']]

### Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor


# Hold out 30% of the rows for evaluation; random_state=1 pins the shuffle so
# the split is reproducible.
# NOTE(review): the features were scaled on the full dataset before this
# split, so the scaler has already seen the test rows — consider fitting it
# on X_train only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)


In [None]:
# One 3-tree random forest per target column (longitude, latitude), wrapped
# in MultiOutputRegressor and fitted on the training split. fit() returns the
# fitted estimator itself, so construction and fitting are chained.
forest = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=3, random_state=1)
).fit(X_train, y_train)

In [None]:
# Predict (longitude, latitude) for the training split first, then the test split.
y_train_pred, y_test_pred = (forest.predict(split) for split in (X_train, X_test))

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Both metrics are called with their defaults on the two-column
# (longitude, latitude) targets, so each printed number is a single score
# for both outputs together.
# The "MSE test" label previously ended in ';' instead of ':' — fixed so all
# four lines share the same "<label>:<value>" layout.
print("MSE train:{}".format(mean_squared_error(y_train, y_train_pred)))
print("MSE test:{}".format(mean_squared_error(y_test, y_test_pred)))

print("R2 score train:{}".format(r2_score(y_train, y_train_pred)))
print("R2 score test:{}".format(r2_score(y_test, y_test_pred)))