In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns




In [2]:
def get_data(data_path):
    '''returns a DataFrame with search tracks'''
    data = pd.read_json(data_path)
    data = pd.DataFrame.from_dict(data['__collections__']["search_track"])
    return data


def preprocess_data(data):
    data = data.T
    data = data.drop(columns = ['user_longitude', 'user_latitude', '__collections__'])
    data.reset_index(level=0, inplace=True)
    data.drop(columns="index")
    data['search_method'] = data['search_method'].convert_dtypes()
    
#     data = data[data.search_method == 'update_time_home']
    
    data.drop(columns='index', inplace=True)
    data['timestamp'] = pd.to_datetime(data['timestamp'], utc=True, errors='coerce')
    data['arrive'] = pd.to_datetime(data['arrive'], utc=True, errors='coerce')
    data['leave'] = pd.to_datetime(data['leave'], utc=True, errors='coerce')
    data['timestamp'] = data.timestamp.dt.strftime("%Y-%m-%d %H:%M:%S")
    data['arrive'] = data.arrive.dt.strftime("%Y-%m-%d %H:%M:%S")
    data['leave'] = data.leave.dt.strftime("%Y-%m-%d %H:%M:%S")
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    data['arrive'] = pd.to_datetime(data['arrive'])
    data['leave'] = pd.to_datetime(data['leave'])

    return data

def preproc(data_path):
    data = get_data(data_path)
    return preprocess_data(data)



In [3]:
data = preproc('../raw_data/dataBackup.json')
data.head(2)

Unnamed: 0,search_longitude,search_latitude,timestamp,leave,arrive,search_method,uid
0,-58.404267,-34.61464,2021-03-28 16:30:39,2021-03-28 17:30:13,2021-03-28 16:30:13,google,unauthenticated
1,-58.419543,-34.575729,2021-02-04 15:56:36,2021-02-04 17:00:36,2021-02-04 16:00:36,startup,unauthenticated


In [4]:
data["search_hour"] = data["timestamp"].dt.hour
data["arrive_hour"] = data["arrive"].dt.hour
data["leave_hour"] = data["leave"].dt.hour



In [5]:
data


Unnamed: 0,search_longitude,search_latitude,timestamp,leave,arrive,search_method,uid,search_hour,arrive_hour,leave_hour
0,-58.404267,-34.614640,2021-03-28 16:30:39,2021-03-28 17:30:13,2021-03-28 16:30:13,google,unauthenticated,16.0,16,17
1,-58.419543,-34.575729,2021-02-04 15:56:36,2021-02-04 17:00:36,2021-02-04 16:00:36,startup,unauthenticated,15.0,16,17
2,-58.414274,-34.581036,2021-01-12 10:28:04,2021-01-12 11:30:04,2021-01-12 10:30:04,startup,unauthenticated,10.0,10,11
3,-58.433503,-34.577048,2021-02-05 08:19:35,2021-02-05 09:30:35,2021-02-05 08:30:35,startup,unauthenticated,8.0,8,9
4,-58.440447,-34.620694,2021-01-31 22:35:42,2021-02-28 17:32:00,2021-02-28 15:56:00,google,,22.0,15,17
...,...,...,...,...,...,...,...,...,...,...
21129,-58.507150,-34.575842,2020-09-14 06:35:26,2020-09-14 07:45:26,2020-09-14 06:45:26,startup,unauthenticated,6.0,6,7
21130,-58.383415,-34.591915,2021-01-23 04:12:43,2021-01-23 05:15:18,2021-01-23 04:15:18,this_area,unauthenticated,4.0,4,5
21131,-58.413258,-34.584307,2021-01-06 02:05:01,2021-01-06 03:15:01,2021-01-06 02:15:01,startup,unauthenticated,2.0,2,3
21132,-58.425847,-34.631331,2021-03-21 19:45:08,2021-03-21 20:45:07,2021-03-21 19:45:07,searchInThisArea,byadzYTQX7YaABRFTAXPYz8UKCI2,19.0,19,20


In [6]:

data['stay_time'] = (data["leave_hour"] - data['arrive_hour']).astype('timedelta64[h]')
data["arrive_weekday"] = data["arrive"].dt.weekday
data.head(2)

Unnamed: 0,search_longitude,search_latitude,timestamp,leave,arrive,search_method,uid,search_hour,arrive_hour,leave_hour,stay_time,arrive_weekday
0,-58.404267,-34.61464,2021-03-28 16:30:39,2021-03-28 17:30:13,2021-03-28 16:30:13,google,unauthenticated,16.0,16,17,0 days 01:00:00,6
1,-58.419543,-34.575729,2021-02-04 15:56:36,2021-02-04 17:00:36,2021-02-04 16:00:36,startup,unauthenticated,15.0,16,17,0 days 01:00:00,3


In [53]:
bsas_map

Unnamed: 0,BARRIO,COMUNA,GEOJSON,geometry
0,CHACARITA,15.0,"{""type"":""MultiPolygon"",""coordinates"":[[[[-58.4...","POLYGON ((-58.45282 -34.59599, -58.45366 -34.5..."
1,PATERNAL,15.0,"{""type"":""MultiPolygon"",""coordinates"":[[[[-58.4...","POLYGON ((-58.46558 -34.59656, -58.46562 -34.5..."
2,VILLA CRESPO,15.0,"{""type"":""MultiPolygon"",""coordinates"":[[[[-58.4...","POLYGON ((-58.42375 -34.59783, -58.42495 -34.5..."
3,VILLA DEL PARQUE,11.0,"{""type"":""MultiPolygon"",""coordinates"":[[[[-58.4...","POLYGON ((-58.49461 -34.61487, -58.49479 -34.6..."
4,ALMAGRO,5.0,"{""type"":""MultiPolygon"",""coordinates"":[[[[-58.4...","POLYGON ((-58.41287 -34.61412, -58.41282 -34.6..."
5,CABALLITO,6.0,"{""type"":""MultiPolygon"",""coordinates"":[[[[-58.4...","POLYGON ((-58.43061 -34.60705, -58.43056 -34.6..."
6,DIQUE 3,1.0,"{""type"":""MultiPolygon"",""coordinates"":[[[[-58.3...","POLYGON ((-58.36524 -34.60549, -58.36523 -34.6..."
7,VILLA SANTA RITA,11.0,"{""type"":""MultiPolygon"",""coordinates"":[[[[-58.4...","POLYGON ((-58.48192 -34.62429, -58.48274 -34.6..."
8,MONTE CASTRO,10.0,"{""type"":""MultiPolygon"",""coordinates"":[[[[-58.5...","POLYGON ((-58.50349 -34.62403, -58.50467 -34.6..."
9,VILLA REAL,10.0,"{""type"":""MultiPolygon"",""coordinates"":[[[[-58.5...","POLYGON ((-58.52512 -34.61400, -58.52511 -34.6..."


In [None]:
def search_neighborhood(neighborhood, coordinates):

    bsas_map = gpd.read_file('../gopa_data/barrios-ciudad')
    bsas_map.BARRIO[6] = 'DIQUE 3'
    bsas_map.BARRIO[24] = 'NUÑEZ'
    bsas_map.BARRIO[26] = 'DIQUE 2'
    bsas_map.BARRIO[33] = 'DIQUE 4'
    bsas_map.BARRIO[35] = 'DIQUE 1'  
    neighborhood = neighborhood.upper()
    gpd_data = gpd.GeoDataFrame(coordinates,
                         geometry = gpd.points_from_xy(coordinates.search_longitude, coordinates.search_latitude))
    reservas_bsas = gpd_data[['geometry']]
    reservas_bsas = reservas_bsas.set_crs("EPSG:4326")

    if neighborhood == 'CIUDAD':
        df_barrio = bsas_map
        fig,ax = plt.subplots(figsize = (15,16))
        bsas_map.plot(ax=ax, color='lightgrey')
        gpd_data.plot(ax=ax, markersize=5, color='red')
        ax.set_xlim([-58.550, -58.325])
        ax.set_ylim([-34.700, -34.525])
        plt.show()

    else:

        try:
            df_barrio = bsas_map[bsas_map['BARRIO'] == neighborhood]
            reservas_barrio = reservas_bsas.within(df_barrio)
            cond_reservas_barrio = gpd_data.apply(lambda x: df_barrio.contains(x.geometry), axis=1)
            gpd_data['reservas_barrio'] = cond_reservas_barrio
            gpd_data = gpd_data[gpd_data.reservas_barrio == True]
            fig,ax = plt.subplots(figsize = (15,16))
            bsas_map.plot(ax=ax, color='lightgrey')
            gpd_data.plot(ax=ax, markersize=5, color='red')
            ax.set_xlim([-58.550, -58.325])
            ax.set_ylim([-34.700, -34.525])
            plt.show()
        except ValueError:
            print("Please enter another neighborhood")


    return gpd_data

In [None]:
belgrano = search_neighborhood(neighborhood = 'belgrano', coordinates=data)
belgrano

In [54]:
def merging_neighbor_dataframes(coordinates):
    
    neighborhood_list = ['chacarita',
    "paternal",
    'villa crespo',
    'villa del parque',
    'almagro',
    'caballito',
    'dique 3',
    'villa santa rita',
    'monte castro',
    'villa real',
    'flores',
    'floresta',
    'constitucion',
    'belgrano',
    'recoleta',
    'retiro',
    'san cristobal',
    'boedo',
    'velez sarsfield',
    'villa luro',
    'parque patricios',
    'mataderos',
    'villa lugano',
    'escollera exterior',
    'nuñez',
    'boca',
    'dique 2',
    'san telmo',
    'saavedra',
    'coghlan',
    'villa urquiza',
    'colegiales',
    'balvanera',
    'dique 4',
    'villa gral. mitre',
    'dique 1',
    'parque chas',
    'agronomia',
    'villa ortuzar',
    'barracas',
    'parque avellaneda',
    'parque chacabuco',
    'nueva pompeya',
    'palermo',
    'villa riachuelo',
    'villa soldati',
    'villa pueyrredon',
    'villa devoto',
    'liniers',
    'versalles',
    'puerto madero',
    'monserrat',
    'san nicolas'
        ]
    
    list_upper = []

    for i in neighborhood_list:
        neighbor = i.upper()
        list_upper.append(neighbor)    
   
    gpd_dataframes_list = [] 
    
    bsas_map = gpd.read_file('../gopa_data/barrios-ciudad')
    bsas_map.BARRIO[6] = 'DIQUE 3'
    bsas_map.BARRIO[24] = 'NUÑEZ'
    bsas_map.BARRIO[26] = 'DIQUE 2'
    bsas_map.BARRIO[33] = 'DIQUE 4'
    bsas_map.BARRIO[35] = 'DIQUE 1' 
    
    for neighbor in list_upper:
        gpd_data = gpd.GeoDataFrame(coordinates,
                         geometry = gpd.points_from_xy(coordinates.search_longitude, coordinates.search_latitude))

        polygon_neighbor = bsas_map[bsas_map['BARRIO'] == neighbor].geometry
        gpd_data ['neighbor'] = gpd_data.apply(lambda x: polygon_neighbor.contains(x.geometry), axis=1)
        gpd_data ['neighbor'] = gpd_data ['neighbor'].apply(lambda x: 0 if x == False else neighbor)
        gpd_data = gpd_data[gpd_data.neighbor != 0]
        gpd_dataframes_list.append(gpd_data)
    
    all_data = pd.concat(gpd_dataframes_list)
    
    
    return all_data

        
        
merging_neighbor_dataframes(coordinates=data)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bsas_map.BARRIO[6] = 'DIQUE 3'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bsas_map.BARRIO[24] = 'NUÑEZ'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bsas_map.BARRIO[26] = 'DIQUE 2'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bsas_map.BARRIO[33] = 'DIQUE 4'
A value is trying to be set on a copy of a 

Unnamed: 0,search_longitude,search_latitude,timestamp,leave,arrive,search_method,uid,search_hour,arrive_hour,leave_hour,stay_time,arrive_weekday,geometry,neighbor
146,-58.446923,-34.583735,2021-01-21 10:23:25,2021-01-21 11:30:17,2021-01-21 10:30:17,google,REwY2MGhNFSmVq4smD6dgXBniu83,10.0,10,11,0 days 01:00:00,3,POINT (-58.44692 -34.58373),CHACARITA
214,-58.446829,-34.583628,2020-09-24 14:38:32,2020-09-24 15:45:32,2020-09-24 14:45:32,startup,BLLbW2aUT4Xd5Kyu4lUX98MMget1,14.0,14,15,0 days 01:00:00,3,POINT (-58.44683 -34.58363),CHACARITA
357,-58.441093,-34.587517,2021-03-06 03:40:51,2021-03-10 23:38:00,2021-03-10 10:38:00,searchInThisArea,IdVRqzjM1OQGXufqTmZIRv2CPhk1,3.0,10,23,0 days 13:00:00,2,POINT (-58.44109 -34.58752),CHACARITA
443,-58.446923,-34.583735,2021-02-08 12:18:25,2021-02-08 13:15:34,2021-02-08 12:15:34,google,RMnlCQYA06TyfsrGhw8Rscp6dTp1,12.0,12,13,0 days 01:00:00,0,POINT (-58.44692 -34.58373),CHACARITA
616,-58.448010,-34.584375,2021-03-29 00:28:46,2021-03-29 01:27:53,2021-03-29 00:27:53,google,,0.0,0,1,0 days 01:00:00,0,POINT (-58.44801 -34.58438),CHACARITA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21093,-58.368752,-34.602119,2021-03-10 13:51:04,2021-03-10 14:50:34,2021-03-10 13:50:34,google,,13.0,13,14,0 days 01:00:00,2,POINT (-58.36875 -34.60212),SAN NICOLAS
21106,-58.373486,-34.606694,2020-11-10 19:59:36,2020-11-11 08:30:00,2020-11-11 07:30:00,this_area,unauthenticated,19.0,7,8,0 days 01:00:00,2,POINT (-58.37349 -34.60669),SAN NICOLAS
21111,-58.392193,-34.601958,2020-11-06 08:02:38,2020-11-06 09:15:38,2020-11-06 08:15:38,startup,fPYSBYyQLYRcgIXJn9xAZ2eXf423,8.0,8,9,0 days 01:00:00,4,POINT (-58.39219 -34.60196),SAN NICOLAS
21114,-58.383575,-34.604396,2021-03-15 09:30:47,2021-03-15 18:29:00,2021-03-15 16:29:00,updateTimeHome,,9.0,16,18,0 days 02:00:00,0,POINT (-58.38357 -34.60440),SAN NICOLAS


In [None]:
sns.histplot(data=data, x="search_hour", binwidth=1, kde=True)



In [None]:
sns.histplot(data=palermo, x="search_hour", binwidth=1, kde=True)


In [None]:
sns.histplot(data=recoleta, x="search_hour", binwidth=1, kde=True)


In [None]:
sns.histplot(data=data, x="arrive_hour", binwidth=1, kde=True)


In [None]:
sns.histplot(data=data, x="arrive_weekday", binwidth=1, kde=True)


In [55]:
sns.histplot(data=data, x="neighbor", hue="arrive_weekday")


ValueError: Could not interpret value `neighbor` for parameter `x`

In [None]:
sns.histplot(data=data, x="arrive_hour", y="arrive_weekday", legend=True)

In [None]:
caba = search_neighborhood(neighborhood = 'ciudad', coordinates=data)
ax = sns.regplot(x="arrive_hour", y="arrive", data=caba)