In [1]:
from glob import glob
import pandas as pd

In [2]:
# Opción que deshabilita el limite de columnas y filas mostradas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

---
## Carga de data Google

In [6]:
# Path de los archivos no-procesados (formato parquet)
path_data = '../../data/raw'

## Metadata Sitios

In [7]:
df_sitios = pd.read_parquet(f'{path_data}/g-sitios.parquet')

## Google review 

In [8]:
# Crear lista de archivos en /data/raw/g-review/
review_parquets = glob(f'{path_data}/g-review/*.parquet')
review_parquets

['../../data/raw/g-review/g-review_Pennsylvania.parquet',
 '../../data/raw/g-review/g-review_New_Jersey.parquet',
 '../../data/raw/g-review/g-review_Delaware.parquet']

In [9]:
# Crear dataframe donde se unen los datos extraidos de parquet
df_review = pd.DataFrame()
# Iterar por cada parquet dentro de /data/raw/g-review/
for p in review_parquets:
    # Leer parquet
    df = pd.read_parquet(p)
    # Unir a df_review
    df_review = pd.concat([df_review, df], ignore_index=True)

---
## Análisis Exploratorio

### `df_sitios`

In [10]:
df_sitios.sample(2)

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
681502,Crawler Conceptz,"Crawler Conceptz, 1231 S Buena Vista St, San J...",0x80db68bec61c3625:0xd0ca98345246d568,,33.775506,-116.965805,[Manufacturer],3.3,18,,,,,"[0x80db68bec61c3625:0x1997a97b49ef0773, 0x80db...",https://www.google.com/maps/place//data=!4m2!3...
1847933,Zadig & Voltaire,"Zadig & Voltaire, 3866 Cross Creek Rd, Malibu,...",0x80e81e32734128ab:0x66423a6ec1f4e99e,,34.035657,-118.684165,"[Clothing store, Men's clothing store, Store, ...",5.0,1,,"[[Sunday, 12–5PM], [Monday, 10AM–6PM], [Tuesda...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 12PM,"[0x80e81e328dea9413:0x3166c86d7c7cf5df, 0x80e8...",https://www.google.com/maps/place//data=!4m2!3...


Usando col `address`, crear columnas `city`, `state`, `postal_code`

In [11]:
sample = df_sitios['address'].sample().iloc[0]
print(type(sample), '\n', sample)

<class 'str'> 
 wwdb integrated marketing, 412 SE 13th St, Fort Lauderdale, FL 33316


In [12]:
# Convertir col `address` de tipo str a list
df_sitios['address'] = df_sitios['address'].apply(lambda x: x.split(',') if x else None)
df_sitios['address'].sample()

464005    [Ashland City Fitness,  104 Cumberland St,  As...
Name: address, dtype: object

In [13]:
# Crear col `state` y `postal_code`
df_sitios['state'] = df_sitios['address'].apply(lambda x: x[-1].split()[0] if x else None)
df_sitios['postal_code'] = df_sitios['address'].apply(lambda x: x[-1].split()[1] if x else None)

IndexError: list index out of range

In [None]:
df_sitios['city'] = df_sitios['address'].apply(lambda x: x[-3])

In [None]:
l = df_sitios['category'].iloc[2415057]

In [None]:
l[1]

In [None]:
type(df_sitios['category'].iloc[0])

In [None]:
df_sitios['address'].sample()

### `df_review`

## Mapa

In [None]:
import geopandas as gpd
from geopandas import GeoDataFrame
import matplotlib.pyplot as plt
from shapely.geometry import Point

In [None]:
df_sitios.columns

In [None]:
# Mapeamos las coordinadas de 'pos_x' y 'pos_y'
# Descartamos los nulos
df = df_sitios.dropna(axis=0, subset=['longitude', 'latitude'])

# Crear un GeoDataFrame con la data de coordinadas
geometry = [Point(xy) for xy in zip(df['longitude'], df['latitude'])]
gdf = GeoDataFrame(df, geometry=geometry)

# Cargar archivo shapefile
mapa = gpd.read_file('../assets/map_urban_500k.shx')

# Calculamos los limites del mapa
minx, miny, maxx, maxy = gdf.total_bounds

# Creamos los plots del mapa
fig, ax = plt.subplots(figsize=(16, 14))
mapa.plot(ax=ax, color='white', edgecolor='black')
gdf.plot(ax=ax, marker='o', color='red', markersize=15)

# Calculamos el margen del grafico
margin_ratio = 0.1
marginx = (maxx - minx) * margin_ratio
marginy = (maxy - miny) * margin_ratio
# Aplicamos los margenes en relacion a los plots en los extremos
ax.set_xlim(minx - marginx, maxx + marginx)
ax.set_ylim(miny - marginy, maxy + marginy)

plt.show()