In [1]:
import zipfile
import json
import pandas as pd
import os
import gc

In [4]:
nombre_archivo_zip = 'Google Maps-20231115T202205Z-001.zip'

# Inicializar una lista para almacenar DataFrames individuales
reviews = []
metadata = []

# Abrir el archivo ZIP
with zipfile.ZipFile(nombre_archivo_zip, 'r') as zip_ref:

    # Obtener una lista de los archivos de reviews
    archivos_reviews = [f for f in zip_ref.namelist() if f.startswith("Google Maps/reviews-estados") and f.endswith('.json')]
    # Iterar sobre los archivos en la carpeta
    for nombre_archivo in archivos_reviews:
        with zip_ref.open(nombre_archivo) as json_file:
            # Leer el contenido del archivo como una cadena
            json_str = json_file.read().decode('utf-8')

            # Dividir la cadena en objetos JSON individuales
            objetos_json = [json.loads(obj_str) for obj_str in json_str.split('\n') if obj_str.strip()]

            # Cargar los archivos en un dataframe
            df = pd.DataFrame(objetos_json)
            reviews.append(df)

    # Obtener una lista de los archivos de reviews
    metadata_sitios = [f for f in zip_ref.namelist() if f.startswith("Google Maps/metadata-sitios") and f.endswith('.json')]
    # Iterar sobre los archivos en la carpeta
    for sitio in metadata_sitios:
        with zip_ref.open(sitio) as meta_json:
            # Leer el contenido del archivo como una cadena
            meta_str = meta_json.read().decode('utf-8')

            # Dividir la cadena en objetos JSON individuales
            objetos_meta = [json.loads(obj_str1) for obj_str1 in meta_str.split('\n') if obj_str1.strip()]

            # Cargar los archivos en un dataframe
            meta = pd.DataFrame(objetos_meta)
            metadata.append(meta)

# Concatenar todos los DataFrames en uno solo
if reviews:
    df_reviews = pd.concat(reviews, ignore_index=True)

# Concatenar todos los DataFrames en uno solo
if metadata:
    df_metadata = pd.concat(metadata, ignore_index=True)

In [5]:
df_reviews.head()

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,103388204835998302713,Mmustangg Ggnatsumm,1592280059342,5,,,,0x8760bbfbfa6847e1:0xad949b36fcb194a2
1,101493059298687786658,Luis Antonio,1535471202673,5,,,,0x8760bbfbfa6847e1:0xad949b36fcb194a2
2,108065566978040119641,Abby N,1548907384201,4,,,,0x8760bbfbfa6847e1:0xad949b36fcb194a2
3,112164287381301303443,Marsha Davidson,1531441401770,4,,,,0x8760bbfbfa6847e1:0xad949b36fcb194a2
4,112590721316665651581,Respect my Hussle!,1500047718230,4,,,,0x8760bbfbfa6847e1:0xad949b36fcb194a2


In [6]:
df_metadata.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,Porter Pharmacy,"Porter Pharmacy, 129 N Second St, Cochran, GA ...",0x88f16e41928ff687:0x883dad4fd048e8f8,,32.3883,-83.3571,[Pharmacy],4.9,16,,"[[Friday, 8AM–6PM], [Saturday, 8AM–12PM], [Sun...","{'Service options': ['In-store shopping', 'Sam...",Open ⋅ Closes 6PM,"[0x88f16e41929435cf:0x5b2532a2885e9ef6, 0x88f1...",https://www.google.com/maps/place//data=!4m2!3...
1,City Textile,"City Textile, 3001 E Pico Blvd, Los Angeles, C...",0x80c2c98c0e3c16fd:0x29ec8a728764fdf9,,34.018891,-118.21529,[Textile exporter],4.5,6,,,,Open now,"[0x80c2c624136ea88b:0xb0315367ed448771, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
2,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,,34.058092,-118.29213,[Korean restaurant],4.4,18,,"[[Thursday, 6:30AM–6PM], [Friday, 6:30AM–6PM],...","{'Service options': ['Takeout', 'Dine-in', 'De...",Open ⋅ Closes 6PM,"[0x80c2c78249aba68f:0x35bf16ce61be751d, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
3,Nova Fabrics,"Nova Fabrics, 2200 E 11th St, Los Angeles, CA ...",0x80c2c89923b27a41:0x32041559418d447,,34.023669,-118.23293,[Fabric store],3.3,6,,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...","{'Service options': ['In-store shopping'], 'Pa...",Open ⋅ Closes 5PM,"[0x80c2c8811477253f:0x23a8a492df1918f7, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
4,Nobel Textile Co,"Nobel Textile Co, 719 E 9th St, Los Angeles, C...",0x80c2c632f933b073:0xc31785961fe826a6,,34.036694,-118.249421,[Fabric store],4.3,7,,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...",{'Service options': ['In-store pickup']},Open ⋅ Closes 5PM,"[0x80c2c62c496083d1:0xdefa11317fe870a1, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...


In [7]:
gc.collect()

7

In [8]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8538485 entries, 0 to 8538484
Data columns (total 8 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   user_id  object
 1   name     object
 2   time     int64 
 3   rating   int64 
 4   text     object
 5   pics     object
 6   resp     object
 7   gmap_id  object
dtypes: int64(2), object(6)
memory usage: 521.1+ MB


In [10]:
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275001 entries, 0 to 275000
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   name              274994 non-null  object 
 1   address           264939 non-null  object 
 2   gmap_id           275001 non-null  object 
 3   description       13155 non-null   object 
 4   latitude          275001 non-null  float64
 5   longitude         275001 non-null  float64
 6   category          272740 non-null  object 
 7   avg_rating        275001 non-null  float64
 8   num_of_reviews    275001 non-null  int64  
 9   price             13450 non-null   object 
 10  hours             192448 non-null  object 
 11  MISC              194972 non-null  object 
 12  state             195523 non-null  object 
 13  relative_results  238771 non-null  object 
 14  url               275001 non-null  object 
dtypes: float64(3), int64(1), object(11)
memory usage: 31.5+ MB


In [11]:
reviews_completo = pd.merge(df_reviews, df_metadata, on="gmap_id", how="inner")
reviews_completo.head()

Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,name_y,address,...,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,103563353519118158246,Peri Gray,1516122675780,5,Great place to care for our children.,,,0x532af45db8f30779:0xd9be9359f1e56178,CRST WIC Office,"CRST WIC Office, 16618 243rd Avenue, Eagle But...",...,-101.239919,,4.7,8,,,,,"[0x532af4588c5f80b1:0x30071640ecb03b5, 0x532af...",https://www.google.com/maps/place//data=!4m2!3...
1,103563353519118158246,Peri Gray,1516122675780,5,Great place to care for our children.,,,0x532af45db8f30779:0xd9be9359f1e56178,CRST WIC Office,"CRST WIC Office, 16618 243rd Avenue, Eagle But...",...,-101.239919,,4.7,8,,,,,"[0x532af4588c5f80b1:0x30071640ecb03b5, 0x532af...",https://www.google.com/maps/place//data=!4m2!3...
2,101824980797027243148,Suzy Berndt,1532922350314,5,Th sw y are so nice,,,0x532af45db8f30779:0xd9be9359f1e56178,CRST WIC Office,"CRST WIC Office, 16618 243rd Avenue, Eagle But...",...,-101.239919,,4.7,8,,,,,"[0x532af4588c5f80b1:0x30071640ecb03b5, 0x532af...",https://www.google.com/maps/place//data=!4m2!3...
3,101824980797027243148,Suzy Berndt,1532922350314,5,Th sw y are so nice,,,0x532af45db8f30779:0xd9be9359f1e56178,CRST WIC Office,"CRST WIC Office, 16618 243rd Avenue, Eagle But...",...,-101.239919,,4.7,8,,,,,"[0x532af4588c5f80b1:0x30071640ecb03b5, 0x532af...",https://www.google.com/maps/place//data=!4m2!3...
4,108711640480272772398,Rosemary Red Legs,1530969093932,5,Went with my daughter,,,0x532af45db8f30779:0xd9be9359f1e56178,CRST WIC Office,"CRST WIC Office, 16618 243rd Avenue, Eagle But...",...,-101.239919,,4.7,8,,,,,"[0x532af4588c5f80b1:0x30071640ecb03b5, 0x532af...",https://www.google.com/maps/place//data=!4m2!3...


In [12]:
reviews_completo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29601 entries, 0 to 29600
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           29601 non-null  object 
 1   name_x            29601 non-null  object 
 2   time              29601 non-null  int64  
 3   rating            29601 non-null  int64  
 4   text              17467 non-null  object 
 5   pics              665 non-null    object 
 6   resp              3428 non-null   object 
 7   gmap_id           29601 non-null  object 
 8   name_y            29601 non-null  object 
 9   address           29482 non-null  object 
 10  description       2442 non-null   object 
 11  latitude          29601 non-null  float64
 12  longitude         29601 non-null  float64
 13  category          29549 non-null  object 
 14  avg_rating        29601 non-null  float64
 15  num_of_reviews    29601 non-null  int64  
 16  price             3084 non-null   object

In [13]:
reviews_completo1 = pd.merge(df_reviews, df_metadata, on="gmap_id", how="left")
reviews_completo1.head()

Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,name_y,address,...,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,103388204835998302713,Mmustangg Ggnatsumm,1592280059342,5,,,,0x8760bbfbfa6847e1:0xad949b36fcb194a2,,,...,,,,,,,,,,
1,101493059298687786658,Luis Antonio,1535471202673,5,,,,0x8760bbfbfa6847e1:0xad949b36fcb194a2,,,...,,,,,,,,,,
2,108065566978040119641,Abby N,1548907384201,4,,,,0x8760bbfbfa6847e1:0xad949b36fcb194a2,,,...,,,,,,,,,,
3,112164287381301303443,Marsha Davidson,1531441401770,4,,,,0x8760bbfbfa6847e1:0xad949b36fcb194a2,,,...,,,,,,,,,,
4,112590721316665651581,Respect my Hussle!,1500047718230,4,,,,0x8760bbfbfa6847e1:0xad949b36fcb194a2,,,...,,,,,,,,,,


In [14]:
reviews_completo1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8544251 entries, 0 to 8544250
Data columns (total 22 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   name_x            object 
 2   time              int64  
 3   rating            int64  
 4   text              object 
 5   pics              object 
 6   resp              object 
 7   gmap_id           object 
 8   name_y            object 
 9   address           object 
 10  description       object 
 11  latitude          float64
 12  longitude         float64
 13  category          object 
 14  avg_rating        float64
 15  num_of_reviews    float64
 16  price             object 
 17  hours             object 
 18  MISC              object 
 19  state             object 
 20  relative_results  object 
 21  url               object 
dtypes: float64(4), int64(2), object(16)
memory usage: 1.4+ GB


In [15]:
gc.collect()

493

In [16]:
reviews_completo["category"].value_counts()

category
[Nail salon]                                                  645
[Bar]                                                         565
[Barber shop]                                                 545
[Park]                                                        529
[Restaurant]                                                  521
                                                             ... 
[Phone repair service, Computer repair service]                10
[Eye care center, Contact lenses supplier, Optometrist]        10
[Paint store, Hardware store]                                  10
[Surgical center]                                              10
[Federal credit union, ATM, Credit union, Mortgage lender]     10
Name: count, Length: 783, dtype: int64

In [26]:
categorias = reviews_completo["category"].explode().unique()
categorias

array([None, 'Youth social services organization', 'Auto repair shop',
       'Truck repair shop', 'Auto body shop', 'Towing service',
       'Coffee shop', 'Cafe', 'Coffee store', 'Espresso bar',
       'Nail salon', 'Gift shop', 'Bar', 'Animal feed store',
       'Beauty salon', 'Boat dealer', 'Boat repair shop', 'Park',
       'Urgent care center', 'Medical clinic', 'Lake', 'Uniform store',
       'HVAC contractor', 'Air conditioning contractor',
       'Heating contractor', 'Mechanical contractor', 'Metal fabricator',
       'Hair salon', 'Motorcycle dealer', 'Hot tub store',
       'Billiards supply store', 'Furniture store',
       'Hot tub repair service', 'Outdoor furniture store',
       'Swimming pool contractor', 'Swimming pool repair service',
       'Swimming pool supply store', 'Baseball field',
       'Beauty supply store', 'Grocery store', 'Transportation service',
       'Vitamin & supplements store', 'Sports nutrition store',
       'Weight loss service', 'Church', 'T

In [27]:
len(categorias)

846

In [28]:
# Se abre el dataframe en las diferentes categorías
reviews_completo = reviews_completo.explode("category")
reviews_completo.head()

Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,name_y,address,...,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,103563353519118158246,Peri Gray,1516122675780,5,Great place to care for our children.,,,0x532af45db8f30779:0xd9be9359f1e56178,CRST WIC Office,"CRST WIC Office, 16618 243rd Avenue, Eagle But...",...,-101.239919,,4.7,8,,,,,"[0x532af4588c5f80b1:0x30071640ecb03b5, 0x532af...",https://www.google.com/maps/place//data=!4m2!3...
1,103563353519118158246,Peri Gray,1516122675780,5,Great place to care for our children.,,,0x532af45db8f30779:0xd9be9359f1e56178,CRST WIC Office,"CRST WIC Office, 16618 243rd Avenue, Eagle But...",...,-101.239919,,4.7,8,,,,,"[0x532af4588c5f80b1:0x30071640ecb03b5, 0x532af...",https://www.google.com/maps/place//data=!4m2!3...
2,101824980797027243148,Suzy Berndt,1532922350314,5,Th sw y are so nice,,,0x532af45db8f30779:0xd9be9359f1e56178,CRST WIC Office,"CRST WIC Office, 16618 243rd Avenue, Eagle But...",...,-101.239919,,4.7,8,,,,,"[0x532af4588c5f80b1:0x30071640ecb03b5, 0x532af...",https://www.google.com/maps/place//data=!4m2!3...
3,101824980797027243148,Suzy Berndt,1532922350314,5,Th sw y are so nice,,,0x532af45db8f30779:0xd9be9359f1e56178,CRST WIC Office,"CRST WIC Office, 16618 243rd Avenue, Eagle But...",...,-101.239919,,4.7,8,,,,,"[0x532af4588c5f80b1:0x30071640ecb03b5, 0x532af...",https://www.google.com/maps/place//data=!4m2!3...
4,108711640480272772398,Rosemary Red Legs,1530969093932,5,Went with my daughter,,,0x532af45db8f30779:0xd9be9359f1e56178,CRST WIC Office,"CRST WIC Office, 16618 243rd Avenue, Eagle But...",...,-101.239919,,4.7,8,,,,,"[0x532af4588c5f80b1:0x30071640ecb03b5, 0x532af...",https://www.google.com/maps/place//data=!4m2!3...


In [29]:
reviews_completo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71159 entries, 0 to 29600
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   user_id           71159 non-null  object 
 1   name_x            71159 non-null  object 
 2   time              71159 non-null  int64  
 3   rating            71159 non-null  int64  
 4   text              42715 non-null  object 
 5   pics              1358 non-null   object 
 6   resp              10635 non-null  object 
 7   gmap_id           71159 non-null  object 
 8   name_y            71159 non-null  object 
 9   address           70834 non-null  object 
 10  description       9481 non-null   object 
 11  latitude          71159 non-null  float64
 12  longitude         71159 non-null  float64
 13  category          71107 non-null  object 
 14  avg_rating        71159 non-null  float64
 15  num_of_reviews    71159 non-null  int64  
 16  price             10895 non-null  object 
 17

In [31]:
palabra_clave = "restaurant"
reviews_restaurant = reviews_completo.dropna(subset="category")
filtro = reviews_restaurant["category"].str.contains(palabra_clave, case=False)
restaurantes = reviews_restaurant[filtro]
restaurantes.head()

Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,name_y,address,...,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
1274,111058951077443588250,Mellissa Newling,1491249563055,3,Orders wrong every time I go here. Takes forev...,,,0x878011faadb8eead:0xe778cbfd3bfbed0,KFC,"KFC, 603 E Sioux Ave, Pierre, SD 57501",...,-100.347168,Fast food restaurant,2.4,18,$,"[[Monday, 11AM–9PM], [Tuesday, 11AM–9PM], [Wed...","{'Service options': ['Delivery', 'Takeout'], '...",Permanently closed,"[0x878011faac2b47fb:0x544e98484ed978fc, 0x8780...",https://www.google.com/maps/place//data=!4m2!3...
1274,111058951077443588250,Mellissa Newling,1491249563055,3,Orders wrong every time I go here. Takes forev...,,,0x878011faadb8eead:0xe778cbfd3bfbed0,KFC,"KFC, 603 E Sioux Ave, Pierre, SD 57501",...,-100.347168,American restaurant,2.4,18,$,"[[Monday, 11AM–9PM], [Tuesday, 11AM–9PM], [Wed...","{'Service options': ['Delivery', 'Takeout'], '...",Permanently closed,"[0x878011faac2b47fb:0x544e98484ed978fc, 0x8780...",https://www.google.com/maps/place//data=!4m2!3...
1274,111058951077443588250,Mellissa Newling,1491249563055,3,Orders wrong every time I go here. Takes forev...,,,0x878011faadb8eead:0xe778cbfd3bfbed0,KFC,"KFC, 603 E Sioux Ave, Pierre, SD 57501",...,-100.347168,Chicken restaurant,2.4,18,$,"[[Monday, 11AM–9PM], [Tuesday, 11AM–9PM], [Wed...","{'Service options': ['Delivery', 'Takeout'], '...",Permanently closed,"[0x878011faac2b47fb:0x544e98484ed978fc, 0x8780...",https://www.google.com/maps/place//data=!4m2!3...
1274,111058951077443588250,Mellissa Newling,1491249563055,3,Orders wrong every time I go here. Takes forev...,,,0x878011faadb8eead:0xe778cbfd3bfbed0,KFC,"KFC, 603 E Sioux Ave, Pierre, SD 57501",...,-100.347168,Chicken wings restaurant,2.4,18,$,"[[Monday, 11AM–9PM], [Tuesday, 11AM–9PM], [Wed...","{'Service options': ['Delivery', 'Takeout'], '...",Permanently closed,"[0x878011faac2b47fb:0x544e98484ed978fc, 0x8780...",https://www.google.com/maps/place//data=!4m2!3...
1275,117708793057964375634,Darcy Green,1499523764411,2,I had taco bell food and it was cold. I waited...,,,0x878011faadb8eead:0xe778cbfd3bfbed0,KFC,"KFC, 603 E Sioux Ave, Pierre, SD 57501",...,-100.347168,Fast food restaurant,2.4,18,$,"[[Monday, 11AM–9PM], [Tuesday, 11AM–9PM], [Wed...","{'Service options': ['Delivery', 'Takeout'], '...",Permanently closed,"[0x878011faac2b47fb:0x544e98484ed978fc, 0x8780...",https://www.google.com/maps/place//data=!4m2!3...


In [32]:
restaurantes["category"].value_counts()

category
Restaurant                      1087
Pizza restaurant                 543
Delivery Restaurant              397
Takeout Restaurant               322
Chinese restaurant               297
Fast food restaurant             247
Asian restaurant                 221
Breakfast restaurant             164
Vegetarian restaurant            164
American restaurant              150
Italian restaurant               107
Tapas restaurant                  80
Taco restaurant                   63
Mexican restaurant                63
Restaurant supply store           56
Spanish restaurant                48
Latin American restaurant         48
Burrito restaurant                48
Seafood restaurant                47
Dominican restaurant              44
Ecuadorian restaurant             43
Colombian restaurant              41
Hawaiian restaurant               38
Health food restaurant            36
Gluten-free restaurant            30
Soup restaurant                   28
Chicken wings restaurant     

In [33]:
restaurantes["category"].unique()

array(['Fast food restaurant', 'American restaurant',
       'Chicken restaurant', 'Chicken wings restaurant', 'Restaurant',
       'Breakfast restaurant', 'Pizza restaurant', 'Norwegian restaurant',
       'Down home cooking restaurant', 'Takeout Restaurant',
       'Soup restaurant', 'Health food restaurant',
       'Dominican restaurant', 'Italian restaurant', 'Taco restaurant',
       'Cape Verdean restaurant', 'Seafood restaurant',
       'Restaurant supply store', 'Chinese restaurant',
       'Delivery Chinese restaurant', 'Mexican restaurant',
       'Delivery Restaurant', 'Asian restaurant', 'Hot pot restaurant',
       'Burrito restaurant', 'Latin American restaurant',
       'Spanish restaurant', 'Tapas restaurant', 'Vegetarian restaurant',
       'Japanese restaurant', 'Tex-Mex restaurant',
       'Guatemalan restaurant', 'Cajun restaurant',
       'Caribbean restaurant', 'Southern restaurant (US)',
       'Lunch restaurant', 'Colombian restaurant',
       'Asian fusion rest

In [42]:
palabra_clave = "lodging"
reviews_hotel = reviews_completo.dropna(subset="category")
filtro1 = reviews_hotel["category"].str.contains(palabra_clave, case=False)
hoteles = reviews_hotel[filtro1]
hoteles.head()

Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,name_y,address,...,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
