In [1]:
import numpy as np
import pandas as pd

## Import the dataset

[Google Local Data (2021). Alaska 10-core (521,515 reviews)](https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/)

This Dataset contains review information on Google map (ratings, text, images, etc.), business metadata (address, geographical info, descriptions, category information, price, open hours, and MISC info), and links (relative businesses) up to Sep 2021 in the United States.

These data have been reduced to extract the k-core, such that each of the remaining users and items have k reviews each.

<ul>
<li>user_id - ID of the reviewer
<li>name - name of the reviwer
<li>time - time of the review (unix time)
<li>rating - rating of the business
<li>text - text of the review
<li>pics - pictures of the review
<li>resp - business response to the review including unix time and text of the response
<li>gmap_id - ID of the business
</ul>

Metadata:
<ul>
<li>name - name of the business
<li>address - address of the business
<li>gmap_id - ID of the business
<li>description - description of the business
<li>latitude - latitude of the business
<li>longitude - longitude of the business
<li>category - category of the business
<li>avg_rating - average rating of the business
<li>num_of_reviews - number of reviews
<li>price - price of the business
<li>hours - open hours
<li>MISC - MISC information
<li>state - the current status of the business (e.g., permanently closed)
<li>relative_results - relative businesses recommended by Google
<li>url - URL of the business
</ul>

Format is one-review-per-line in json.

Citation
1. UCTopic: Unsupervised Contrastive Learning for Phrase Representations and Topic Mining
Jiacheng Li, Jingbo Shang, Julian McAuley
Annual Meeting of the Association for Computational Linguistics (ACL), 2022
[pdf](https://aclanthology.org/2022.acl-long.426.pdf)

2. Personalized Showcases: Generating Multi-Modal Explanations for Recommendations
An Yan, Zhankui He, Jiacheng Li, Tianyang Zhang, Julian Mcauley
The 46th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR), 2023
[pdf](https://arxiv.org/pdf/2207.00422.pdf)

In [2]:
df_reviews = pd.read_json('./data/review-Alaska_10.json.gz', lines=True, compression='gzip')

In [3]:
df_reviews.head(3)

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,1.091298e+20,Nicki Gore,1566331951619,5,We always stay here when in Valdez for silver ...,,,0x56b646ed2220b77f:0xd8975e316de80952
1,1.132409e+20,Allen Ratliff,1504917982385,5,Great campground for the price. Nice hot unlim...,,,0x56b646ed2220b77f:0xd8975e316de80952
2,1.130448e+20,Jonathan Tringali,1474765901185,4,We tent camped here for 2 nights while explori...,,,0x56b646ed2220b77f:0xd8975e316de80952


In [4]:
df_meta = pd.read_json('./data/meta-Alaska.json.gz', lines=True, compression='gzip')

In [5]:
df_meta.head(3)

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,Bear Creek Cabins & RV Park,"Bear Creek Cabins & RV Park, 3181 Richardson H...",0x56b646ed2220b77f:0xd8975e316de80952,,61.100644,-146.214552,"[RV park, Cabin rental agency, Campground]",4.5,18,,,,,"[0x56b6445fd9f9e387:0x6dd3d374ef56431a, 0x56b6...",https://www.google.com/maps/place//data=!4m2!3...
1,Anchorage Market,"Anchorage Market, 88th Ave, Anchorage, AK 99515",0x56c8992b5dee7225:0x9f7f4bf151868cf7,,61.141435,-149.868482,[Farmers' market],4.2,18,,"[[Thursday, Closed], [Friday, 10AM–5PM], [Satu...","{'Service options': ['In-store shopping'], 'Ac...",Closed ⋅ Opens 10AM Fri,,https://www.google.com/maps/place//data=!4m2!3...
2,Happy Camper RV,"Happy Camper RV, 1151 N Shenandoah Dr # 4, Pal...",0x56c8e0455225be87:0xf24828df75e2f8ae,,61.591855,-149.290657,[RV repair shop],4.4,28,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x56c8e104d9929a1d:0x2070ad63defadbf, 0x56c91...",https://www.google.com/maps/place//data=!4m2!3...


## Preprocessing

### Count of businesses by city

In [6]:
def get_city(address):
    if not address:
        return None
    
    address_fields = address.split(', ')

    if len(address_fields) == 1:
        return address_fields[0]
    
    return address_fields[-2]

df_meta['address'].apply(lambda address: get_city(address)).value_counts()[:10]

address
Anchorage      4228
Fairbanks      1285
Wasilla         863
Juneau          642
Palmer          428
Soldotna        344
Ketchikan       323
Homer           307
Eagle River     270
Kenai           240
Name: count, dtype: int64

In [7]:
# City we want to analize
city = 'Fairbanks'

### Get businesses from the city

In [8]:
def is_address_from_city(address, city):
    if not address:
        return False
    
    address_fields = address.split(', ')

    if len(address_fields) == 1:
        return address_fields[0] == city
    
    if address_fields[-2] == city:
        return True
    return False

city_mask = df_meta['address'].apply(lambda address: is_address_from_city(address, city))
businesses_from_city = df_meta[city_mask]

### Get users from the city

In [9]:
def user_is_good_for_city(user_id, city, threshold):
    global businesses_from_city, df_reviews

    user_reviews = df_reviews[df_reviews['user_id'] == user_id]
    reviews_count = user_reviews['gmap_id'].isin(businesses_from_city['gmap_id']).sum()

    return reviews_count >= threshold

user_ids = pd.Series(df_reviews['user_id'].unique())
user_mask = user_ids.apply(lambda user_id: user_is_good_for_city(user_id, city, 5))

users_from_city = user_ids[user_mask]

In [10]:
users_from_city

3        1.103292e+20
37       1.007600e+20
104      1.183128e+20
186      1.088855e+20
193      1.097375e+20
             ...     
19997    1.170402e+20
19998    1.039482e+20
20007    1.112864e+20
20008    1.075783e+20
20020    1.000112e+20
Length: 3622, dtype: float64

#### Check cities of reviews for the first found user

In [11]:
df_reviews[df_reviews['user_id'] == users_from_city[3]]['gmap_id'].apply(
    lambda gmap_id: get_city(
        df_meta[df_meta['gmap_id'] == gmap_id]['address'].iloc[0]
    )
)

3                                    Valdez
244                                  Valdez
34802                             Fairbanks
49967                                 Homer
90360                             Fairbanks
123046                      Fort Wainwright
164050                            Fairbanks
167335                            Fairbanks
176855    Denali National Park and Preserve
176982    Denali National Park and Preserve
177672                            Fairbanks
206507                                Healy
211309                               Palmer
227036                            Fairbanks
252110                                Healy
264412                            Fairbanks
274687                            Fairbanks
312048                               Palmer
346516                            Fairbanks
349357                            Fairbanks
376777                            Fairbanks
487423                            Fairbanks
517661                          