In [1]:
import pandas as pd
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from datetime import datetime, timedelta
import ast
import json

In [12]:
# unix datetime
base = pd.Timestamp("1970-01-01")
CHUNK_SIZE = 1000000
REVIEW_DROP = 10

In [3]:
# Function that extract keys from the nested dictionary
def extract_keys(attr, key):
    if attr == None:
        return "{}"
    if key in attr:
        return attr.pop(key)

In [4]:
# convert string to dictionary
def str_to_dict(attr):
    if attr != None:
        return ast.literal_eval(attr)
    else:
        return ast.literal_eval("{}")

In [5]:
def sub_timestamp(element):
    element = element[0]
    a, b = element.split('-')
    a = datetime.strptime(a, "%H:%M")
    b = datetime.strptime(b, "%H:%M")
    return timedelta.total_seconds(b - a)

In [6]:
# check whether CUDA (GPU acceleration) is available on the current system
def get_device():
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    return device

# convert the values of the DataFrame to a PyTorch tensor
def df_to_tensor(df):
    device = get_device()
    return torch.from_numpy(df.values).long().to(device)

# converts a pandas DataFrame to a PyTorch tensor
def df_to_tensor_cpu(df):
    return torch.from_numpy(df.values).long()

In [7]:
def process_data_chunk(reviews, users, restaurants):
    reviews = pd.merge(reviews, users, how='inner', on='user_id')
    reviews = reviews.drop(columns='user_id')
    reviews = pd.merge(reviews, restaurants, how='inner', on='business_id')
    reviews = reviews.drop(columns='business_id')
    print("REVIEWS.HEAD() -------------------------------------------------------------------")
    print(reviews.head())
    reviews = reviews.drop(columns=reviews.columns[0], axis=1)
    print("REVIEWS.DROP() -------------------------------------------------------------------")
    print(reviews.head())
    return df_to_tensor(reviews)

In [8]:
def load_data(train_percent, val_percent, test_percent):
    print("Reading users")
    with open('dataset/yelp_academic_dataset_user.json', 'r') as file:
        data = [json.loads(line) for line in file]

    users = pd.DataFrame(data)
    users = users[users['review_count'] > REVIEW_DROP]
    users['user_id'] = users['user_id'].astype('category')
    users['user_id_num'] = users['user_id'].cat.codes
    users = users[['user_id', 'user_id_num', 'review_count']]
    user_id_to_num = dict(zip(users['user_id'], users['user_id_num']))

    print("Reading businesses")
    with open('dataset/yelp_academic_dataset_business.json', 'r') as file:
        data = [json.loads(line) for line in file]

    restaurants = pd.DataFrame(data)
    restaurants['business_id'] = restaurants['business_id'].astype('category')
    restaurants['business_id_num'] = restaurants['business_id'].cat.codes
    restaurants = restaurants[['business_id', 'business_id_num']]
    rest_id_to_num = dict(zip(restaurants['business_id'], restaurants['business_id_num']))

    print("Reading reviews")
    with open('dataset/yelp_academic_dataset_review.json', 'r') as file:
        data = [json.loads(line) for line in file]

    reviews = pd.DataFrame(data)

    reviews = pd.merge(reviews, users, how='inner', on='user_id')
    reviews = reviews.drop(columns='user_id')
    reviews = pd.merge(reviews, restaurants, how='inner', on='business_id')
    reviews = reviews.drop(columns=['business_id', 'text','date'])
    print("REVIEWS.HEAD() -------------------------------------------------------------------")
    print(reviews.head())
    reviews = reviews.drop(columns=reviews.columns[0], axis=1)
    print("REVIEWS.DROP() -------------------------------------------------------------------")
    print(reviews.head())

    pickle.dump(user_id_to_num, open('dataset/user_id_to_num.pkl', 'wb'))
    pickle.dump(rest_id_to_num, open('dataset/rest_id_to_num.pkl', 'wb'))
    np.save('dataset/data.npy', reviews.values)

    training = reviews.sample(frac=train_percent)

    left = reviews.drop(training.index)
    validation = left.sample(frac=val_percent / (val_percent + test_percent))

    test = left.drop(validation.index)

    print("loaded")

    return df_to_tensor_cpu(training), df_to_tensor_cpu(validation), df_to_tensor_cpu(test), user_id_to_num, rest_id_to_num

In [9]:
if __name__ == "__main__":
    train, val, test, user, rest = load_data(0.6, 0.3, 0.1)
    print("TRAIN ----------------------------------------------")
    print(train.shape)
    print("VAL ----------------------------------------------")
    print(val.shape)
    print("TEST ----------------------------------------------")
    print(test.shape)

Reading users
Reading businesses
Reading reviews
REVIEWS.HEAD() -------------------------------------------------------------------
                review_id  stars  useful  funny  cool  user_id_num   
0  KU_O5udG6zpxOg-VcAEodg    3.0       0      0     0      1575913  \
1  jHmqmoEI-78BGHFJaDKlhQ    2.0       0      0     0       194076   
2  vwIXZHod-jQmGFvx0wCqSg    5.0       0      0     0      1679313   
3  SP32nOhRm-KRAjYMPgf_MQ    3.0       0      0     0      1888226   
4  fvu5n5shkAJDbQjulKNuqw    2.0       8      0     0       758236   

   review_count  business_id_num  
0            33            80739  
1            39            80739  
2             7            80739  
3           490            80739  
4           483            80739  
REVIEWS.DROP() -------------------------------------------------------------------
   stars  useful  funny  cool  user_id_num  review_count  business_id_num
0    3.0       0      0     0      1575913            33            80739
1    2

###  Output data with the text column

In [13]:
# Output data with the text column
print("Reading users")
with open('dataset/yelp_academic_dataset_user.json', 'r') as file:
    data = [json.loads(line) for line in file]

users = pd.DataFrame(data)
users = users[users['review_count'] > REVIEW_DROP]
users['user_id'] = users['user_id'].astype('category')
users['user_id_num'] = users['user_id'].cat.codes
users = users[['user_id', 'user_id_num', 'review_count', 'average_stars']]
user_id_to_num = dict(zip(users['user_id'], users['user_id_num']))

Reading users


In [14]:
print("Reading businesses")
with open('dataset/yelp_academic_dataset_business.json', 'r') as file:
    data = [json.loads(line) for line in file]

restaurants = pd.DataFrame(data)
restaurants['business_id'] = restaurants['business_id'].astype('category')
restaurants['business_id_num'] = restaurants['business_id'].cat.codes
restaurants.rename(columns={'stars': 'business_avg_stars'}, inplace=True)


# Adding categories of the restaurant
restaurants['categories'] = restaurants['categories'].apply(lambda x: x.split(',') if pd.notnull(x) else [])
all_categories = set(category.strip() for categories in restaurants['categories'] for category in categories)
category_to_code = {category: code for code, category in enumerate(all_categories)}
restaurants['category_codes'] = restaurants['categories'].apply(lambda x: [category_to_code[category.strip()] for category in x])

restaurants = restaurants[['business_id', 'business_id_num', 'business_avg_stars', 'category_codes']]
rest_id_to_num = dict(zip(restaurants['business_id'], restaurants['business_id_num']))


Reading businesses


In [15]:
print("Reading reviews")
with open('dataset/yelp_academic_dataset_review.json', 'r') as file:
    data = [json.loads(line) for line in file]

reviews = pd.DataFrame(data)

reviews = pd.merge(reviews, users, how='inner', on='user_id')
reviews = reviews.drop(columns='user_id')
reviews = pd.merge(reviews, restaurants, how='inner', on='business_id')

Reading reviews


In [16]:
reviews.head(2)

Unnamed: 0,review_id,business_id,stars,useful,funny,cool,text,date,user_id_num,review_count,average_stars,business_id_num,business_avg_stars,category_codes
0,KU_O5udG6zpxOg-VcAEodg,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11,540278,33,4.06,80739,3.0,"[344, 1252, 736, 114, 412, 1175, 616]"
1,jHmqmoEI-78BGHFJaDKlhQ,XQfwVwDr-v0ZS3_CbbE5Xw,2.0,0,0,0,"Excellent food but slow, slow, slow. Staff nee...",2017-10-09 15:55:09,66129,39,3.05,80739,3.0,"[344, 1252, 736, 114, 412, 1175, 616]"


In [17]:
len(category_to_code)

1311

In [18]:
reviews = reviews.drop(columns=['review_id', 'business_id', 'useful', 'funny', 'cool', 'text', 'date'])
reviews.head(2)

Unnamed: 0,stars,user_id_num,review_count,average_stars,business_id_num,business_avg_stars,category_codes
0,3.0,540278,33,4.06,80739,3.0,"[344, 1252, 736, 114, 412, 1175, 616]"
1,2.0,66129,39,3.05,80739,3.0,"[344, 1252, 736, 114, 412, 1175, 616]"


In [19]:
# pickle.dump(user_id_to_num, open('dataset/user_id_to_num.pkl', 'wb'))
# pickle.dump(rest_id_to_num, open('dataset/rest_id_to_num.pkl', 'wb'))
pickle.dump(category_to_code, open('dataset/category_to_code.pkl', 'wb'))
np.save('dataset/data_new.npy', reviews.values)