In [1]:
import pandas as pd
import os
import ast
import torch
from transformers import BertTokenizer, BertModel

# Root folder that holds every Yelp dataset used by this notebook
yelp_datasets_path = '../../Data_preprocessing/yelp_datasets/'

# Yelp dataset (from the official Yelp website)
yelp_offical_dataset_path = yelp_datasets_path + 'yelp_dataset_official/'

# Business, user and review JSON files of the official dataset
yelp_academic_dataset_business_path = yelp_offical_dataset_path + 'yelp_academic_dataset_business.json'
yelp_academic_dataset_user_path = yelp_offical_dataset_path + 'yelp_academic_dataset_user.json'
yelp_academic_dataset_review_path = yelp_offical_dataset_path + 'yelp_academic_dataset_review.json'

# Yelp photo dataset (from the official Yelp website)
yelp_offical_photo_dataset_path = yelp_datasets_path + 'yelp_dataset_official_photos/'

# Folder with the photo files themselves
photo_dataset_path = yelp_offical_photo_dataset_path + 'photos/'

def check_users_and_business_from_valid_and_test_is_in_training_set(train_data, valid_data, test_data):
    """Print whether every held-out user and business also occurs in the training set.

    Args:
        train_data: DataFrame with 'user_id' and 'business_id' columns (reference split).
        valid_data: DataFrame with the same two columns (validation split).
        test_data: DataFrame with the same two columns (test split).

    Returns:
        None. The four answers are written to stdout.
    """
    # (question shown to the reader, split being checked, id column compared against training)
    checks = (
        ('Are all users in the validation set also in the training set?', valid_data, 'user_id'),
        ('Are all businesses in the validation set also in the training set?', valid_data, 'business_id'),
        ('Are all users in the test set also in the training set?', test_data, 'user_id'),
        ('Are all businesses in the test set also in the training set?', test_data, 'business_id'),
    )

    # Evaluate every membership test before printing anything, so a failing
    # lookup never leaves a partial report behind.
    answers = [split[id_column].isin(train_data[id_column]).all() for _, split, id_column in checks]

    for (question, _, _), answer in zip(checks, answers):
        print(question, answer)


# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the weights are
# read once per session instead of once per embedded text.
_BERT_SENTENCE_ENCODER_CACHE = {}


def _load_bert_sentence_encoder(model_name='bert-base-uncased'):
    """Return the cached (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _BERT_SENTENCE_ENCODER_CACHE:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertModel.from_pretrained(model_name)
        model.eval()  # inference only: keep dropout disabled
        _BERT_SENTENCE_ENCODER_CACHE[model_name] = (tokenizer, model)
    return _BERT_SENTENCE_ENCODER_CACHE[model_name]


def get_bert_sentence_cls_embedding(text):
    """Embed ``text`` with 'bert-base-uncased' and return the first-token ([CLS]) hidden state.

    Args:
        text: Input passed straight to the tokenizer. Because ``truncation=True``
            is used, input longer than the tokenizer's maximum length is cut off,
            so only the beginning of a long text contributes to the embedding.

    Returns:
        A torch tensor holding ``last_hidden_state[:, 0, :]``, i.e. the hidden
        state at position 0 for every sequence in the tokenized batch. The
        tensor is produced under ``torch.no_grad()`` and carries no autograd graph.
    """
    tokenizer, model = _load_bert_sentence_encoder()
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Embeddings are only read, never back-propagated through, so skip building
    # the autograd graph to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 of every sequence is the [CLS] token.
    return outputs.last_hidden_state[:, 0, :]

def get_vector_or_embedding_and_save_to_txt(folder_file_path, data, column_id_name, column_to_tranform,
                                     method="bert-base-uncased", embedding_level="sentence_cls_embedding"):
    """Embed one text column of ``data`` and write ``<id>\\t<vector>`` lines to a text file.

    The target file is always truncated first. Only the combination
    ``method="bert-base-uncased"`` / ``embedding_level="sentence_cls_embedding"``
    is implemented; for anything else the file is left empty and a notice is printed.

    Args:
        folder_file_path: Path of the text file to (over)write.
        data: DataFrame holding the id column and the text column.
        column_id_name: Name of the column whose value is written before the tab.
        column_to_tranform: Name of the column whose text is embedded.
        method: Embedding method name.
        embedding_level: Which embedding to extract for the chosen method.

    Returns:
        None. Results are written to ``folder_file_path``.
    """
    is_supported = method == "bert-base-uncased" and embedding_level == "sentence_cls_embedding"

    # Mode 'w' already truncates existing content, so a single handle both
    # clears the file and receives the new lines.
    with open(folder_file_path, 'w') as output_file:
        if not is_supported:
            print(f'No embedding written: method "{method}" with embedding level '
                  f'"{embedding_level}" is not handled; {folder_file_path} is left empty')
            return

        # Walk the two needed columns in lockstep instead of materialising a
        # Series per row with iterrows().
        for _id, text in zip(data[column_id_name], data[column_to_tranform]):
            feature_vector = get_bert_sentence_cls_embedding(text)

            # Drop size-1 dimensions and convert the tensor to plain Python
            # floats so the line can be parsed back with ast.literal_eval.
            feature_vector_list = feature_vector.squeeze().tolist()

            output_file.write(f"{_id}\t{feature_vector_list}\n")

    # Print a message indicating the successful saving of feature vectors
    print(f'Feature vectors saved to {folder_file_path}')


def read_vector_or_embedding_txt_return_df(folder_file_path, data_columns):
    """Load a ``<id>\\t<vector>`` text file into a two-column DataFrame.

    Args:
        folder_file_path: Path of the text file; each line holds an id and a
            Python-literal vector separated by a single tab.
        data_columns: The two column names for the resulting DataFrame
            (id column first, vector column second).

    Returns:
        DataFrame with one row per line of the file.
    """
    def parse_record(raw_line):
        # Exactly two tab-separated fields are expected on every line.
        record_id, vector_literal = raw_line.strip().split('\t')
        # literal_eval turns the stored list text back into a list safely.
        return [record_id, ast.literal_eval(vector_literal)]

    with open(folder_file_path, 'r') as handle:
        records = [parse_record(raw_line) for raw_line in handle]

    return pd.DataFrame(records, columns=data_columns)

def training_dataset_get_vector_or_embedding(folder_name, train_data, business_data, user_data, 
                                   method="bert-base-uncased", embedding_level="sentence_cls_embedding",
                                   text_name="user_concatenated_reviews"):
    """Build text feature vectors from the training split and return them as a DataFrame.

    Supported ``text_name`` values:

    * ``"user_concatenated_reviews"`` / ``"business_concatenated_reviews"``:
      all training review texts of each user (or business) are joined with
      ';', saved to ``<text_name>.csv`` in ``folder_name``, embedded, and
      returned together with the concatenated text.
    * ``"categories"``: the ``categories`` column obtained by left-joining
      ``business_data`` onto ``train_data`` is embedded once per business.

    Embeddings are written to
    ``<folder_name>/<text_name>_<method>_<embedding_level>.txt`` and read back
    from that file.

    Args:
        folder_name: Folder that receives the CSV/TXT artefacts.
        train_data: Training DataFrame; needs 'user_id', 'business_id' and 'text'.
        business_data: Business DataFrame; needs 'business_id' and, for the
            categories case, 'categories'.
        user_data: Not used by this function; kept for interface compatibility.
        method: Embedding method forwarded to the embedding writer.
        embedding_level: Embedding level forwarded to the embedding writer.
        text_name: Which text source to embed (see above).

    Returns:
        For the concatenated-review cases, a DataFrame with the id column, the
        concatenated text column and ``<user|business>_feature_vector``.
        For ``"categories"``, a DataFrame with 'business_id' and
        'categories_feature_vector', one row per business.
        ``None`` (after printing a message) for any other ``text_name``.
    """
    review_owner_by_text_name = {
        "user_concatenated_reviews": "user",
        "business_concatenated_reviews": "business",
    }

    if text_name not in review_owner_by_text_name and text_name != "categories":
        print("Text name has not handled yet or wrong text name !")
        return None

    output_file_path = f'{text_name}_{method}_{embedding_level}.txt'
    folder_file_path = os.path.join(folder_name, output_file_path)

    if text_name in review_owner_by_text_name:
        for_who = review_owner_by_text_name[text_name]
        id_name = f"{for_who}_id"
        column_to_tranform = f"{text_name}_text"

        # Preprocess Text Data: one row per id holding all of its training reviews
        concatenated_reviews = train_data.groupby(id_name)['text'].apply(lambda x: ';'.join(x)).reset_index()
        concatenated_reviews = concatenated_reviews.rename(columns={'text': column_to_tranform})
        # Save as csv
        concatenated_reviews.to_csv(os.path.join(folder_name, f'{text_name}.csv'), index=False)

        # Get the embedding of every concatenated text and load it back
        get_vector_or_embedding_and_save_to_txt(folder_file_path, concatenated_reviews,
                                                method=method, embedding_level=embedding_level,
                                                column_id_name=id_name, column_to_tranform=column_to_tranform)
        feature_vectors_df = read_vector_or_embedding_txt_return_df(
            folder_file_path, data_columns=[id_name, f"{for_who}_feature_vector"])

        # Merge data
        return pd.merge(concatenated_reviews, feature_vectors_df, on=id_name)

    # text_name == "categories"
    merged_data = pd.merge(train_data, business_data, on="business_id", how="left")

    # Keep only the first row of each business BEFORE embedding: the result
    # is deduplicated on business_id anyway, so embedding every review row
    # would repeat the same model pass for every review of a business.
    businesses_to_embed = merged_data.drop_duplicates(subset=['business_id'])

    get_vector_or_embedding_and_save_to_txt(folder_file_path, businesses_to_embed,
                                            method=method, embedding_level=embedding_level,
                                            column_id_name="business_id", column_to_tranform=text_name)
    feature_vectors_df = read_vector_or_embedding_txt_return_df(
        folder_file_path, data_columns=["business_id", f"{text_name}_feature_vector"])

    # Ids come back from the file as strings; deduplicate again so the
    # one-row-per-business guarantee holds on what is actually returned.
    feature_vectors_df = feature_vectors_df.drop_duplicates(subset=['business_id']).reset_index(drop=True)

    return feature_vectors_df

In [2]:
# Load the business data and the three research splits, then check that every
# user and business of the validation and test sets also occurs in the training set

import pandas as pd

business_data = pd.read_json(yelp_academic_dataset_business_path, lines=True, encoding='utf-8-sig')
# user_data = pd.read_json(yelp_academic_dataset_user_path, lines=True, encoding='utf-8-sig')
user_data = [] # placeholder instead of loading the user file, to avoid running out of memory

# Alternative data folders (disabled); the splits are read from the current folder
# folder_name = "13percent_5reviews_num_threshold_GPT-3.5 Turbo_random_stars_research_data"
# folder_name = "../13percent_5reviews_num_threshold_GPT-3.5 Turbo_random_stars_research_data"
folder_name = "./"

train_data = pd.read_csv(os.path.join(folder_name, "research_training_set.csv"))
valid_data = pd.read_csv(os.path.join(folder_name, "research_validation_set.csv"))
test_data = pd.read_csv(os.path.join(folder_name, "research_test_set.csv"))

check_users_and_business_from_valid_and_test_is_in_training_set(train_data, valid_data, test_data)

Are all users in the validation set also in the training set? True
Are all businesses in the validation set also in the training set? True
Are all users in the test set also in the training set? True
Are all businesses in the test set also in the training set? True


In [3]:
# Columns of the business dataset:
# business_id, name, address, city, state, postal_code, latitude, longitude, stars, review_count, is_open, attributes, categories, hours

# Columns of the user dataset:
# user_id, name, review_count, yelping_since(time), useful, funny, cool, elite, friends, fans(count), 
# compliment_more, compliment_profile, compliment_cute, compliment_list, compliment_note, compliment_plain, 
# compliment_cool, compliment_funny, compliment_writer, compliment_photos

# Embed the concatenated training reviews of every user
user_concatenated_reviews_with_feature_vectors_df = training_dataset_get_vector_or_embedding(
    folder_name=folder_name,
    train_data=train_data,
    business_data=business_data,
    user_data=user_data,
    method="bert-base-uncased",
    embedding_level="sentence_cls_embedding",
    text_name="user_concatenated_reviews",
)
user_concatenated_reviews_with_feature_vectors_df

Feature vectors saved to 13percent_5reviews_num_threshold_GPT-3.5 Turbo_random_stars_research_data/user_concatenated_reviews_bert-base-uncased_sentence_cls_embedding.txt


Unnamed: 0,user_id,user_concatenated_reviews_text,user_feature_vector
0,-2cKJFFNJ9XVyWBt62mWvA,When it comes to pastries... especially Italia...,"[-0.11832541972398758, -0.06039704009890556, 0..."
1,-3s52C4zL_DHRK0ULG6qtg,Stopped in for a quick coffee and ice cream be...,"[0.04805588349699974, -0.13977167010307312, 0...."
2,-FxsSuwDbIII7yo5BjHpiA,"""Ping! Pow! Boom! Bing!"" - Tommy DeVito, Goodf...","[-0.05209613963961601, 0.06487774848937988, 0...."
3,-G7Zkl1wIWBBmD0KRy_sCw,I totally get the cult of Federal Donuts; the ...,"[0.01472178753465414, -0.23244208097457886, 0...."
4,-GowNe73gDZs9MfS3ugJDQ,Been a little while since last I dropped by Br...,"[0.005706729367375374, -0.09477227181196213, 0..."
...,...,...,...
1803,zsXoPyTcU8ThZGbtAB-Vug,Martinez women's Christmas dinner ...great tim...,"[-0.2649359703063965, -0.2814408242702484, 0.1..."
1804,zu-e06_BM_TdkAZEKMrIww,You can always count on the Foodery to have an...,"[-0.3454887270927429, -0.1860538125038147, 0.2..."
1805,zv7tpu7xeaNyAeFG03d2CA,I am simply only leaving one star so I leave a...,"[-0.19689643383026123, -0.11205922812223434, 0..."
1806,zwXmvn1op5LuFF2Kveqaug,Wooooow. So good.\n\nWith a pretty simple inte...,"[-0.22505134344100952, -0.430458128452301, -0...."


In [4]:
# Embed the concatenated training reviews of every business
business_concatenated_reviews_with_feature_vectors_df = training_dataset_get_vector_or_embedding(
    folder_name=folder_name,
    train_data=train_data,
    business_data=business_data,
    user_data=user_data,
    method="bert-base-uncased",
    embedding_level="sentence_cls_embedding",
    text_name="business_concatenated_reviews",
)
business_concatenated_reviews_with_feature_vectors_df

Feature vectors saved to 13percent_5reviews_num_threshold_GPT-3.5 Turbo_random_stars_research_data/business_concatenated_reviews_bert-base-uncased_sentence_cls_embedding.txt


Unnamed: 0,business_id,business_concatenated_reviews_text,business_feature_vector
0,-0TffRSXXIlBYVbb5AwfTg,"Among Indian food fans in the Philly area, the...","[-0.011190894991159439, -0.14645609259605408, ..."
1,-1PG6k_iezwJmRZLB7f6og,We stumbled upon this place after a bridal sho...,"[-0.06079084426164627, -0.23591890931129456, 0..."
2,-4mDMBfT6N0d-VIUTKVaLg,Simply put: this place is an adorable hole on ...,"[-0.15170690417289734, -0.029171377420425415, ..."
3,-81BXpO5Fuk-RqCabS7LMw,What a great brewery! If you are looking to tr...,"[-0.2533146142959595, -0.404371052980423, 0.45..."
4,-AanHawaDlzWHQjrqRRWig,Meg's team runs so smoothly and I enjoy the se...,"[-0.371690571308136, -0.29266345500946045, -0...."
...,...,...,...
1568,zropQGh2fc7PMpzd32vneQ,I ordered the bulgogi tacos with some chips an...,"[-0.14535580575466156, -0.2953207194805145, 0...."
1569,zujdPV3HT-Y-CKE1GgkMHQ,It's a decent place to go for pastries. So far...,"[0.000845844391733408, -0.2679324448108673, 0...."
1570,zun6IVJa7wYe3wAPqWnPGw,I like this place better than Black Market Eat...,"[-0.07951334863901138, -0.3137638568878174, 0...."
1571,zvzmKaltuHKPeEcBkiUp1w,"I really wanted to like @WedgeCheeseShop , sin...","[0.11999098211526871, -0.08705776184797287, -0..."


In [5]:
# Embed the categories text of every business that appears in the training set
categories_feature_vectors_df = training_dataset_get_vector_or_embedding(
    folder_name=folder_name,
    train_data=train_data,
    business_data=business_data,
    user_data=user_data,
    method="bert-base-uncased",
    embedding_level="sentence_cls_embedding",
    text_name="categories",
)
categories_feature_vectors_df

Feature vectors saved to 13percent_5reviews_num_threshold_GPT-3.5 Turbo_random_stars_research_data/categories_bert-base-uncased_sentence_cls_embedding.txt


Unnamed: 0,business_id,categories_feature_vector
0,EFci0tbSrb7wko6tpVDnbA,"[-0.07826090604066849, 0.0407031811773777, 0.2..."
1,EPMHuTHu46B123ivRIZ-Xg,"[0.15846343338489532, -0.04316771775484085, -0..."
2,J-ciDDEdIHMcChGIyKZnOg,"[-0.05295887589454651, -0.06621156632900238, 0..."
3,teFjQxUqT8c-yxQdoILDVQ,"[-0.3904491662979126, 0.06892769038677216, 0.4..."
4,IzyQVIJG8JAnOiRQPb0-wg,"[-0.05826997384428978, -0.24565061926841736, 0..."
...,...,...
11191,jCKjT0w6BnPxNSZO9Q2uuw,"[-0.1557040810585022, -0.23840990662574768, 0...."
11192,JDv3h1xRFIW8fXckqgTdRg,"[0.1256754845380783, -0.181650310754776, 0.068..."
11193,QGYzYUMsQe6k7__LD91E5w,"[-0.07354993373155594, 0.15001621842384338, 0...."
11194,dRKztV_Vtl7AvZg052SgRQ,"[-0.3296162486076355, -0.041647765785455704, 0..."


In [9]:
# categories_feature_vectors_df = categories_feature_vectors_df.drop_duplicates(subset=['business_id']).reset_index(drop=True)
# categories_feature_vectors_df

Unnamed: 0,business_id,categories_feature_vector
0,EFci0tbSrb7wko6tpVDnbA,"[-0.07826090604066849, 0.0407031811773777, 0.2..."
1,EPMHuTHu46B123ivRIZ-Xg,"[0.15846343338489532, -0.04316771775484085, -0..."
2,J-ciDDEdIHMcChGIyKZnOg,"[-0.05295887589454651, -0.06621156632900238, 0..."
3,teFjQxUqT8c-yxQdoILDVQ,"[-0.3904491662979126, 0.06892769038677216, 0.4..."
4,IzyQVIJG8JAnOiRQPb0-wg,"[-0.05826997384428978, -0.24565061926841736, 0..."
...,...,...
1568,8xuhKP08513N2W32JOMW4Q,"[-0.15446922183036804, -0.2641109526157379, 0...."
1569,WAPhi7JdwIvdTBgM9KsxHA,"[-0.13093726336956024, 0.024154286831617355, 0..."
1570,H91qZiLaUQEHNkDP4DO5pg,"[0.05852658301591873, -0.2432709038257599, 0.1..."
1571,XpTNCVbO5wL1DtDyKzKqfA,"[0.0391341857612133, 0.0245373472571373, 0.024..."


In [10]:
# Attach the user, business and category feature vectors computed above to the
# training, validation and test sets, and save each enriched split as CSV

import os
# folder_name = "13percent_5reviews_num_threshold_GPT-3.5 Turbo_random_stars_research_data"
# folder_name = "../13percent_5reviews_num_threshold_GPT-3.5 Turbo_random_stars_research_data"
folder_name = "./"

train_data = pd.read_csv(os.path.join(folder_name, "research_training_set.csv"))
valid_data = pd.read_csv(os.path.join(folder_name, "research_validation_set.csv"))
test_data = pd.read_csv(os.path.join(folder_name, "research_test_set.csv"))

# Feature tables in the order they are merged, each with its join key
feature_tables = (
    (user_concatenated_reviews_with_feature_vectors_df, 'user_id'),
    (business_concatenated_reviews_with_feature_vectors_df, 'business_id'),
    (categories_feature_vectors_df, 'business_id'),
)

for feature_table, join_key in feature_tables:
    train_data = pd.merge(train_data, feature_table, on=join_key)
print("Train data shape:", train_data.shape)
# save as csv
train_data.to_csv(
    os.path.join(folder_name, "research_training_set_with_concatenated_reviews_and_feature_vectors.csv"),
    index=False,
)

for feature_table, join_key in feature_tables:
    valid_data = pd.merge(valid_data, feature_table, on=join_key)
print("Valid data shape:", valid_data.shape)
# save as csv
valid_data.to_csv(
    os.path.join(folder_name, "research_validation_set_with_concatenated_reviews_and_feature_vectors.csv"),
    index=False,
)

for feature_table, join_key in feature_tables:
    test_data = pd.merge(test_data, feature_table, on=join_key)
print("Test data shape:", test_data.shape)
# save as csv
test_data.to_csv(
    os.path.join(folder_name, "research_test_set_with_concatenated_reviews_and_feature_vectors.csv"),
    index=False,
)

Train data shape: (11196, 15)
Valid data shape: (1400, 15)
Test data shape: (1400, 15)
