In [9]:
import pandas as pd
import os
import ast
import torch
from transformers import BertTokenizer, BertModel

def check_users_and_business_from_valid_and_test_is_in_training_set(train_data, valid_data, test_data):
    """Print whether the validation and test splits only contain known users/businesses.

    Each of the three frames must expose 'user_id' and 'business_id' columns.
    For the validation split and then the test split, the function checks that
    every user id and every business id also occurs in ``train_data`` and prints
    one line per check. Nothing is returned.
    """
    # All four membership checks are evaluated before anything is printed.
    coverage_report = [
        ('Are all users in the validation set also in the training set?',
         valid_data['user_id'].isin(train_data['user_id']).all()),
        ('Are all businesses in the validation set also in the training set?',
         valid_data['business_id'].isin(train_data['business_id']).all()),
        ('Are all users in the test set also in the training set?',
         test_data['user_id'].isin(train_data['user_id']).all()),
        ('Are all businesses in the test set also in the training set?',
         test_data['business_id'].isin(train_data['business_id']).all()),
    ]

    for question, answer in coverage_report:
        print(question, answer)


# Cache of (tokenizer, model) pairs keyed by checkpoint name, so the weights are
# loaded once per kernel session instead of once per embedded text.
_BERT_RESOURCES = {}


def _load_bert_resources(model_name='bert-base-uncased'):
    """Return the cached (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _BERT_RESOURCES:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertModel.from_pretrained(model_name)
        # Inference only: make sure dropout and similar layers are disabled.
        model.eval()
        _BERT_RESOURCES[model_name] = (tokenizer, model)
    return _BERT_RESOURCES[model_name]


def get_bert_sentence_cls_embedding(text):
    """Embed ``text`` with 'bert-base-uncased' and return the [CLS] token embedding.

    Parameters
    ----------
    text :
        Input handed straight to the tokenizer (a string in the calls visible
        in this notebook). The tokenizer is called with ``truncation=True``
        and ``padding=True``.

    Returns
    -------
    torch.Tensor
        ``last_hidden_state[:, 0, :]``, i.e. the hidden state at position 0 of
        every sequence in the batch. The tensor carries no autograd history.
    """
    tokenizer, model = _load_bert_resources()
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # No gradients are needed for feature extraction; skipping them saves memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    cls_sentence_embedding = outputs.last_hidden_state[:, 0, :]  # Extract [CLS] token embedding
    return cls_sentence_embedding

def get_vector_or_embedding_and_save_to_txt(folder_file_path, data, column_id_name, column_to_tranform,
                                     method="bert-base-uncased", embedding_level="sentence_cls_embedding"):
    """Embed one text column of ``data`` and write ``<id>\\t<vector>`` lines to a text file.

    Parameters
    ----------
    folder_file_path :
        Path of the output text file. Any existing content is overwritten.
    data :
        DataFrame holding the ids and the texts.
    column_id_name :
        Name of the column whose value is written before the tab.
    column_to_tranform :
        Name of the column whose text is embedded.
    method, embedding_level :
        Embedding recipe. Only ``"bert-base-uncased"`` combined with
        ``"sentence_cls_embedding"`` is implemented.

    Raises
    ------
    ValueError
        If the ``method`` / ``embedding_level`` combination is not implemented.
        The check runs before the file is opened, so an existing file is left
        untouched instead of being silently truncated to an empty file.
    """
    if not (method == "bert-base-uncased" and embedding_level == "sentence_cls_embedding"):
        raise ValueError(
            f"Unsupported embedding configuration: method={method!r}, "
            f"embedding_level={embedding_level!r}"
        )

    # Mode 'w' truncates on open, so one handle both clears stale content and
    # receives the new rows.
    with open(folder_file_path, 'w') as output_file:
        # Only two columns are needed; zipping them avoids building a Series per
        # row and the row-wise dtype coercion that iterrows() applies.
        for _id, text in zip(data[column_id_name], data[column_to_tranform]):
            feature_vector = get_bert_sentence_cls_embedding(text)

            # Convert PyTorch Tensor to a plain list so it can be stored as text
            feature_vector_list = feature_vector.squeeze().tolist()

            # One line per row: id, a tab, then the list literal
            output_file.write(f"{_id}\t{feature_vector_list}\n")

    # Print a message indicating the successful saving of feature vectors
    print(f'Feature vectors saved to {folder_file_path}')


def read_vector_or_embedding_txt_return_df(folder_file_path, data_columns):
    """Load a ``<id>\\t<vector>`` text file into a two-column DataFrame.

    Every line is stripped and split on the tab character into an id and the
    textual form of a vector; the vector text is parsed with
    ``ast.literal_eval``. ``data_columns`` supplies the names of the resulting
    id and vector columns, in that order.
    """
    with open(folder_file_path, 'r') as handle:
        records = [
            # ids stay strings; the vector text becomes a Python object again
            [record_id, ast.literal_eval(vector_text)]
            for record_id, vector_text in (raw_line.strip().split('\t') for raw_line in handle)
        ]

    return pd.DataFrame(records, columns=data_columns)

def training_dataset_get_vector_or_embedding(
    folder_path,
    argumented_results_data,
    method="bert-base-uncased", 
    embedding_level="sentence_cls_embedding",
    text_name="argumented_text_result"):
    """Embed the argumented-text column of a frame and return the frame with the vectors attached.

    The id column is the first column whose lower-cased name contains ``'id'``;
    the text column is the first column whose lower-cased name contains
    ``text_name``. The embeddings are written to
    ``<folder_path>/<text column>_<method>_<embedding_level>.txt``, read back,
    and merged onto the input frame on the id column as
    ``<text column>_feature_vector``.

    Parameters
    ----------
    folder_path :
        Directory in which the embedding text file is written.
    argumented_results_data :
        DataFrame containing an id column and the text column to embed.
    method, embedding_level :
        Forwarded to ``get_vector_or_embedding_and_save_to_txt``.
    text_name :
        Only ``"argumented_text_result"`` is handled.

    Returns
    -------
    pandas.DataFrame or None
        The merged frame, or ``None`` (after printing a message) when
        ``text_name`` is not handled.

    Raises
    ------
    IndexError
        If no id column or no column matching ``text_name`` exists.
    """
    # Validate text_name first: previously the column lookups below ran before
    # this check, so a wrong name failed with a bare IndexError and the message
    # was never shown.
    if text_name != "argumented_text_result":
        print("Text name has not handled yet or wrong text name !")
        return None

    cols = list(argumented_results_data.columns)
    id_column = next((col for col in cols if 'id' in col.lower()), None)
    forwhat_column = next((col for col in cols if text_name in col.lower()), None)
    if id_column is None:
        raise IndexError(f"No column containing 'id' found among {cols}")
    if forwhat_column is None:
        raise IndexError(f"No column containing {text_name!r} found among {cols}")

    # Get BERT embedding
    output_file_path = f'{forwhat_column}_{method}_{embedding_level}.txt'
    folder_file_path = os.path.join(folder_path, output_file_path)
    get_vector_or_embedding_and_save_to_txt(folder_file_path, argumented_results_data,
                                            method=method, embedding_level=embedding_level,
                                            column_id_name=id_column, column_to_tranform=forwhat_column)

    feature_vectors_df = read_vector_or_embedding_txt_return_df(
        folder_file_path,
        data_columns=[id_column, f"{forwhat_column}_feature_vector"])

    # Ids come back from the text file as strings; restore the source dtype so
    # the merge keys are comparable when the original ids are numeric.
    source_id_dtype = argumented_results_data[id_column].dtype
    if feature_vectors_df[id_column].dtype != source_id_dtype:
        feature_vectors_df[id_column] = feature_vectors_df[id_column].astype(source_id_dtype)

    # Merge data
    argumented_results_data_with_feature_vectors_df = pd.merge(
        argumented_results_data, feature_vectors_df, on=id_column)

    return argumented_results_data_with_feature_vectors_df

In [10]:
# Verify that every user and business in the validation and test sets also appears in the training set.

import pandas as pd

folder_path = "../original"

# Load the three splits in train / validation / test order.
train_data, valid_data, test_data = (
    pd.read_csv(os.path.join(folder_path, split_file))
    for split_file in (
        "research_training_set.csv",
        "research_validation_set.csv",
        "research_test_set.csv",
    )
)

check_users_and_business_from_valid_and_test_is_in_training_set(train_data, valid_data, test_data)

Are all users in the validation set also in the training set? True
Are all businesses in the validation set also in the training set? True
Are all users in the test set also in the training set? True
Are all businesses in the test set also in the training set? True


In [11]:
# Column reference, business table:
#   business_id, name, address, city, state, postal_code, latitude, longitude,
#   stars, review_count, is_open, attributes, categories, hours
#
# Column reference, user table:
#   user_id, name, review_count, yelping_since(time), useful, funny, cool, elite, friends, fans(count),
#   compliment_more, compliment_profile, compliment_cute, compliment_list, compliment_note, compliment_plain,
#   compliment_cool, compliment_funny, compliment_writer, compliment_photos

# The embedding text file is written into folder_path; the input CSV is read from the working directory.
folder_path = './'
user_reviews_argumented = pd.read_csv(
    "user_id_user_concatenated_reviews_text_with_argumented_results.csv"
)

# Embed the argumented text of every user and attach the vectors as a new column.
user_reviews_argumented_text_with_feature_vectors_df = training_dataset_get_vector_or_embedding(
    folder_path=folder_path,
    argumented_results_data=user_reviews_argumented,
    method="bert-base-uncased",
    embedding_level="sentence_cls_embedding",
    text_name="argumented_text_result",
)
user_reviews_argumented_text_with_feature_vectors_df

Feature vectors saved to ./user_concatenated_reviews_text_argumented_text_result_bert-base-uncased_sentence_cls_embedding.txt


Unnamed: 0,user_id,user_concatenated_reviews_text,system_role_content,prompt,user_concatenated_reviews_text_argumented_text_result,user_concatenated_reviews_text_argumented_text_result_feature_vector
0,-2cKJFFNJ9XVyWBt62mWvA,When it comes to pastries... especially Italia...,test,test:,argumented text test,"[-0.3077409565448761, -0.05624747276306152, -0..."
1,-3s52C4zL_DHRK0ULG6qtg,Stopped in for a quick coffee and ice cream be...,test,test:,argumented text test,"[-0.3077409565448761, -0.05624747276306152, -0..."
2,-FxsSuwDbIII7yo5BjHpiA,"""Ping! Pow! Boom! Bing!"" - Tommy DeVito, Goodf...",test,test:,argumented text test,"[-0.3077409565448761, -0.05624747276306152, -0..."
3,-G7Zkl1wIWBBmD0KRy_sCw,I totally get the cult of Federal Donuts; the ...,test,test:,argumented text test,"[-0.3077409565448761, -0.05624747276306152, -0..."
4,-GowNe73gDZs9MfS3ugJDQ,Been a little while since last I dropped by Br...,test,test:,argumented text test,"[-0.3077409565448761, -0.05624747276306152, -0..."
...,...,...,...,...,...,...
1803,zsXoPyTcU8ThZGbtAB-Vug,Martinez women's Christmas dinner ...great tim...,test,test:,argumented text test,"[-0.3077409565448761, -0.05624747276306152, -0..."
1804,zu-e06_BM_TdkAZEKMrIww,You can always count on the Foodery to have an...,test,test:,argumented text test,"[-0.3077409565448761, -0.05624747276306152, -0..."
1805,zv7tpu7xeaNyAeFG03d2CA,I am simply only leaving one star so I leave a...,test,test:,argumented text test,"[-0.3077409565448761, -0.05624747276306152, -0..."
1806,zwXmvn1op5LuFF2Kveqaug,Wooooow. So good.\n\nWith a pretty simple inte...,test,test:,argumented text test,"[-0.3077409565448761, -0.05624747276306152, -0..."


In [4]:
# folder_path = './'
# business_reviews_argumented = pd.read_csv("business_id_business_concatenated_reviews_text_with_argumented_results.csv")
# business_reviews_argumented_text_with_feature_vectors_df = training_dataset_get_vector_or_embedding(
#     folder_path=folder_path,
#     argumented_results_data=business_reviews_argumented,
#     method="bert-base-uncased", 
#     embedding_level="sentence_cls_embedding",
#     text_name="argumented_text_result")
# business_reviews_argumented_text_with_feature_vectors_df

Feature vectors saved to 13percent_5reviews_num_threshold_GPT-3.5 Turbo_random_stars_research_data/business_concatenated_reviews_bert-base-uncased_sentence_cls_embedding.txt


Unnamed: 0,business_id,business_concatenated_reviews_text,business_feature_vector
0,-0TffRSXXIlBYVbb5AwfTg,"Among Indian food fans in the Philly area, the...","[-0.011190894991159439, -0.14645609259605408, ..."
1,-1PG6k_iezwJmRZLB7f6og,We stumbled upon this place after a bridal sho...,"[-0.06079084426164627, -0.23591890931129456, 0..."
2,-4mDMBfT6N0d-VIUTKVaLg,Simply put: this place is an adorable hole on ...,"[-0.15170690417289734, -0.029171377420425415, ..."
3,-81BXpO5Fuk-RqCabS7LMw,What a great brewery! If you are looking to tr...,"[-0.2533146142959595, -0.404371052980423, 0.45..."
4,-AanHawaDlzWHQjrqRRWig,Meg's team runs so smoothly and I enjoy the se...,"[-0.371690571308136, -0.29266345500946045, -0...."
...,...,...,...
1568,zropQGh2fc7PMpzd32vneQ,I ordered the bulgogi tacos with some chips an...,"[-0.14535580575466156, -0.2953207194805145, 0...."
1569,zujdPV3HT-Y-CKE1GgkMHQ,It's a decent place to go for pastries. So far...,"[0.000845844391733408, -0.2679324448108673, 0...."
1570,zun6IVJa7wYe3wAPqWnPGw,I like this place better than Black Market Eat...,"[-0.07951334863901138, -0.3137638568878174, 0...."
1571,zvzmKaltuHKPeEcBkiUp1w,"I really wanted to like @WedgeCheeseShop , sin...","[0.11999098211526871, -0.08705776184797287, -0..."


In [5]:
# folder_path='./'
# categories_argumented = pd.read_csv("business_id_categories_with_argumented_results.csv")
# categories_argumented_text_with_feature_vectors_df = training_dataset_get_vector_or_embedding(
#     folder_path=folder_path,
#     argumented_results_data=categories_argumented,
#     method="bert-base-uncased", 
#     embedding_level="sentence_cls_embedding",
#     text_name="argumented_text_result")
# categories_argumented_text_with_feature_vectors_df

Feature vectors saved to 13percent_5reviews_num_threshold_GPT-3.5 Turbo_random_stars_research_data/categories_bert-base-uncased_sentence_cls_embedding.txt


Unnamed: 0,business_id,categories_feature_vector
0,EFci0tbSrb7wko6tpVDnbA,"[-0.07826090604066849, 0.0407031811773777, 0.2..."
1,EPMHuTHu46B123ivRIZ-Xg,"[0.15846343338489532, -0.04316771775484085, -0..."
2,J-ciDDEdIHMcChGIyKZnOg,"[-0.05295887589454651, -0.06621156632900238, 0..."
3,teFjQxUqT8c-yxQdoILDVQ,"[-0.3904491662979126, 0.06892769038677216, 0.4..."
4,IzyQVIJG8JAnOiRQPb0-wg,"[-0.05826997384428978, -0.24565061926841736, 0..."
...,...,...
11191,jCKjT0w6BnPxNSZO9Q2uuw,"[-0.1557040810585022, -0.23840990662574768, 0...."
11192,JDv3h1xRFIW8fXckqgTdRg,"[0.1256754845380783, -0.181650310754776, 0.068..."
11193,QGYzYUMsQe6k7__LD91E5w,"[-0.07354993373155594, 0.15001621842384338, 0...."
11194,dRKztV_Vtl7AvZg052SgRQ,"[-0.3296162486076355, -0.041647765785455704, 0..."


In [10]:
# merge the dataframe above to training_data, validation_data and test_data

# import os

# folder_path = "../original"

# train_data = pd.read_csv(os.path.join(folder_path, "research_training_set.csv"))
# valid_data = pd.read_csv(os.path.join(folder_path, "research_validation_set.csv"))
# test_data = pd.read_csv(os.path.join(folder_path, "research_test_set.csv"))

# folder_path = "./"
# train_data = pd.merge(train_data, user_reviews_argumented_text_with_feature_vectors_df, on='user_id')
# train_data = pd.merge(train_data, business_reviews_argumented_text_with_feature_vectors_df, on='business_id')
# train_data = pd.merge(train_data, categories_argumented_text_with_feature_vectors_df, on='business_id')
# print("Train data shape:", train_data.shape)
# # save as csv
# train_data.to_csv(os.path.join(folder_path, "research_training_set_with_argumented_text_and_feature_vectors.csv"), index=False)

# valid_data = pd.merge(valid_data, user_reviews_argumented_text_with_feature_vectors_df, on='user_id')
# valid_data = pd.merge(valid_data, business_reviews_argumented_text_with_feature_vectors_df, on='business_id')
# valid_data = pd.merge(valid_data, categories_argumented_text_with_feature_vectors_df, on='business_id')
# print("Valid data shape:", valid_data.shape)
# # save as csv
# valid_data.to_csv(os.path.join(folder_path, "research_validation_set_with_argumented_text_and_feature_vectors.csv"), index=False)

# test_data = pd.merge(test_data, user_reviews_argumented_text_with_feature_vectors_df, on='user_id')
# test_data = pd.merge(test_data, business_reviews_argumented_text_with_feature_vectors_df, on='business_id')
# test_data = pd.merge(test_data, categories_argumented_text_with_feature_vectors_df, on='business_id')
# print("Test data shape:", test_data.shape)
# # save as csv
# test_data.to_csv(os.path.join(folder_path, "research_test_set_with_argumented_text_and_feature_vectors.csv"), index=False)

Train data shape: (11196, 15)
Valid data shape: (1400, 15)
Test data shape: (1400, 15)
