In [None]:
# from pathlib import Path
# import nbformat

# def load_notebook(notebook_path):
#     with open(notebook_path, 'r', encoding='utf-8') as f:
#         nb = nbformat.read(f, as_version=4)
#     code_cells = [cell.source for cell in nb.cells if cell.cell_type == 'code']
#     exec('\n'.join(code_cells), globals())

## Import the functions and variables defined in data_preprocessing_utils.ipynb

# parent_directory = Path('../../../../')
## parent_directory = Path('../../')
# data_preprocessing_utils_path = parent_directory / 'data_preprocessing_utils.ipynb'

# load_notebook(data_preprocessing_utils_path)

# Dataset sampling, filtering, price calculation

In [None]:
import pandas as pd

def check_attributes_and_filter_out_restaurants(yelp_academic_dataset_business_path, business_type='restaurant', chunk_size=10000):
    """Stream a JSON-lines business file and keep rows whose categories hit a food keyword.

    The file is read with ``pd.read_json(..., lines=True, chunksize=chunk_size)``.
    For every chunk, rows with a missing ``categories`` value are dropped, the
    category string is lower-cased and split into word tokens, and a row is kept
    when at least one token is exactly equal to one of the keywords below.

    NOTE(review): matching is exact and per single-word token, so multi-word
    keywords (e.g. 'fried chicken', 'hot dog') and inflected forms (e.g. the
    token 'restaurants' vs. the keyword 'restaurant') never match. This looks
    unintended, but it is kept as-is so existing results do not change.

    Args:
        yelp_academic_dataset_business_path: Path of the JSON-lines business
            file; handed straight to ``pd.read_json``.
        business_type: Only ``'restaurant'`` is implemented. Any other value
            returns an empty DataFrame without reading the file.
        chunk_size: Number of lines read per chunk (default 10000).

    Returns:
        A DataFrame (index reset to 0..n-1) with the matching rows. The
        ``categories`` column is lower-cased and a ``category_words`` column
        holding the list of word tokens is added. An empty DataFrame is
        returned when nothing was collected.
    """
    import re

    if business_type != 'restaurant':
        # No keyword list exists for other business types yet.
        return pd.DataFrame()

    # Keywords related to restaurants. A frozenset gives O(1) membership tests
    # and silently collapses the duplicates present in the list.
    restaurant_keywords = frozenset([
        'restaurant', 'food', 'drink', 'cafe', 'dining', 'eatery', 'bistro', 'diner', 'tavern',
        'brasserie', 'trattoria', 'eatery', 'pizzeria', 'sushi', 'grill', 'deli', 'bakery',
        'pub', 'steakhouse', 'barbecue', 'buffet', 'noodle', 'rice', 'pasta', 'spaghetti', 'pizza',
        'burger', 'sandwich', 'taco', 'burrito', 'sushi', 'sashimi', 'ramen', 'pho', 'curry',
        'fried chicken', 'schnitzel', 'kebab', 'shawarma', 'pad thai', 'fried rice', 'poke bowl',
        'soup', 'salad', 'sushi roll', 'spring roll', 'gyro', 'hot dog', 'pancake', 'waffle',
        'crepe', 'bagel', 'toast', 'omelette', 'scrambled eggs', 'frittata', 'quiche', 'pita',
        'wrap', 'sandwich', 'sub', 'sourdough', 'muffin', 'croissant', 'biscuit', 'hash browns',
        'fries', 'onion rings', 'samosa', 'dim sum', 'bao', 'empanada', 'pierogi', 'lasagna',
        'casserole', 'risotto', 'gnocchi', 'meatball', 'stew', 'chili', 'paella', 'souvlaki',
        'tandoori', 'tempura', 'schnitzel', 'satay', 'katsu', 'bibimbap', 'tostada', 'enchilada',
        'tortilla', 'quesadilla', 'poutine', 'syrup', 'gravy', 'curry', 'hummus', 'tzatziki',
        'guacamole', 'salsa', 'pesto', 'tapenade', 'sauce', 'condiment'
    ])

    # Compile once instead of re-parsing the pattern for every row.
    word_pattern = re.compile(r'\b\w+\b')

    # Collect the per-chunk matches and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
    matched_chunks = []

    for chunk in pd.read_json(yelp_academic_dataset_business_path, lines=True, encoding='utf-8-sig', chunksize=chunk_size):
        # Drop rows without categories first so the string operations below
        # only ever see real strings.
        chunk = chunk.dropna(subset=['categories'])
        if chunk.empty:
            continue

        categories_lower = chunk['categories'].str.lower()
        category_words = categories_lower.apply(lambda text: word_pattern.findall(text))

        # assign() builds a new frame: 'categories' is replaced in place and
        # 'category_words' is appended as the last column.
        chunk = chunk.assign(categories=categories_lower, category_words=category_words)

        has_keyword = category_words.apply(lambda words: not restaurant_keywords.isdisjoint(words))
        matched_chunks.append(chunk[has_keyword])

    if not matched_chunks:
        return pd.DataFrame()

    return pd.concat(matched_chunks, ignore_index=True)

In [None]:
def filter_reviews_for_certain_business_type(yelp_academic_dataset_review_path, businesses, chunk_size=10000):
    """Stream a JSON-lines review file and keep reviews of the given businesses.

    Args:
        yelp_academic_dataset_review_path: Path of the JSON-lines review file;
            handed straight to ``pd.read_json``.
        businesses: DataFrame with a ``business_id`` column listing the
            businesses whose reviews should be kept.
        chunk_size: Number of lines read per chunk (default 10000).

    Returns:
        A DataFrame (index reset to 0..n-1) containing every review whose
        ``business_id`` appears in ``businesses['business_id']``. An empty
        DataFrame is returned when the reader yields no chunks.
    """
    # The set of wanted ids does not change between chunks, so compute it once.
    wanted_business_ids = businesses['business_id'].unique()

    # Collect the per-chunk matches and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
    kept_chunks = []

    for chunk in pd.read_json(yelp_academic_dataset_review_path, lines=True, encoding='utf-8-sig', chunksize=chunk_size):
        kept_chunks.append(chunk[chunk['business_id'].isin(wanted_business_ids)])

    if not kept_chunks:
        return pd.DataFrame()

    return pd.concat(kept_chunks, ignore_index=True)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd

# Download the nltk punkt tokenizer
# nltk.download('punkt') # already downloaded once

# Function to calculate token count for a text using nltk
def calculate_token_count_nltk(text):
    """Return the number of tokens that NLTK's ``word_tokenize`` yields for ``text``."""
    return len(word_tokenize(text))

def sample_reviews_and_calculate_price_then_return_data(data, percent, 
                                                        user_reviews_num=5,
                                                        user_comparison="min",
                                                        user_reviews_num_range=None,
                                                        business_reviews_num=5,
                                                        business_comparison="min",
                                                        business_reviews_num_range=None,
                                                        genai="GPT-3.5 Turbo", 
                                                        sampling_method='random', column='None'):
    
    print("Percent:", percent, "%")
    print("User threshold reviews num:", user_reviews_num)
    print("User reviews num comparison method:", user_comparison)
    print("User threshold reviews num range:", user_reviews_num_range)
    print("Business threshold reviews num:", business_reviews_num)
    print("Business reviews num comparison method:", business_comparison)
    print("Business threshold reviews num range:", business_reviews_num_range)
    print("Generative AI model:", genai)
    print("Sampling_method:", sampling_method)
    print("Column:", column)
    print("\n")
    
    sampled_percent_reviews = pd.DataFrame()
    
    if sampling_method=='random':
        
        sampled_percent_reviews = data.sample(frac=(percent * 0.01), random_state=42)
    
    elif sampling_method=='stratified' and column!='None':
        
        # Calculate the occurrence count of the column
        column_value_counts = data[column].value_counts()
        # Calculate the total number data
        total_num = len(data)
        # Calculate the ratio of each star rating
        column_value_ratios = column_value_counts / total_num
        
        for value, ratio in column_value_ratios.items():
            sampled_subset = data[data[column] == value].sample(frac=(percent * 0.01), random_state=42)
            sampled_percent_reviews = pd.concat([sampled_percent_reviews, sampled_subset], ignore_index=True)
        
    else:
        print("Please specify sample type and column!")

    # Print the sampled results
    sampled_percent_reviews_num = len(sampled_percent_reviews)
#     print(f"Number of {percent}% reviews:", sampled_percent_reviews_num)
    
    # Select reviews from the sampled dataset based on the filtering conditions
    filtered_reviews = sampled_percent_reviews
    
    if user_reviews_num_range!=None and (not isinstance(user_reviews_num_range, tuple) and not isinstance(user_reviews_num_range, list)):
        print("Please make sure the range of user reviews is tuple or list data type !")
        return 0
    if business_reviews_num_range!=None and (not isinstance(business_reviews_num_range, tuple) and not isinstance(business_reviews_num_range, list)):
        print("Please make sure the range of business reviews is tuple or list data type !")
        return 0
    if user_reviews_num_range!=None:
        if len(user_reviews_num_range)!=2:
            print("Please make sure the user range contains only min and max number !")
            return 0
        if user_reviews_num_range[1] <= user_reviews_num_range[0]:
            print("Within the user range, please make sure the right number(max) is larger than left number(min) !")
            return 0
    if business_reviews_num_range!=None:
        if len(business_reviews_num_range)!=2:
            print("Please make sure the business range contains only min and max number !")
            return 0
        if business_reviews_num_range[1] <= business_reviews_num_range[0]:
            print("Within the business range, please make sure the right number(max) is larger than left number(min) !")
            return 0
    
    if user_reviews_num_range!=None and business_reviews_num_range!=None:
        user_min = user_reviews_num_range[0]
        user_max = user_reviews_num_range[1]
        business_min = business_reviews_num_range[0]
        business_max = business_reviews_num_range[1]
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[(user_review_count < user_min) | (user_review_count > user_max)].index.values
            excluded_businesses = business_review_count[(business_review_count < business_min) | (business_review_count > business_max)].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[(user_review_count >= user_min) & (user_review_count <= user_max)].index.values
                selected_businesses = business_review_count[(business_review_count >= business_min) & (business_review_count <= business_max)].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_reviews_num_range!=None and business_reviews_num_range==None and business_comparison=="equal":
        user_min = user_reviews_num_range[0]
        user_max = user_reviews_num_range[1]
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[(user_review_count < user_min) | (user_review_count > user_max)].index.values
            excluded_businesses = business_review_count[business_review_count != business_reviews_num].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[(user_review_count >= user_min) & (user_review_count <= user_max)].index.values
                selected_businesses = business_review_count[business_review_count == business_reviews_num].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_reviews_num_range!=None and business_reviews_num_range==None and business_comparison=="min":
        user_min = user_reviews_num_range[0]
        user_max = user_reviews_num_range[1]
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[(user_review_count < user_min) | (user_review_count > user_max)].index.values
            excluded_businesses = business_review_count[business_review_count < business_reviews_num].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[(user_review_count >= user_min) & (user_review_count <= user_max)].index.values
                selected_businesses = business_review_count[business_review_count >= business_reviews_num].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_reviews_num_range!=None and business_reviews_num_range==None and business_comparison=="max":
        user_min = user_reviews_num_range[0]
        user_max = user_reviews_num_range[1]
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[(user_review_count < user_min) | (user_review_count > user_max)].index.values
            excluded_businesses = business_review_count[business_review_count > business_reviews_num].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[(user_review_count >= user_min) & (user_review_count <= user_max)].index.values
                selected_businesses = business_review_count[business_review_count <= business_reviews_num].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_reviews_num_range==None and user_comparison=="equal" and business_reviews_num_range!=None:
        business_min = business_reviews_num_range[0]
        business_max = business_reviews_num_range[1]
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[user_review_count != user_reviews_num].index.values
            excluded_businesses = business_review_count[(business_review_count < business_min) | (business_review_count > business_max)].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[user_review_count == user_reviews_num].index.values
                selected_businesses = business_review_count[(business_review_count >= business_min) & (business_review_count <= business_max)].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_reviews_num_range==None and user_comparison=="min" and business_reviews_num_range!=None:
        business_min = business_reviews_num_range[0]
        business_max = business_reviews_num_range[1]
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[user_review_count < user_reviews_num].index.values
            excluded_businesses = business_review_count[(business_review_count < business_min) | (business_review_count > business_max)].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[user_review_count >= user_reviews_num].index.values
                selected_businesses = business_review_count[(business_review_count >= business_min) & (business_review_count <= business_max)].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_reviews_num_range==None and user_comparison=="max" and business_reviews_num_range!=None:
        business_min = business_reviews_num_range[0]
        business_max = business_reviews_num_range[1]
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[user_review_count > user_reviews_num].index.values
            excluded_businesses = business_review_count[(business_review_count < business_min) | (business_review_count > business_max)].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[user_review_count <= user_reviews_num].index.values
                selected_businesses = business_review_count[(business_review_count >= business_min) & (business_review_count <= business_max)].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))

    elif user_comparison=="equal" and business_comparison=="equal" and user_reviews_num_range==None and business_reviews_num_range==None:
        # if compare method is equal
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[user_review_count != user_reviews_num].index.values
            excluded_businesses = business_review_count[business_review_count != business_reviews_num].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[user_review_count == user_reviews_num].index.values
                selected_businesses = business_review_count[business_review_count == business_reviews_num].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_comparison=="equal" and business_comparison=="min" and user_reviews_num_range==None and business_reviews_num_range==None:
        # if compare method is minimum
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[user_review_count != user_reviews_num].index.values
            excluded_businesses = business_review_count[business_review_count < business_reviews_num].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[user_review_count == user_reviews_num].index.values
                selected_businesses = business_review_count[business_review_count >= business_reviews_num].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_comparison=="equal" and business_comparison=="max" and user_reviews_num_range==None and business_reviews_num_range==None:
        # if compare method is maximum
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[user_review_count != user_reviews_num].index.values
            excluded_businesses = business_review_count[business_review_count > business_reviews_num].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[user_review_count == user_reviews_num].index.values
                selected_businesses = business_review_count[business_review_count <= business_reviews_num].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_comparison=="min" and business_comparison=="equal" and user_reviews_num_range==None and business_reviews_num_range==None:
        # if compare method is minimum
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[user_review_count < user_reviews_num].index.values
            excluded_businesses = business_review_count[business_review_count != business_reviews_num].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[user_review_count >= user_reviews_num].index.values
                selected_businesses = business_review_count[business_review_count == business_reviews_num].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_comparison=="min" and business_comparison=="min" and user_reviews_num_range==None and business_reviews_num_range==None:
        # if compare method is maximum
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[user_review_count < user_reviews_num].index.values
            excluded_businesses = business_review_count[business_review_count < business_reviews_num].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[user_review_count >= user_reviews_num].index.values
                selected_businesses = business_review_count[business_review_count >= business_reviews_num].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_comparison=="min" and business_comparison=="max" and user_reviews_num_range==None and business_reviews_num_range==None:
        # if compare method is minimum
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[user_review_count < user_reviews_num].index.values
            excluded_businesses = business_review_count[business_review_count > business_reviews_num].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[user_review_count >= user_reviews_num].index.values
                selected_businesses = business_review_count[business_review_count <= business_reviews_num].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_comparison=="max" and business_comparison=="equal" and user_reviews_num_range==None and business_reviews_num_range==None:
        # if compare method is maximum
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[user_review_count > user_reviews_num].index.values
            excluded_businesses = business_review_count[business_review_count != business_reviews_num].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[user_review_count <= user_reviews_num].index.values
                selected_businesses = business_review_count[business_review_count == business_reviews_num].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_comparison=="max" and business_comparison=="min" and user_reviews_num_range==None and business_reviews_num_range==None:
        # if compare method is minimum
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[user_review_count > user_reviews_num].index.values
            excluded_businesses = business_review_count[business_review_count < business_reviews_num].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[user_review_count <= user_reviews_num].index.values
                selected_businesses = business_review_count[business_review_count >= business_reviews_num].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    elif user_comparison=="max" and business_comparison=="max" and user_reviews_num_range==None and business_reviews_num_range==None:
        # if compare method is maximum
        while True:
            user_review_count = filtered_reviews['user_id'].value_counts()
            business_review_count = filtered_reviews['business_id'].value_counts()
            excluded_users = user_review_count[user_review_count > user_reviews_num].index.values
            excluded_businesses = business_review_count[business_review_count > business_reviews_num].index.values
            if (len(excluded_users) == 0) and (len(excluded_businesses) == 0):
                break
            else:
                selected_users = user_review_count[user_review_count <= user_reviews_num].index.values
                selected_businesses = business_review_count[business_review_count <= business_reviews_num].index.values
                filtered_reviews = filtered_reviews[(filtered_reviews['user_id'].isin(selected_users)) & (filtered_reviews['business_id'].isin(selected_businesses))]
                # print(len(filtered_reviews))
    else:
        print("Please select the comparison method of reviews num threshold for user and business within equal, min, max !")

    # print(n)
    # print(filtered_reviews)
    filtered_reviews_num = len(filtered_reviews)
#     print("Number of filtered_reviews:", filtered_reviews_num)
    
    filtered_users_count = filtered_reviews['user_id'].nunique()
    filtered_businesses_count = filtered_reviews['business_id'].nunique()

#     print('Number of unique users in filtered reviews:', filtered_users_count)
#     print('Number of unique businesses in filtered reviews:', filtered_businesses_count)
#     print('\n')

    # Calculate min, average, max reviews per user
    min_reviews_per_user = filtered_reviews.groupby('user_id')['review_id'].count().min()
    mean_reviews_per_user = filtered_reviews.groupby('user_id')['review_id'].count().mean()
    max_reviews_per_user = filtered_reviews.groupby('user_id')['review_id'].count().max()

    # Calculate min, average, max reviews per business
    min_reviews_per_business = filtered_reviews.groupby('business_id')['review_id'].count().min()
    mean_reviews_per_business = filtered_reviews.groupby('business_id')['review_id'].count().mean()
    max_reviews_per_business = filtered_reviews.groupby('business_id')['review_id'].count().max()

    # Print the results
#     print('Min reviews per user:', min_reviews_per_user)
#     print('Mean reviews per user:', mean_reviews_per_user)
#     print('Max reviews per user:', max_reviews_per_user)
#     print('\n')
#     print('Min reviews per business:', min_reviews_per_business)
#     print('Mean reviews per business:', mean_reviews_per_business)
#     print('Max reviews per business:', max_reviews_per_business)
#     print('\n')
    
#     GPT-3.5 Turbo: The price for one English token input is 0.000015 TWD, and one English token output is 0.000045 TWD.
#     GPT-4 Turbo (20 times the price): The price for one English token input is 0.0003 TWD, and one English token output is 0.0009 TWD.
#     GPT-4 Turbo Image: The price for a 1080 x 1080 image is 0.2295 TWD.
#     GPT-4 (40 times the price): The price for one English token input is 0.0006 TWD, and one English token output is 0.0018 TWD.
#     GPT-4 32k (80 times the price): The price for one English token input is 0.0012 TWD, and one English token output is 0.0036 TWD.
    
    model_NT_price_per_token_output = 0.000045
    model_NT_price_per_token_input = (model_NT_price_per_token_output / 3)
    model_NT_price_per_token_input_and_output = ((model_NT_price_per_token_input + model_NT_price_per_token_output) / 2)
    model_NT_price_per_picture = 0.2295
    
    if genai=="GPT-3.5 Turbo":
        model_NT_price_per_token_output = 0.000045
        model_NT_price_per_token_input = (model_NT_price_per_token_output / 3)
        model_NT_price_per_token_input_and_output = ((model_NT_price_per_token_input + model_NT_price_per_token_output) / 2)
  
    elif genai=="GPT-4 Turbo":
        model_NT_price_per_token_output = 0.0009
        model_NT_price_per_token_input = (model_NT_price_per_token_output / 3)
        model_NT_price_per_token_input_and_output = ((model_NT_price_per_token_input + model_NT_price_per_token_output) / 2)
  
    elif genai=="GPT-4 Turbo Image":
        model_NT_price_per_picture = 0.2295
        
    elif genai=="GPT-4":
        model_NT_price_per_token_output = 0.0018
        model_NT_price_per_token_input = (model_NT_price_per_token_output / 2)
        model_NT_price_per_token_input_and_output = ((model_NT_price_per_token_input + model_NT_price_per_token_output) / 2)
  
    elif genai=="GPT-4 32k":
        model_NT_price_per_token_output = 0.0036
        model_NT_price_per_token_input = (model_NT_price_per_token_output / 2)
        model_NT_price_per_token_input_and_output = ((model_NT_price_per_token_input + model_NT_price_per_token_output) / 2)
  
    else:
        print("Please specify Generative AI type or correct type!")

    # Apply the function to calculate token count for each review
    filtered_reviews['token_count_nltk'] = filtered_reviews['text'].apply(calculate_token_count_nltk)

    # Calculate average token count per user and per business
    total_token_count_per_user = filtered_reviews.groupby('user_id')['token_count_nltk'].sum()
    avg_token_count_per_user = total_token_count_per_user.mean()
    total_token_count_per_business = filtered_reviews.groupby('business_id')['token_count_nltk'].sum()
    avg_token_count_per_business = total_token_count_per_business.mean()

    # Print the results
#     print('Average Token Count per User (nltk):', avg_token_count_per_user)
#     print('Average Token Count per Business (nltk):', avg_token_count_per_business)
#     print('\n')

    # Calculate price
    
    price_per_user_by_output = 2 * avg_token_count_per_user * model_NT_price_per_token_output
    price_per_user_by_input_and_output = 2 * avg_token_count_per_user * model_NT_price_per_token_input_and_output
    price_for_all_user_by_output = filtered_users_count * price_per_user_by_output
    price_for_all_user_by_input_and_output = filtered_users_count* price_per_user_by_input_and_output
    
#     print(f"{genai} Price per user by output:", price_per_user_by_output)
#     print(f"{genai} Price per user by input and output:", price_per_user_by_input_and_output)
#     print(f"{genai} Price for all user by output:", price_for_all_user_by_output)
#     print(f"{genai} Price for all user by input and output:", price_for_all_user_by_input_and_output)
#     print('\n')
    
    price_per_business_by_output = 2 * avg_token_count_per_business * model_NT_price_per_token_output
    price_per_business_by_input_and_output = 2 * avg_token_count_per_business * model_NT_price_per_token_input_and_output
    price_for_all_business_by_output = filtered_businesses_count * price_per_business_by_output
    price_for_all_business_by_input_and_output = filtered_businesses_count * price_per_business_by_input_and_output
    
#     print(f"{genai} Price per business by output:", price_per_business_by_output)
#     print(f"{genai} Price per business by input and output:", price_per_business_by_input_and_output)
#     print(f"{genai} Price for all business by output:", price_for_all_business_by_output)
#     print(f"{genai} Price for all business by input and output:", price_for_all_business_by_input_and_output)

#     print("\n")
    print("-----------------------------------------------------------")
    print("\n")
    
    calculation_results = {}
    calculation_results["percent"] = percent
#     calculation_results["reviews_num_threshold"] = reviews_num_threshold
    calculation_results["user_reviews_num_threshold"] = user_reviews_num
    calculation_results["user_comparison_method"] = user_comparison
    calculation_results["user_reviews_num_range"] = user_reviews_num_range
    calculation_results["business_reviews_num_threshold"] = business_reviews_num
    calculation_results["business_comparison_method"] = business_comparison
    calculation_results["business_reviews_num_range"] = business_reviews_num_range
    calculation_results["sampling_method"] = sampling_method
    calculation_results["column"] = column
    calculation_results["sampled_percent_reviews_num"] = sampled_percent_reviews_num
    calculation_results["filtered_reviews_num"] = filtered_reviews_num
    calculation_results["filtered_users_count"] = filtered_users_count
    calculation_results["filtered_businesses_count"] = filtered_businesses_count
    calculation_results["min_reviews_per_user"] = min_reviews_per_user
    calculation_results["mean_reviews_per_user"] = mean_reviews_per_user
    calculation_results["max_reviews_per_user"] = max_reviews_per_user
    calculation_results["min_reviews_per_business"] = min_reviews_per_business
    calculation_results["mean_reviews_per_business"] = mean_reviews_per_business
    calculation_results["max_reviews_per_business"] = max_reviews_per_business
    calculation_results["genai"] = genai
    calculation_results["model_NT_price_per_token_input"] = model_NT_price_per_token_input
    calculation_results["model_NT_price_per_token_output"] = model_NT_price_per_token_output
    calculation_results["model_NT_price_per_token_input_and_output"] = model_NT_price_per_token_input_and_output
    calculation_results["model_NT_price_per_picture"] = model_NT_price_per_picture
    calculation_results["avg_token_count_per_user"] = avg_token_count_per_user
    calculation_results["avg_token_count_per_business"] = avg_token_count_per_business
    calculation_results["price_per_user_by_output"] = price_per_user_by_output
    calculation_results["price_per_user_by_input_and_output"] = price_per_user_by_input_and_output
    calculation_results["price_for_all_user_by_output"] = price_for_all_user_by_output
    calculation_results["price_for_all_user_by_input_and_output"] = price_for_all_user_by_input_and_output
    calculation_results["price_per_business_by_output"] = price_per_business_by_output
    calculation_results["price_per_business_by_input_and_output"] = price_per_business_by_input_and_output
    calculation_results["price_for_all_business_by_output"] = price_for_all_business_by_output
    calculation_results["price_for_all_business_by_input_and_output"] = price_for_all_business_by_input_and_output
    
    # return the filtered reviews and calculation results
    return filtered_reviews, calculation_results
#     return calculation_results

In [None]:
def combination_calculation_results_to_df(all_combination_results):
    """Collect per-combination result dictionaries into one DataFrame.

    Parameters
    ----------
    all_combination_results : iterable of dict
        One dictionary of calculation results per parameter combination.

    Returns
    -------
    pandas.DataFrame
        One row per dictionary, columns in order of first appearance of each
        key, with a fresh 0..n-1 index. An empty input yields an empty
        DataFrame instead of raising ``IndexError``.
    """
    import pandas as pd

    # Build the frame once: concatenating inside a loop copies the whole frame
    # on every iteration, and concatenating onto an empty frame is deprecated
    # in recent pandas versions.
    return pd.DataFrame(list(all_combination_results))

In [None]:
def filter_combination_calculation_results(combination_calculation_results_df,
                               user_reviews_num = 2, user_comparison = "equal",
                               user_reviews_num_range = None,
                               business_reviews_num = 2, business_comparison = "equal",
                               business_reviews_num_range = None,
                               price_limit = 300, reviews_num = 5000, user_num = 500, business_num = 500):
    """Select the combination results that match a filter setting and a budget.

    A row is kept when
      * its business price by output OR by input-and-output is <= ``price_limit``,
      * its user/business threshold, comparison method and range equal the
        requested values, and
      * it has at least ``reviews_num`` reviews, ``user_num`` users and
        ``business_num`` businesses.

    Parameters
    ----------
    combination_calculation_results_df : pandas.DataFrame
        Frame holding the columns referenced below.
    user_reviews_num_range, business_reviews_num_range : None, list, tuple or scalar
        ``None`` matches rows whose range cell is missing; a list/tuple matches
        rows whose cell holds the same items in the same order.

    Returns
    -------
    pandas.DataFrame
        The matching rows (original index preserved).
    """
    results = combination_calculation_results_df

    def _range_matches(column_name, wanted_range):
        column = results[column_name]
        if wanted_range is None:
            # ``column == None`` is False for every row in pandas, so missing
            # ranges have to be matched with isna().
            return column.isna()
        if isinstance(wanted_range, (list, tuple)):
            # Compare cell by cell; ``column == (a, b)`` would be interpreted
            # as an element-wise comparison against an array.
            wanted_items = list(wanted_range)
            return column.map(
                lambda cell: isinstance(cell, (list, tuple)) and list(cell) == wanted_items
            ).astype(bool)
        return column == wanted_range

    within_budget = (
        (results["price_for_all_business_by_output"] <= price_limit)
        | (results["price_for_all_business_by_input_and_output"] <= price_limit)
    )

    keep = (
        within_budget
        & (results["user_reviews_num_threshold"] == user_reviews_num)
        & (results["user_comparison_method"] == user_comparison)
        & _range_matches("user_reviews_num_range", user_reviews_num_range)
        & (results["business_reviews_num_threshold"] == business_reviews_num)
        & (results["business_comparison_method"] == business_comparison)
        & _range_matches("business_reviews_num_range", business_reviews_num_range)
        & (results["filtered_reviews_num"] >= reviews_num)
        & (results["filtered_users_count"] >= user_num)
        & (results["filtered_businesses_count"] >= business_num)
    )

    combination_filtered_df = results[keep]

    return combination_filtered_df

# Data split

In [None]:
def dataset_split(df, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, stratify=(False, "stars")):
    """Split ``df`` into training, validation and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    train_ratio, valid_ratio, test_ratio : float
        Non-negative fractions that must sum to 1 (within floating-point
        tolerance).
    stratify : tuple
        ``(enabled, column_name)``; when ``enabled`` is truthy both splits are
        stratified on ``column_name``.

    Returns
    -------
    tuple of pandas.DataFrame or None
        ``(train_data, valid_data, test_data)`` with fresh 0..n-1 indexes, or
        ``None`` (after printing a message) when the ratios are invalid.
    """
    import math

    ratios = (train_ratio, valid_ratio, test_ratio)
    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 == 0.9999999999999999 in
    # binary floating point, which an exact ``== 1`` check would reject.
    ratios_are_valid = all(ratio >= 0 for ratio in ratios) and math.isclose(sum(ratios), 1.0, abs_tol=1e-9)
    if not ratios_are_valid:
        print("Please make sure each ratio is larger than 0 and sum of the three ratio is 1 !")
        return None

    # Imported after validation so invalid ratios are reported before
    # scikit-learn is needed.
    from sklearn.model_selection import train_test_split

    holdout_ratio = valid_ratio + test_ratio
    # Share of the hold-out part that becomes the test set.
    test_share_of_holdout = test_ratio / holdout_ratio

    # Split the dataset
    if stratify[0]:
        # Split the dataset using stratified sampling
        stratify_column = stratify[1]
        train_data, remaining = train_test_split(
            df, test_size=holdout_ratio, stratify=df[stratify_column], random_state=42)
        valid_data, test_data = train_test_split(
            remaining, test_size=test_share_of_holdout, stratify=remaining[stratify_column], random_state=42)
    else:
        train_data, remaining = train_test_split(df, test_size=holdout_ratio, random_state=42)
        valid_data, test_data = train_test_split(remaining, test_size=test_share_of_holdout, random_state=42)

    # Reset index (on new objects, so the split results are not mutated in place)
    train_data = train_data.reset_index(drop=True)
    valid_data = valid_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)

    print(f"Original dataset size: {df.shape}, ratio is {len(df)/len(df)}", )
    print(f"Training dataset size: {train_data.shape}, ratio is {len(train_data)/len(df)}")
    print(f"Validation dataset size: {valid_data.shape}, ratio is {len(valid_data)/len(df)}")
    print(f"Test dataset size: {test_data.shape}, ratio is {len(test_data)/len(df)}")

    return train_data, valid_data, test_data

In [None]:
def kfold_cross_validation(data_df, train_ratio=0.9, test_ratio=0.1, fold_num=10):
    """Hold out a test set, then write K-fold splits of the training set to CSV.

    Files written (relative to the current working directory):
      * ``research_training_set.csv`` / ``research_test_set.csv``
      * ``folds/fold_<k>/train_set_fold_<k>.csv`` and
        ``folds/fold_<k>/test_set_fold_<k>.csv`` for k = 1..``fold_num``

    Parameters
    ----------
    data_df : pandas.DataFrame
        Full dataset.
    train_ratio : float
        Not used by the split (only ``test_ratio`` is passed on); kept so
        existing calls keep working.
    test_ratio : float
        Fraction of ``data_df`` held out as the test set.
    fold_num : int
        Number of folds.

    Returns
    -------
    None
    """
    import os
    from sklearn.model_selection import KFold
    from sklearn.model_selection import train_test_split

    train_data, test_data = train_test_split(data_df, test_size=test_ratio, random_state=42)
    train_data.to_csv("research_training_set.csv", index=False)
    test_data.to_csv("research_test_set.csv", index=False)
    print("Data has successfully split into training set and test set and saved to csv files !")

    # Initialize the KFold cross-validator
    kf = KFold(n_splits=fold_num, shuffle=True, random_state=42)

    # Create a folder named 'folds' if it doesn't exist
    os.makedirs('folds', exist_ok=True)

    # Fold the TRAINING data only: folding ``data_df`` would put the held-out
    # test rows into the folds and leak them into cross-validation.
    for fold_number, (train_index, test_index) in enumerate(kf.split(train_data), start=1):
        # Create a folder for each fold
        fold_dir = f'folds/fold_{fold_number}'
        os.makedirs(fold_dir, exist_ok=True)

        train_set = train_data.iloc[train_index]
        test_set = train_data.iloc[test_index]

        # Save training and testing sets to CSV files in the fold's folder
        train_set.to_csv(f'{fold_dir}/train_set_fold_{fold_number}.csv', index=False)
        test_set.to_csv(f'{fold_dir}/test_set_fold_{fold_number}.csv', index=False)

    print(f"Training data has successfully split into {fold_num} folds and saved to csv files !")

# Formatting text data

In [None]:
import os
import pandas as pd
import numpy as np

def remove_backslash_and_newline(text):
    """Drop every backslash and newline from a string; return non-strings unchanged."""
    if not isinstance(text, str):
        return text
    # Deleting both characters in one translate pass.
    return text.translate(str.maketrans("", "", "\\\n"))

# wrap reviews into dictionary format
def formatting_content_method1(train_data):
    """Concatenate reviews per user and per business as dictionary-style strings.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must provide the columns ``user_id``, ``business_id``, ``text`` and
        ``business_categories``. The text is used as-is (no cleaning here).

    Returns
    -------
    (user_data, business_data) : tuple of pandas.DataFrame
        ``user_data`` has one row per user: ``user_id`` plus one
        ``str(dict)`` line per review (``review_<n>`` and
        ``business_categories``), joined with newlines.
        ``business_data`` has one row per business: ``business_id`` plus the
        ``str`` of a single dict holding ``review_<n>`` entries followed by
        ``business_categories`` (taken from the group's first row).
        The head of each frame is printed.
    """
    user_text_column = "user_concatenated_reviews_with_business_categories"
    business_text_column = "business_concatenated_reviews_with_business_categories"

    # Collect plain records and build each DataFrame once; enlarging a frame
    # row by row with .at re-allocates it on every insert.

    # Merge user reviews with corresponding business_categories
    user_records = []
    for user_id, group in train_data.groupby("user_id"):
        review_dicts = [
            {f"review_{number}": text, "business_categories": categories}
            for number, (text, categories)
            in enumerate(zip(group["text"], group["business_categories"]), start=1)
        ]
        user_records.append({
            "user_id": user_id,
            user_text_column: "\n".join(str(review_dict) for review_dict in review_dicts),
        })
    user_data = pd.DataFrame(user_records, columns=["user_id", user_text_column])

    print(user_data.head())

    # Merge business reviews and add business_categories
    business_records = []
    for business_id, group in train_data.groupby("business_id"):
        reviews_and_categories = {
            f"review_{number}": text for number, text in enumerate(group["text"], start=1)
        }
        # All rows of a group describe the same business, so the first row's
        # categories are used.
        reviews_and_categories["business_categories"] = group["business_categories"].iloc[0]
        business_records.append({
            "business_id": business_id,
            business_text_column: str(reviews_and_categories),
        })
    business_data = pd.DataFrame(business_records, columns=["business_id", business_text_column])

    print(business_data.head())

    return user_data, business_data

In [None]:
# Wrap reviews into a more conversational / article-like format

def formatting_content_method_v2(train_data):
    """Concatenate reviews per user and per business as plain labelled text.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must provide the columns ``user_id``, ``business_id``, ``text`` and
        ``business_categories``.

    Returns
    -------
    (user_data, business_data) : tuple of pandas.DataFrame
        ``user_data``: one row per user, each review rendered as
        ``review_<n>: <text>, restaurant_categories: <categories>`` and joined
        with newlines.
        ``business_data``: one row per business, a first line
        ``restaurant_categories: <categories>`` (from the group's first row)
        followed by one ``review_<n>: <text>`` line per review.
        The head of each frame is printed.
    """
    user_text_column = "user_concatenated_reviews_with_business_categories"
    business_text_column = "business_concatenated_reviews_with_business_categories"

    # Collect plain records and build each DataFrame once; enlarging a frame
    # row by row with .at re-allocates it on every insert.
    user_records = []
    for user_id, group in train_data.groupby("user_id"):
        review_lines = [
            f"review_{number}: {text}, restaurant_categories: {categories}"
            for number, (text, categories)
            in enumerate(zip(group["text"], group["business_categories"]), start=1)
        ]
        user_records.append({"user_id": user_id, user_text_column: "\n".join(review_lines)})
    user_data = pd.DataFrame(user_records, columns=["user_id", user_text_column])

    print(user_data.head())

    # Merge business reviews and add business_categories
    business_records = []
    for business_id, group in train_data.groupby("business_id"):
        review_lines = [
            f"review_{number}: {text}" for number, text in enumerate(group["text"], start=1)
        ]
        business_categories = group["business_categories"].iloc[0]
        business_records.append({
            "business_id": business_id,
            business_text_column: f"restaurant_categories: {business_categories}" + '\n' + "\n".join(review_lines),
        })
    business_data = pd.DataFrame(business_records, columns=["business_id", business_text_column])

    print(business_data.head())

    return user_data, business_data

In [None]:
# Wrap reviews into a more conversational / article-like format, and turn the categories into a sentence

def _restaurant_categories_sentence(business_categories):
    """Render a ', '-separated category string as 'This is a(n) <categories> restaurant.'"""
    if isinstance(business_categories, str):
        # Drop categories that already contain the word "restaurant".
        kept = [word for word in business_categories.split(", ") if "restaurant" not in word.lower()]
    else:
        # Missing categories (e.g. NaN) carry no description.
        kept = []
    description = ", ".join(kept)
    if not description:
        # Nothing left to describe; indexing description[0] would raise IndexError.
        return "This is a restaurant."
    begin = "This is an " if description[0].lower() in ["a", "e", "i", "o", "u"] else "This is a "
    return begin + description + " restaurant."


def formatting_content_method_v3(train_data):
    """Concatenate reviews per user and per business, with categories as a sentence.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must provide the columns ``user_id``, ``business_id``, ``text`` and
        ``business_categories`` (a ', '-separated string).

    Returns
    -------
    (user_data, business_data) : tuple of pandas.DataFrame
        ``user_data``: one row per user, each review rendered as
        ``review_<n>: <categories sentence> <text>`` and joined with newlines.
        ``business_data``: one row per business, the categories sentence
        (from the group's first row) followed by one ``review_<n>: <text>``
        line per review.
        The head of each frame is printed.
    """
    user_text_column = "user_concatenated_reviews_with_business_categories"
    business_text_column = "business_concatenated_reviews_with_business_categories"

    # Collect plain records and build each DataFrame once; enlarging a frame
    # row by row with .at re-allocates it on every insert.
    user_records = []
    for user_id, group in train_data.groupby("user_id"):
        review_lines = [
            f"review_{number}: {_restaurant_categories_sentence(categories)} {text}"
            for number, (text, categories)
            in enumerate(zip(group["text"], group["business_categories"]), start=1)
        ]
        user_records.append({"user_id": user_id, user_text_column: "\n".join(review_lines)})
    user_data = pd.DataFrame(user_records, columns=["user_id", user_text_column])

    print(user_data.head())

    # Merge business reviews and add business_categories
    business_records = []
    for business_id, group in train_data.groupby("business_id"):
        review_lines = [
            f"review_{number}: {text}" for number, text in enumerate(group["text"], start=1)
        ]
        categories_sentence = _restaurant_categories_sentence(group["business_categories"].iloc[0])
        business_records.append({
            "business_id": business_id,
            business_text_column: categories_sentence + '\n' + "\n".join(review_lines),
        })
    business_data = pd.DataFrame(business_records, columns=["business_id", business_text_column])

    print(business_data.head())

    return user_data, business_data

# Text to vector

Original text to BERT vector or embedding

In [None]:
import pandas as pd
import os
import ast
import torch
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
import warnings

warnings.filterwarnings("ignore", message='''Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.''')

def check_users_and_business_from_valid_and_test_is_in_training_set(train_data, valid_data, test_data):
    """Print whether every user/business of the validation and test sets occurs in the training set.

    All three frames must provide ``user_id`` and ``business_id`` columns.
    Nothing is returned; the four answers are printed.
    """

    def _fully_covered(subset, column):
        # True when every value of subset[column] also appears in train_data[column].
        return subset[column].isin(train_data[column]).all()

    # Evaluate all four checks first, then report them.
    answers = [
        ('Are all users in the validation set also in the training set?', _fully_covered(valid_data, 'user_id')),
        ('Are all businesses in the validation set also in the training set?', _fully_covered(valid_data, 'business_id')),
        ('Are all users in the test set also in the training set?', _fully_covered(test_data, 'user_id')),
        ('Are all businesses in the test set also in the training set?', _fully_covered(test_data, 'business_id')),
    ]

    for question, answer in answers:
        print(question, answer)


# Loaded (tokenizer, model) pairs keyed by checkpoint name, so that repeated
# calls (one per row in the callers) do not reload the checkpoint every time.
_BERT_TOKENIZER_AND_MODEL_CACHE = {}


def get_bert_sentence_cls_embedding(text, method):
    """Return the embedding at the first token position of the last hidden state.

    Parameters
    ----------
    text : str or list of str
        Input passed to the tokenizer (truncated and padded by the tokenizer).
    method : str
        Checkpoint name; names containing "roberta" (case-insensitive) load
        the RoBERTa tokenizer/model, everything else the BERT ones.

    Returns
    -------
    torch.Tensor
        ``outputs.last_hidden_state[:, 0, :]`` - presumably of shape
        (batch, hidden_size); computed without gradient tracking.
    """
    if method not in _BERT_TOKENIZER_AND_MODEL_CACHE:
        if "roberta" in method.lower():
            # RoBERTa is roberta-base
            tokenizer = RobertaTokenizer.from_pretrained(method)
            model = RobertaModel.from_pretrained(method)
        else:
            tokenizer = BertTokenizer.from_pretrained(method)
            model = BertModel.from_pretrained(method)
        # Inference only: make sure dropout and similar layers are disabled.
        model.eval()
        _BERT_TOKENIZER_AND_MODEL_CACHE[method] = (tokenizer, model)

    tokenizer, model = _BERT_TOKENIZER_AND_MODEL_CACHE[method]

    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # No autograd graph is needed for feature extraction.
    with torch.no_grad():
        outputs = model(**inputs)
    cls_sentence_embedding = outputs.last_hidden_state[:, 0, :]  # Extract [CLS] token embedding

    return cls_sentence_embedding

def get_vector_or_embedding_and_save_to_txt(folder_file_path, data, column_id_name, column_to_tranform,
                                     method="bert-base-uncased", embedding_level="sentence_cls_embedding"):
    """Embed one text column row by row and write ``<id>\\t<vector>`` lines to a text file.

    The target file is always emptied first. Rows are only written when
    ``embedding_level`` is "sentence_cls_embedding"; any other value leaves
    the file empty and prints nothing.

    Parameters
    ----------
    folder_file_path : str
        Path of the text file to (over)write.
    data : pandas.DataFrame
        Source rows.
    column_id_name, column_to_tranform : str
        Names of the id column and of the text column to embed.
    method : str
        Checkpoint name forwarded to ``get_bert_sentence_cls_embedding``.
    embedding_level : str
        Kind of embedding to produce.
    """
    # Start from an empty file.
    with open(folder_file_path, 'w'):
        pass

    if embedding_level != "sentence_cls_embedding":
        return

    with open(folder_file_path, 'a') as sink:
        for _, record in data.iterrows():
            record_id = record[column_id_name]
            record_text = record[column_to_tranform]

            embedding = get_bert_sentence_cls_embedding(record_text, method)

            # Tensor -> flat Python list -> its string form, for easy storage.
            serialized_vector = str(embedding.squeeze().tolist())

            sink.write(f"{record_id}\t{serialized_vector}\n")

        # Report where the vectors were stored.
        print(f'Feature vectors saved to {folder_file_path}')


def read_vector_or_embedding_txt_return_df(folder_file_path, data_columns):
    """Load a ``<id>\\t<vector>`` text file into a two-column DataFrame.

    Parameters
    ----------
    folder_file_path : str
        File with one ``<id>\\t<python list literal>`` entry per line.
    data_columns : list of str
        Names for the id column and the vector column, in that order.

    Returns
    -------
    pandas.DataFrame
        One row per line; ids stay strings, vectors are parsed into lists.
    """

    def _parse(raw_line):
        # Each line holds exactly two tab-separated fields.
        record_id, vector_literal = raw_line.strip().split('\t')
        # literal_eval turns the stored list text back into a Python list.
        return [record_id, ast.literal_eval(vector_literal)]

    with open(folder_file_path, 'r') as source:
        parsed_rows = [_parse(raw_line) for raw_line in source]

    return pd.DataFrame(parsed_rows, columns=data_columns)

def generate_BERT_vector_or_embedding(folder_name, data_df, method="bert-base-uncased", embedding_level="sentence_cls_embedding"):
    """Embed the text column of ``data_df`` and merge the vectors back onto it.

    The id column is the first column whose name contains "id". The text
    column is chosen among the columns whose names contain "argument",
    "review" or "categor" (case-insensitive): the last "argument" column if
    there is one, otherwise the first candidate.

    Parameters
    ----------
    folder_name : str
        Folder in which the ``<text column>_<method>_<embedding_level>.txt``
        file is written.
    data_df : pandas.DataFrame
        Frame with one id column and at least one text column.
    method : str
        Checkpoint name forwarded to the embedding helper.
    embedding_level : str
        Kind of embedding to produce.

    Returns
    -------
    pandas.DataFrame
        ``data_df`` merged on the id column with a
        ``<text column>_<method>_feature_vector`` column.

    Raises
    ------
    ValueError
        If no column qualifies as text column.
    """
    cols = list(data_df.columns)
    id_column = [col for col in cols if 'id' in col.lower()][0]

    text_candidates = [
        col for col in cols
        if "argument" in col.lower() or "review" in col.lower() or "categor" in col.lower()
    ]
    if not text_candidates:
        raise ValueError(
            "No text column found: expected a column name containing 'argument', 'review' or 'categor'."
        )

    # Pick the column once. Re-assigning the name inside a loop over the
    # candidates turned it into a string that was then indexed again, which
    # produced a single character whenever there were several candidates.
    argument_columns = [col for col in text_candidates if "argument" in col.lower()]
    forwhat_column = argument_columns[-1] if argument_columns else text_candidates[0]

    # Get BERT embedding
    output_file_path = f'{forwhat_column}_{method}_{embedding_level}.txt'
    folder_file_path = os.path.join(folder_name, output_file_path)
    get_vector_or_embedding_and_save_to_txt(folder_file_path, data_df,
                                            method=method, embedding_level=embedding_level,
                                            column_id_name=id_column, column_to_tranform=forwhat_column)

    feature_vectors_df = read_vector_or_embedding_txt_return_df(folder_file_path,
                                            data_columns=[id_column, f"{forwhat_column}_{method}_feature_vector"])

    # Merge data
    data_df_with_feature_vectors_df = pd.merge(data_df, feature_vectors_df, on=id_column)

    return data_df_with_feature_vectors_df


#     elif text_name=="business_categories":

# #         merged_data = pd.merge(train_data, business_data, on="business_id", how="left")
        
#         # drop duplicate business
#         categories_data = train_data[['business_id', text_name]].drop_duplicates(subset=['business_id']).reset_index(drop=True)
#         categories_data.to_csv(os.path.join(folder_name, f"{text_name}.csv"), index=False)
        
#         # Get BERT embedding
#         output_file_path = f'{text_name}_{method}_{embedding_level}.txt'
#         folder_file_path = os.path.join(folder_name, output_file_path)
        
#         get_vector_or_embedding_and_save_to_txt(folder_file_path, categories_data, 
#                                                 method=method, embedding_level=embedding_level, 
#                                                 column_id_name="business_id", column_to_tranform=text_name)

#         feature_vectors_df = read_vector_or_embedding_txt_return_df(folder_file_path, 
#                                                 data_columns=["business_id", f"{text_name}_feature_vector"])
        
# #         feature_vectors_df = feature_vectors_df.drop_duplicates(subset=['business_id']).reset_index(drop=True)
        
# #         categories_with_feature_vectors_df = pd.merge(merged_data, feature_vectors_df, on=id_name)
    
# #         return categories_with_feature_vectors_df
#         return feature_vectors_df

#     else:
#         print("Text name has not handled yet or wrong text name !")

Augmented text to BERT vector or embedding

In [None]:
# argumented text to vector function

def argumented_data_to_vector_or_embedding(
    folder_path,
    argumented_results_data,
    method="bert-base-uncased", 
    embedding_level="sentence_cls_embedding"):
    """Embed the "argument" text column of a frame and merge the vectors back onto it.

    The id column is the first column whose name contains "id" and the text
    column is the first column whose name contains "argument" (both
    case-insensitive). The vectors are written to
    ``<text column>_<method>_<embedding_level>.txt`` inside ``folder_path``,
    read back, and merged on the id column as
    ``<text column>_<method>_feature_vector``.

    Returns
    -------
    pandas.DataFrame
        ``argumented_results_data`` joined with its feature-vector column.
    """
    column_names = list(argumented_results_data.columns)
    id_candidates = [name for name in column_names if 'id' in name.lower()]
    id_column = id_candidates[0]
    text_candidates = [name for name in column_names if "argument" in name.lower()]
    forwhat_column = text_candidates[0]

    # Where the embeddings are stored on disk.
    vector_file_name = f'{forwhat_column}_{method}_{embedding_level}.txt'
    vector_file_path = os.path.join(folder_path, vector_file_name)

    # Compute and persist the embeddings ...
    get_vector_or_embedding_and_save_to_txt(
        vector_file_path,
        argumented_results_data,
        column_id_name=id_column,
        column_to_tranform=forwhat_column,
        method=method,
        embedding_level=embedding_level,
    )

    # ... then load them back as a DataFrame.
    vector_column = f"{forwhat_column}_{method}_feature_vector"
    vectors_df = read_vector_or_embedding_txt_return_df(
        vector_file_path, data_columns=[id_column, vector_column])

    # Attach the vectors to the input rows.
    return pd.merge(argumented_results_data, vectors_df, on=id_column)

#     else:
#         print("Text name has not handled yet or wrong text name !")

Text to Word2Vec vector or embedding

In [None]:
# # Download stopwords list
# nltk.download('stopwords')
# nltk.download('punkt')

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors, FastText
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Loaded word vectors keyed by model name; the pretrained files are large, so
# each one is loaded at most once per kernel session.
_WORD_EMBEDDING_VECTORS_CACHE = {}


def _load_word_embedding_vectors(model_name):
    """Load (once) and return the word vectors for ``model_name``."""
    if model_name not in _WORD_EMBEDDING_VECTORS_CACHE:
        if model_name == "word2vec":
            model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
        elif model_name == "fasttext":
            # NOTE(review): load_fasttext_format is deprecated in recent gensim
            # releases - confirm against the installed version.
            model = FastText.load_fasttext_format('cc.en.300.bin.gz')
        else:
            raise ValueError(f"Unknown model_name: {model_name!r} (expected 'word2vec' or 'fasttext')")
        # Full models keep their vectors under `.wv`; plain keyed vectors are
        # used directly.
        _WORD_EMBEDDING_VECTORS_CACHE[model_name] = getattr(model, "wv", model)
    return _WORD_EMBEDDING_VECTORS_CACHE[model_name]


def generate_word_embedding_vector(text, text_index, tfidf_matrix, vocabs, model_name="word2vec", method='concatenate', 
                        is_text_preprocessed=True, lower=True, punc=True, stop_word=True, lemmitize=True,
                        max_words=100):
    """Turn a text into a document vector built from pretrained word vectors.

    Parameters
    ----------
    text : str
        Document text.
    text_index : int
        Row of ``text`` in ``tfidf_matrix`` (used by 'tfidf_weighted_average').
    tfidf_matrix : 2-D array-like
        Indexed as ``tfidf_matrix[text_index, word_index]`` (used by
        'tfidf_weighted_average').
    vocabs : sequence of str
        Vocabulary whose positions are the columns of ``tfidf_matrix``.
    model_name : {'word2vec', 'fasttext'}
        Pretrained vectors to use.
    method : {'average', 'tfidf_weighted_average', 'concatenate'}
        How the word vectors are combined.
    is_text_preprocessed : bool
        When true, ``preprocess_text`` is applied to ``text`` first, using the
        ``lower`` / ``punc`` / ``stop_word`` / ``lemmitize`` switches.
    max_words : int
        'average' and 'tfidf_weighted_average' use at most the first
        ``max_words`` tokens; 'concatenate' pads or truncates to exactly
        ``max_words`` word vectors.

    Returns
    -------
    numpy.ndarray
        Vector of length ``vector_size`` for the averaging methods, or
        ``max_words * vector_size`` for 'concatenate'.

    Raises
    ------
    ValueError
        For an unknown ``model_name`` or ``method``.
    """
    if method not in ('average', 'tfidf_weighted_average', 'concatenate'):
        raise ValueError(f"Unknown method: {method!r}")

    vectors = _load_word_embedding_vectors(model_name)

    if is_text_preprocessed:
        text = preprocess_text(text, lower=lower, punc=punc, stop_word=stop_word, lemmitize=lemmitize)

    doc_tokens = word_tokenize(text)

    # Average method
    if method == 'average':
        doc_vector = np.zeros(vectors.vector_size)
        num_words = 0
        for word in doc_tokens[:max_words]:
            if word in vectors:
                doc_vector += vectors[word]
                num_words += 1

        # If no token has a vector, the zero vector is returned as is
        if num_words > 0:
            doc_vector /= num_words

        return doc_vector

    # TF-IDF weighted average method
    if method == 'tfidf_weighted_average':
        doc_vector = np.zeros(vectors.vector_size)
        tfidf_sum = 0
        # Dictionary lookup works for lists and arrays alike and yields None
        # (instead of raising) for words outside the vocabulary.
        word_to_column = {word: column for column, word in enumerate(vocabs)}
        for word in doc_tokens[:max_words]:
            word_index = word_to_column.get(word)
            if word_index is None or word not in vectors:
                continue
            # Get the TF-IDF weight of the word in the document
            tfidf_weight = tfidf_matrix[text_index, word_index]
            doc_vector += vectors[word] * tfidf_weight
            tfidf_sum += tfidf_weight

        # Normalise only when some weight was accumulated
        if tfidf_sum > 0:
            doc_vector /= tfidf_sum

        return doc_vector

    # Concatenation method
    doc_vectors = [vectors[word] for word in doc_tokens if word in vectors][:max_words]

    # Pad with zeros up to max_words vectors (truncation happened above)
    num_zeros_to_pad = max_words - len(doc_vectors)
    doc_vectors.extend([np.zeros(vectors.vector_size)] * num_zeros_to_pad)

    if not doc_vectors:
        # max_words <= 0: nothing to concatenate
        return np.zeros(0)

    return np.concatenate(doc_vectors)


# # Convert the text to word2vec vector using different methods
# text = "Your input text here..."
# doc_vector_average = text_to_word2vec(text, model, method='average', max_words=100)
# doc_vector_tfidf_weighted_average = text_to_word2vec(text, model, method='tfidf_weighted_average', max_words=100)
# doc_vector_concatenate = text_to_word2vec(text, model, method='concatenate', max_words=100)

# print("Document vector (average method) shape:", doc_vector_average.shape)
# print("Document vector (average method):", doc_vector_average)
# print("Document vector (TF-IDF weighted average method) shape:", doc_vector_tfidf_weighted_average.shape)
# print("Document vector (TF-IDF weighted average method):", doc_vector_tfidf_weighted_average)
# print("Document vector (concatenate method) shape:", doc_vector_concatenate.shape)
# print("Document vector (concatenate method):", doc_vector_concatenate)

Text to SVD or LDA topic vector, TFIDF vector, TF vector

In [None]:
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import re

def preprocess_text(text, lower=True, punc=True, digit=False, stop_word=True, lemmitize=True):
    """Normalize one raw text into a single space-joined string of tokens.

    The steps run in this order: optional lowercasing, optional replacement
    of punctuation by spaces, separation of adjacent letter/digit runs,
    optional digit removal, tokenization with ``word_tokenize``, optional
    WordNet lemmatization, optional English stop-word removal.

    Parameters
    ----------
    text : str or None or float
        The text to normalize. ``None`` and any float (which includes the
        NaN pandas uses for missing values) are treated as missing.
    lower : bool
        Lowercase the text.
    punc : bool
        Replace every character of ``string.punctuation`` with a space.
    digit : bool
        Remove all digit runs.
    stop_word : bool
        Drop tokens found in NLTK's English stop-word list.
    lemmitize : bool
        Lemmatize every token with ``WordNetLemmatizer``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``""`` for missing input.
    """
    # Missing values show up as None or as a float NaN. An equality test
    # against np.nan can never succeed (NaN != NaN), so the type check is
    # what actually catches NaN; None is handled explicitly so it does not
    # crash on ``.lower()`` further down.
    if text is None or isinstance(text, float):
        return ""

    if lower:
        text = text.lower()

    # Replace punctuation with spaces (rather than deleting it) so that
    # words joined only by punctuation do not get glued together.
    if punc:
        text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    # Separate consecutive letters and numbers
    text = re.sub(r'([a-zA-Z])([0-9])', r'\1 \2', text)
    text = re.sub(r'([0-9])([a-zA-Z])', r'\1 \2', text)

    # Remove digits
    if digit:
        text = re.sub(r'\d+', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Lemmatize tokens; the lemmatizer is only built when it is needed
    if lemmitize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Remove stopwords; the stop-word list is only loaded when it is needed
    if stop_word:
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]

    # Reconstruct text
    return ' '.join(tokens)

def compute_tfidf_vectors(texts, min_df=1, max_df=1.0, max_feature_num=None):
    """Fit a ``TfidfVectorizer`` on ``texts`` and return dense TF-IDF vectors.

    Parameters
    ----------
    texts : iterable of str
        The documents to vectorize.
    min_df, max_df :
        Forwarded unchanged to ``TfidfVectorizer``.
    max_feature_num : int or None
        Forwarded as ``max_features``.

    Returns
    -------
    tuple
        ``(matrix, feature_names)``: the fitted document-term matrix
        converted with ``.toarray()``, and the result of
        ``get_feature_names_out()``.
    """
    tfidf = TfidfVectorizer(
        max_features=max_feature_num,
        max_df=max_df,
        min_df=min_df,
    )
    document_term_matrix = tfidf.fit_transform(texts)
    dense_matrix = document_term_matrix.toarray()
    vocabulary = tfidf.get_feature_names_out()
    return dense_matrix, vocabulary

def compute_tf_vectors(texts, min_df=1, max_df=1.0, max_feature_num=None):
    """Fit a ``CountVectorizer`` on ``texts`` and return dense count vectors.

    Parameters
    ----------
    texts : iterable of str
        The documents to vectorize.
    min_df, max_df :
        Forwarded unchanged to ``CountVectorizer``.
    max_feature_num : int or None
        Forwarded as ``max_features``.

    Returns
    -------
    tuple
        ``(matrix, feature_names)``: the fitted document-term matrix
        converted with ``.toarray()``, and the result of
        ``get_feature_names_out()``.
    """
    counter = CountVectorizer(
        max_features=max_feature_num,
        max_df=max_df,
        min_df=min_df,
    )
    document_term_matrix = counter.fit_transform(texts)
    dense_matrix = document_term_matrix.toarray()
    vocabulary = counter.get_feature_names_out()
    return dense_matrix, vocabulary

def generate_topics(model_name, model, words, n_words=100):
    """List the highest-weighted words of every topic of a fitted model.

    Parameters
    ----------
    model_name : str
        Prefix of the index column, which is named
        ``f"{model_name}_topic_index"``.
    model :
        Object exposing ``components_``; iterating over
        ``abs(model.components_)`` must yield one weight row per topic,
        each supporting ``argsort()``.
    words : sequence
        Vocabulary, indexable by the positions used in ``components_`` rows.
    n_words : int
        Maximum number of words kept per topic.

    Returns
    -------
    pandas.DataFrame
        One row per topic with the columns ``f"{model_name}_topic_index"``
        and ``"word_list"``; each word list is ordered from the largest
        absolute weight downwards.
    """
    index_col = f"{model_name}_topic_index"
    topic_indices = []
    word_lists = []
    # Words are ranked by absolute weight, so strongly negative loadings
    # count as much as strongly positive ones.
    for topic_index, weights in enumerate(abs(model.components_)):
        ranked_positions = weights.argsort()[::-1][:n_words]
        topic_indices.append(topic_index)
        word_lists.append([words[position] for position in ranked_positions])

    # Build the frame in one go rather than enlarging it cell by cell,
    # which is slow and leaves the integer index in an object column.
    return pd.DataFrame(
        {index_col: topic_indices, "word_list": word_lists},
        columns=[index_col, "word_list"],
    )

def _select_id_and_text_columns(data_df):
    """Pick the id column and the text column of ``data_df`` from its column names.

    The id column is the first column whose lowercased name contains "id".
    Text candidates are the columns whose lowercased name contains "review",
    "categor" or "argument". A candidate whose name contains "argument"
    (matched case-sensitively) is preferred, the last such column winning;
    without one, the first candidate is used.

    Raises IndexError when no id column exists and KeyError when no text
    candidate exists.
    """
    columns = list(data_df.columns)

    id_candidates = [col for col in columns if "id" in col.lower()]
    if not id_candidates:
        raise IndexError("no column name contains 'id'")

    text_candidates = [
        col for col in columns
        if any(key in col.lower() for key in ("review", "categor", "argument"))
    ]
    if not text_candidates:
        raise KeyError("no column name contains 'review', 'categor' or 'argument'")

    argument_cols = [col for col in text_candidates if "argument" in col]
    text_col = argument_cols[-1] if argument_cols else text_candidates[0]
    return id_candidates[0], text_col


def _vector_frame(data_df, id_col, vector_col, vectors):
    """Pair the ids of ``data_df`` with ``vectors`` row by row, by position."""
    # Positional pairing keeps ids aligned with their vectors even when
    # data_df does not carry a default 0..n-1 index (e.g. after filtering).
    return pd.DataFrame({id_col: data_df[id_col].tolist(), vector_col: list(vectors)})


def _vectorize_for_topic_method(texts, topic_method, min_df, max_df, max_feature_num):
    """Return TF-IDF vectors for "SVD" and raw term counts for "LDA"."""
    if topic_method == "SVD":
        return compute_tfidf_vectors(texts, min_df=min_df, max_df=max_df, max_feature_num=max_feature_num)
    return compute_tf_vectors(texts, min_df=min_df, max_df=max_df, max_feature_num=max_feature_num)


def _new_topic_model(topic_method, n_components, random_state):
    """Create the unfitted decomposition model for ``topic_method``."""
    if topic_method == "SVD":
        return TruncatedSVD(n_components=n_components, random_state=random_state)
    return LatentDirichletAllocation(n_components=n_components, random_state=random_state)


def _write_word_list(path, feature_names):
    """Write one vocabulary word per line to ``path`` as UTF-8."""
    with open(path, 'w', encoding='utf-8') as file:
        for word in feature_names:
            file.write(word + '\n')


def generate_topic_tfidf_tf_vector(data_df_list, topic_method="SVD", n_components=10,
                                   min_df=1, max_df=1.0, max_feature_num=None, 
                                   is_text_preprocessed=True, lower=True, punc=True, 
                                   digit=False, stop_word=True, 
                                   lemmitize=True, data_dict_seperate=False,
                                   random_state=None):
    """Build per-document term vectors and topic vectors for a list of data frames.

    For ``topic_method="SVD"`` the texts are turned into TF-IDF vectors and
    reduced with ``TruncatedSVD``; for ``"LDA"`` they are turned into term
    counts and reduced with ``LatentDirichletAllocation``. The id and text
    columns of each frame are chosen by name (see
    ``_select_id_and_text_columns``).

    Parameters
    ----------
    data_df_list : list of pandas.DataFrame
        Frames holding an id column and a text column.
    topic_method : str
        ``"SVD"`` or ``"LDA"``. Any other value prints a message and the
        function returns ``None``.
    n_components : int
        Number of topics/components of the decomposition model.
    min_df, max_df, max_feature_num :
        Forwarded to ``compute_tfidf_vectors`` / ``compute_tf_vectors``.
    is_text_preprocessed : bool
        When True, every text is first passed through ``preprocess_text``
        (i.e. True means "apply preprocessing here").
    lower, punc, digit, stop_word, lemmitize : bool
        Forwarded to ``preprocess_text``.
    data_dict_seperate : bool
        True: vectorize and fit one model per frame, each with its own
        vocabulary. False: concatenate the texts of all frames and fit a
        single shared vocabulary and model.
    random_state : int or None
        Forwarded to the decomposition model so results can be reproduced.

    Returns
    -------
    tuple or None
        Separate mode: ``(topic_vector_frames, topic_frames,
        term_vector_frames, dictionaries)``, where the first three are lists
        with one entry per input frame and ``dictionaries`` maps a
        settings-derived key to each frame's vocabulary.
        Joint mode: ``(topic_vector_frames, topics_df, term_vector_frames,
        feature_names)`` with a single topic frame and vocabulary.

    Side effects
    ------------
    Writes into the current working directory: the topic word lists as
    ``*_topic_word_list.csv`` and the vocabulary as ``*_tfidf_dictionary.txt``
    (that suffix is used for the LDA term-count vocabulary as well). In
    separate mode the file names are prefixed with the text column name.
    """
    # (term-vector label, topic-vector label) used in the output column names
    labels = {"SVD": ("tfidf", "svd"), "LDA": ("tf", "lda")}
    if topic_method not in labels:
        print("Please select a topic method within SVD or LDA !")
        return None
    term_label, topic_label = labels[topic_method]

    settings_tag = (f'preprocessed_{is_text_preprocessed}_lower_{lower}_punc_{punc}_digit_{digit}'
                    f'_stopword_{stop_word}_lemmitize_{lemmitize}_tfidf_dictionary')

    def _normalize(texts):
        # Apply preprocess_text to every document when requested.
        if not is_text_preprocessed:
            return texts
        return [
            preprocess_text(text, lower=lower, punc=punc, digit=digit,
                            stop_word=stop_word, lemmitize=lemmitize)
            for text in texts
        ]

    topic_vectors_df_list = []
    term_vectors_df_list = []

    if data_dict_seperate:

        topics_df_list = []
        dictionaries = {}

        for data_df in data_df_list:
            id_col, forwhat_col = _select_id_and_text_columns(data_df)

            # generate term vectors (TF-IDF for SVD, TF for LDA)
            text_list = _normalize(data_df[forwhat_col].values.tolist())
            term_vectors, feature_names = _vectorize_for_topic_method(
                text_list, topic_method, min_df, max_df, max_feature_num)
            term_vectors_df_list.append(_vector_frame(
                data_df, id_col, f"{forwhat_col}_{term_label}_feature_vector", term_vectors))

            dictionary_name = f'{forwhat_col}_{settings_tag}'
            dictionaries[dictionary_name] = feature_names
            _write_word_list(f'{dictionary_name}.txt', feature_names)

            # generate topic vectors
            topic_model = _new_topic_model(topic_method, n_components, random_state)
            topic_vectors = topic_model.fit_transform(term_vectors)
            topic_vectors_df_list.append(_vector_frame(
                data_df, id_col, f"{forwhat_col}_{topic_label}_feature_vector", topic_vectors))

            # get topics
            topics_df = generate_topics(model_name=topic_method, model=topic_model,
                                        words=feature_names, n_words=10000)
            topics_df_list.append(topics_df)
            topics_df.to_csv(f"{forwhat_col}_{topic_method}_topic_word_list.csv", index=False)

        return topic_vectors_df_list, topics_df_list, term_vectors_df_list, dictionaries

    # Joint mode: one vocabulary and one model over the texts of all frames.
    selected_columns = [_select_id_and_text_columns(data_df) for data_df in data_df_list]

    text_list = []
    for data_df, (_, forwhat_col) in zip(data_df_list, selected_columns):
        text_list += data_df[forwhat_col].values.tolist()
    text_list = _normalize(text_list)

    term_vectors, feature_names = _vectorize_for_topic_method(
        text_list, topic_method, min_df, max_df, max_feature_num)

    topic_model = _new_topic_model(topic_method, n_components, random_state)
    topic_vectors = topic_model.fit_transform(term_vectors)

    # Rows were concatenated frame after frame, so each frame owns the
    # contiguous slice [start_index:end_index] of BOTH matrices.
    start_index = 0
    for data_df, (id_col, forwhat_col) in zip(data_df_list, selected_columns):
        end_index = start_index + len(data_df)
        term_vectors_df_list.append(_vector_frame(
            data_df, id_col, f"{forwhat_col}_{term_label}_feature_vector",
            term_vectors[start_index:end_index]))
        topic_vectors_df_list.append(_vector_frame(
            data_df, id_col, f"{forwhat_col}_{topic_label}_feature_vector",
            topic_vectors[start_index:end_index]))
        start_index = end_index

    # get topics
    topics_df = generate_topics(model_name=topic_method, model=topic_model,
                                words=feature_names, n_words=10000)
    topics_df.to_csv(f"{topic_method}_topic_word_list.csv", index=False)

    _write_word_list(f'{settings_tag}.txt', feature_names)

    return topic_vectors_df_list, topics_df, term_vectors_df_list, feature_names