In [None]:
# from pathlib import Path
# import nbformat

# def load_notebook(notebook_path):
#     with open(notebook_path, 'r', encoding='utf-8') as f:
#         nb = nbformat.read(f, as_version=4)
#     code_cells = [cell.source for cell in nb.cells if cell.cell_type == 'code']
#     exec('\n'.join(code_cells), globals())

## import written function and variable

# parent_directory = Path('../../../')
## parent_directory = Path('../../')
# data_preprocessing_utils_path = parent_directory / 'data_preprocessing_utils.ipynb'

# load_notebook(data_preprocessing_utils_path)

# Data split

In [None]:
def dataset_split(df, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, stratify=(False, "stars")):
    """Split ``df`` into training, validation and test DataFrames.

    The data is first split into a training part and a hold-out part; the
    hold-out part is then divided into the validation and test sets. Both
    splits use ``random_state=42`` so the result is reproducible.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    train_ratio, valid_ratio, test_ratio : float
        Fractions of ``df`` assigned to each partition. Each must be
        non-negative and together they must sum to 1 (checked with a small
        tolerance, so e.g. 0.7 / 0.2 / 0.1 is accepted).
    stratify : tuple(bool, str)
        ``(enabled, column_name)``. When ``enabled`` is true both splits are
        stratified on ``column_name``.

    Returns
    -------
    tuple of pandas.DataFrame or None
        ``(train_data, valid_data, test_data)`` with fresh 0..n-1 indexes, or
        ``None`` (after printing a message) when the ratios are invalid.
    """
    import math

    ratios = (train_ratio, valid_ratio, test_ratio)
    # Compare with a tolerance: binary floats rarely sum to exactly 1
    # (0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999).
    ratios_ok = all(ratio >= 0 for ratio in ratios) and math.isclose(sum(ratios), 1.0, rel_tol=0.0, abs_tol=1e-9)
    if not ratios_ok:
        print("Please make sure each ratio is larger than 0 and sum of the three ratio is 1 !")
        return None

    from sklearn.model_selection import train_test_split

    use_stratify, stratify_column = stratify[0], stratify[1]
    holdout_ratio = valid_ratio + test_ratio
    # NOTE(review): a ratio of exactly 0 passes the check above but is expected
    # to be rejected by train_test_split (or to divide by zero here) — confirm
    # whether zero-sized partitions need to be supported.
    test_share_of_holdout = test_ratio / holdout_ratio

    # First cut: training set vs. everything held out.
    train_data, remaining = train_test_split(
        df,
        test_size=holdout_ratio,
        stratify=df[stratify_column] if use_stratify else None,
        random_state=42,
    )
    # Second cut: divide the hold-out part into validation and test sets.
    valid_data, test_data = train_test_split(
        remaining,
        test_size=test_share_of_holdout,
        stratify=remaining[stratify_column] if use_stratify else None,
        random_state=42,
    )

    # Reset index (re-assign instead of mutating the split results in place).
    train_data = train_data.reset_index(drop=True)
    valid_data = valid_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)

    print(f"Original dataset size: {df.shape}, ratio is {len(df)/len(df)}", )
    print(f"Training dataset size: {train_data.shape}, ratio is {len(train_data)/len(df)}")
    print(f"Validation dataset size: {valid_data.shape}, ratio is {len(valid_data)/len(df)}")
    print(f"Test dataset size: {test_data.shape}, ratio is {len(test_data)/len(df)}")

    return train_data, valid_data, test_data

In [None]:
def kfold_cross_validation(data_df, train_ratio=0.9, test_ratio=0.1, fold_num=10):
    """Hold out a test set, then write K cross-validation folds of the training set to disk.

    Files written (relative to the current working directory):

    * ``research_training_set.csv`` / ``research_test_set.csv`` — the
      train / test split of ``data_df``.
    * ``folds/fold_<k>/train_set_fold_<k>.csv`` and
      ``folds/fold_<k>/test_set_fold_<k>.csv`` for ``k = 1..fold_num``.

    The folds are built from the *training* portion only, so rows of the
    held-out test set never appear in any fold.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Full dataset.
    train_ratio : float
        Not used by the split (only ``test_ratio`` is passed on); kept so
        existing callers keep working.
    test_ratio : float
        Fraction of ``data_df`` held out as the test set.
    fold_num : int
        Number of folds.

    Returns
    -------
    None
    """
    # Imported locally so the function does not depend on another cell having
    # imported `os` first.
    import os
    from sklearn.model_selection import KFold
    from sklearn.model_selection import train_test_split

    train_data, test_data = train_test_split(data_df, test_size=test_ratio, random_state=42)
    train_data.to_csv("research_training_set.csv", index=False)
    test_data.to_csv("research_test_set.csv", index=False)
    print("Data has successfully split into training set and test set and saved to csv files !")

    # Initialize the KFold cross-validator
    kf = KFold(n_splits=fold_num, shuffle=True, random_state=42)

    # Create a folder named 'folds' if it doesn't exist
    os.makedirs('folds', exist_ok=True)

    # Split the training portion (not the full dataset) so the held-out test
    # rows cannot leak into any fold.
    for fold_no, (train_index, test_index) in enumerate(kf.split(train_data), start=1):
        # Create a folder for each fold
        fold_dir = f'folds/fold_{fold_no}'
        os.makedirs(fold_dir, exist_ok=True)

        # kf.split yields positional indices, hence iloc.
        train_set = train_data.iloc[train_index]
        test_set = train_data.iloc[test_index]

        # Save training and testing sets to CSV files in the fold's folder
        train_set.to_csv(f'{fold_dir}/train_set_fold_{fold_no}.csv', index=False)
        test_set.to_csv(f'{fold_dir}/test_set_fold_{fold_no}.csv', index=False)

    print(f"Training data has successfully split into {fold_num} folds and saved to csv files !")

# Formatting text data

In [None]:
import os
import pandas as pd
import numpy as np

def remove_backslash_and_newline(text):
    """Return ``text`` with every backslash and newline character deleted.

    Anything that is not a ``str`` is handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    # A single translation pass that drops both characters.
    return text.translate(str.maketrans("", "", "\\\n"))

# Wrap reviews into a dictionary-style text format.
def formatting_content_method1(train_data):
    """Concatenate reviews per user and per business as stringified dictionaries.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must provide the columns ``user_id``, ``business_id``, ``text`` and
        ``business_categories``. The text is used as-is (backslashes and
        newlines are not stripped here).

    Returns
    -------
    (user_data, business_data) : tuple of pandas.DataFrame
        * ``user_data`` — one row per user: ``user_id`` and
          ``user_concatenated_reviews_with_business_categories``, where each
          review becomes ``str({"review_<n>": text, "business_categories": ...})``
          and the per-review strings are joined with newlines.
        * ``business_data`` — one row per business: ``business_id`` and
          ``business_concatenated_reviews_with_business_categories``, the
          ``str()`` of one dict holding every ``review_<n>`` plus the
          categories of the group's first row.

    The first rows of both frames are printed as a quick preview.
    """
    user_text_col = "user_concatenated_reviews_with_business_categories"
    business_text_col = "business_concatenated_reviews_with_business_categories"

    # Merge user reviews with corresponding business_categories.
    # Records are collected in a list and turned into a DataFrame once, rather
    # than enlarging the frame one cell at a time.
    user_records = []
    for user_id, group in train_data.groupby("user_id"):
        review_dicts = [
            {f"review_{review_no}": text, "business_categories": categories}
            for review_no, (text, categories)
            in enumerate(zip(group["text"], group["business_categories"]), start=1)
        ]
        user_records.append((user_id, "\n".join(str(review_dict) for review_dict in review_dicts)))

    user_data = pd.DataFrame(user_records, columns=["user_id", user_text_col], dtype=object)
    print(user_data.head())

    # Merge business reviews and add business_categories
    business_records = []
    for business_id, group in train_data.groupby('business_id'):
        reviews_and_categories = {
            f"review_{review_no}": text
            for review_no, text in enumerate(group["text"], start=1)
        }
        # Categories are taken from the first review row of the business.
        reviews_and_categories["business_categories"] = group['business_categories'].iloc[0]
        business_records.append((business_id, str(reviews_and_categories)))

    business_data = pd.DataFrame(business_records, columns=["business_id", business_text_col], dtype=object)
    print(business_data.head())

    return user_data, business_data

In [None]:
# Wrap reviews into a format that reads more like talking / an article.

def formatting_content_method_v2(train_data):
    """Concatenate reviews per user and per business as plain labelled text lines.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must provide the columns ``user_id``, ``business_id``, ``text`` and
        ``business_categories``.

    Returns
    -------
    (user_data, business_data) : tuple of pandas.DataFrame
        * ``user_data`` — one row per user: ``user_id`` and
          ``user_concatenated_reviews_with_business_categories``; one line per
          review of the form
          ``review_<n>: <text>, restaurant_categories: <categories>``.
        * ``business_data`` — one row per business: ``business_id`` and
          ``business_concatenated_reviews_with_business_categories``; a first
          line ``restaurant_categories: <categories of the first row>``
          followed by one ``review_<n>: <text>`` line per review.

    The first rows of both frames are printed as a quick preview.
    """
    user_text_col = "user_concatenated_reviews_with_business_categories"
    business_text_col = "business_concatenated_reviews_with_business_categories"

    # Records are collected in a list and turned into a DataFrame once, rather
    # than enlarging the frame one cell at a time.
    user_records = []
    for user_id, group in train_data.groupby("user_id"):
        review_lines = [
            f"review_{review_no}: {text}, restaurant_categories: {categories}"
            for review_no, (text, categories)
            in enumerate(zip(group["text"], group["business_categories"]), start=1)
        ]
        user_records.append((user_id, "\n".join(review_lines)))

    user_data = pd.DataFrame(user_records, columns=["user_id", user_text_col], dtype=object)
    print(user_data.head())

    # Merge business reviews and add business_categories
    business_records = []
    for business_id, group in train_data.groupby('business_id'):
        review_lines = [
            f"review_{review_no}: {text}"
            for review_no, text in enumerate(group["text"], start=1)
        ]
        # Categories are taken from the first review row of the business.
        business_categories = group['business_categories'].iloc[0]
        business_text = f"restaurant_categories: {business_categories}" + '\n' + "\n".join(review_lines)
        business_records.append((business_id, business_text))

    business_data = pd.DataFrame(business_records, columns=["business_id", business_text_col], dtype=object)
    print(business_data.head())

    return user_data, business_data

In [None]:
# Wrap reviews into a format that reads more like talking / an article, and
# turn the categories into a sentence.

def _categories_to_sentence(business_categories):
    """Turn a ``", "``-separated category string into "This is a(n) <categories> restaurant."

    Categories containing the word "restaurant" (any case) are dropped so the
    sentence does not repeat it. When nothing is left — every category was
    dropped, the string is empty, or the value is not a string (e.g. NaN) —
    the sentence falls back to "This is a restaurant.".
    """
    if not isinstance(business_categories, str):
        business_categories = ""

    kept = [word for word in business_categories.split(", ") if "restaurant" not in word.lower()]
    description = ", ".join(kept)
    if not description:
        # Nothing to describe; indexing description[0] would raise here.
        return "This is a restaurant."

    begin = "This is an " if description[0].lower() in ("a", "e", "i", "o", "u") else "This is a "
    return begin + description + " restaurant."


def formatting_content_method_v3(train_data):
    """Concatenate reviews per user and per business, with categories phrased as a sentence.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must provide the columns ``user_id``, ``business_id``, ``text`` and
        ``business_categories``.

    Returns
    -------
    (user_data, business_data) : tuple of pandas.DataFrame
        * ``user_data`` — one row per user: ``user_id`` and
          ``user_concatenated_reviews_with_business_categories``; one line per
          review of the form ``review_<n>: <categories sentence> <text>``.
        * ``business_data`` — one row per business: ``business_id`` and
          ``business_concatenated_reviews_with_business_categories``; the
          categories sentence (built from the group's first row) followed by
          one ``review_<n>: <text>`` line per review.

    The first rows of both frames are printed as a quick preview.
    """
    user_text_col = "user_concatenated_reviews_with_business_categories"
    business_text_col = "business_concatenated_reviews_with_business_categories"

    # Records are collected in a list and turned into a DataFrame once, rather
    # than enlarging the frame one cell at a time.
    user_records = []
    for user_id, group in train_data.groupby("user_id"):
        review_lines = [
            f"review_{review_no}: {_categories_to_sentence(categories)} {text}"
            for review_no, (text, categories)
            in enumerate(zip(group["text"], group["business_categories"]), start=1)
        ]
        user_records.append((user_id, "\n".join(review_lines)))

    user_data = pd.DataFrame(user_records, columns=["user_id", user_text_col], dtype=object)
    print(user_data.head())

    # Merge business reviews and add business_categories
    business_records = []
    for business_id, group in train_data.groupby('business_id'):
        review_lines = [
            f"review_{review_no}: {text}"
            for review_no, text in enumerate(group["text"], start=1)
        ]
        # Categories are taken from the first review row of the business.
        categories_sentence = _categories_to_sentence(group['business_categories'].iloc[0])
        business_records.append((business_id, categories_sentence + '\n' + "\n".join(review_lines)))

    business_data = pd.DataFrame(business_records, columns=["business_id", business_text_col], dtype=object)
    print(business_data.head())

    return user_data, business_data

# Text to vector

Original text to BERT Embedding

In [None]:
import pandas as pd
import os
import ast
import torch
from transformers import BertTokenizer, BertModel

def check_users_and_business_from_valid_and_test_is_in_training_set(train_data, valid_data, test_data):
    """Report whether the validation and test sets only contain known users and businesses.

    For both ``valid_data`` and ``test_data`` this checks that every
    ``user_id`` and every ``business_id`` also occurs in ``train_data``, then
    prints one yes/no line per check. Nothing is returned.
    """

    def _fully_covered(subset, key):
        # True when each value of subset[key] appears somewhere in train_data[key].
        return subset[key].isin(train_data[key]).all()

    # Run all four checks first, then print, so a failing lookup prints nothing.
    valid_users_in_train = _fully_covered(valid_data, 'user_id')
    valid_businesses_in_train = _fully_covered(valid_data, 'business_id')
    test_users_in_train = _fully_covered(test_data, 'user_id')
    test_businesses_in_train = _fully_covered(test_data, 'business_id')

    report = (
        ('Are all users in the validation set also in the training set?', valid_users_in_train),
        ('Are all businesses in the validation set also in the training set?', valid_businesses_in_train),
        ('Are all users in the test set also in the training set?', test_users_in_train),
        ('Are all businesses in the test set also in the training set?', test_businesses_in_train),
    )
    for question, answer in report:
        print(question, answer)


# Tokenizer/model pairs that have already been loaded, keyed by checkpoint
# name, so repeated calls reuse them instead of reloading from disk.
_BERT_RESOURCES = {}


def get_bert_sentence_cls_embedding(text):
    """Return the ``[CLS]`` embedding of ``text`` from ``bert-base-uncased``.

    Parameters
    ----------
    text : str or list of str
        Passed straight to the tokenizer with truncation and padding enabled.

    Returns
    -------
    torch.Tensor
        ``outputs.last_hidden_state[:, 0, :]`` — the hidden state at sequence
        position 0 for each input. Computed under ``torch.no_grad()``, so the
        tensor carries no autograd history.
    """
    checkpoint = 'bert-base-uncased'

    # This function is called once per row by the embedding writers; loading
    # the tokenizer and model on every call would dominate the run time.
    if checkpoint not in _BERT_RESOURCES:
        _BERT_RESOURCES[checkpoint] = (
            BertTokenizer.from_pretrained(checkpoint),
            BertModel.from_pretrained(checkpoint),
        )
    tokenizer, model = _BERT_RESOURCES[checkpoint]

    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Feature extraction only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    cls_sentence_embedding = outputs.last_hidden_state[:, 0, :]  # Extract [CLS] token embedding
    return cls_sentence_embedding

def get_vector_or_embedding_and_save_to_txt(folder_file_path, data, column_id_name, column_to_tranform,
                                     method="bert-base-uncased", embedding_level="sentence_cls_embedding"):
    """Embed one text column of ``data`` row by row and write the vectors to a text file.

    Each output line has the form ``<id>\\t<vector as a Python list literal>``.
    Any existing file at ``folder_file_path`` is overwritten.

    Parameters
    ----------
    folder_file_path : str
        Path of the text file to (re)create.
    data : pandas.DataFrame
        Source rows.
    column_id_name : str
        Column whose value is written as the line's id.
    column_to_tranform : str
        Column holding the text to embed.
    method, embedding_level : str
        Only ``"bert-base-uncased"`` with ``"sentence_cls_embedding"`` is
        implemented. For any other combination the file is left empty and a
        message is printed.

    Returns
    -------
    None
    """
    supported = method == "bert-base-uncased" and embedding_level == "sentence_cls_embedding"

    # Mode 'w' already truncates an existing file, so a single open is enough.
    with open(folder_file_path, 'w') as output_file:

        if not supported:
            # Say so explicitly instead of silently leaving an empty file behind.
            print(f"Unsupported method/embedding_level combination ({method!r}, {embedding_level!r}); "
                  f"no feature vectors written to {folder_file_path}")
            return

        # Iterate through the rows and write one result line per row
        for index, row in data.iterrows():

            _id = row[column_id_name]
            text = row[column_to_tranform]

            feature_vector = get_bert_sentence_cls_embedding(text)

            # Convert PyTorch Tensor to a list for easy storage
            feature_vector_list = feature_vector.squeeze().tolist()

            # Convert the list to a string for storage
            feature_vector_str = str(feature_vector_list)

            # Write the id and feature_vector_str to the text file
            output_file.write(f"{_id}\t{feature_vector_str}\n")

    # Print a message indicating the successful saving of feature vectors
    print(f'Feature vectors saved to {folder_file_path}')


def read_vector_or_embedding_txt_return_df(folder_file_path, data_columns):
    """Load an ``<id>\\t<vector literal>`` text file into a two-column DataFrame.

    Parameters
    ----------
    folder_file_path : str
        File with one ``<id>``, tab, ``<Python list literal>`` record per line.
    data_columns : list of str
        Names for the id column and the vector column, in that order.

    Returns
    -------
    pandas.DataFrame
        One row per line; the id stays a string and the vector is parsed back
        into a Python object with ``ast.literal_eval``.
    """

    def _parse(raw_line):
        # Each record is exactly "<id>\t<vector>"; anything else fails to unpack.
        record_id, vector_repr = raw_line.strip().split('\t')
        return [record_id, ast.literal_eval(vector_repr)]

    with open(folder_file_path, 'r') as handle:
        records = [_parse(raw_line) for raw_line in handle]

    return pd.DataFrame(records, columns=data_columns)

def training_dataset_get_vector_or_embedding(folder_name, train_data,  
                                   method="bert-base-uncased", embedding_level="sentence_cls_embedding",
                                   text_name="user_concatenated_reviews_with_business_categories"):
    """Embed one text column of the training data and return the vectors as a DataFrame.

    The vectors are written to
    ``<folder_name>/<text_name>_<method>_<embedding_level>.txt`` and then read
    back from that file.

    Behaviour by ``text_name``:

    * ``"user_concatenated_reviews_with_business_categories"`` or
      ``"business_concatenated_reviews_with_business_categories"`` — embeds
      that column of ``train_data`` and returns ``train_data`` merged with a
      ``user_feature_vector`` / ``business_feature_vector`` column on
      ``user_id`` / ``business_id``.
    * ``"business_categories"`` — keeps one row per ``business_id``, saves
      that table as ``<folder_name>/business_categories.csv``, embeds the
      categories and returns only ``business_id`` plus
      ``business_categories_feature_vector``.
    * anything else — prints a message and returns ``None``.
    """
    wants_user_text = text_name == "user_concatenated_reviews_with_business_categories"
    wants_business_text = text_name == "business_concatenated_reviews_with_business_categories"

    if wants_user_text or wants_business_text:
        for_who = "user" if wants_user_text else "business"
        id_name = f"{for_who}_id"

        # Get BERT embedding
        folder_file_path = os.path.join(folder_name, f'{text_name}_{method}_{embedding_level}.txt')
        get_vector_or_embedding_and_save_to_txt(
            folder_file_path,
            train_data,
            column_id_name=id_name,
            column_to_tranform=f"{text_name}",
            method=method,
            embedding_level=embedding_level,
        )
        feature_vectors_df = read_vector_or_embedding_txt_return_df(
            folder_file_path,
            data_columns=[id_name, f"{for_who}_feature_vector"],
        )

        # Merge data
        return pd.merge(train_data, feature_vectors_df, on=id_name)

    if text_name == "business_categories":
        # drop duplicate business
        one_row_per_business = train_data[['business_id', text_name]].drop_duplicates(subset=['business_id'])
        categories_data = one_row_per_business.reset_index(drop=True)
        categories_data.to_csv(os.path.join(folder_name, f"{text_name}.csv"), index=False)

        # Get BERT embedding
        folder_file_path = os.path.join(folder_name, f'{text_name}_{method}_{embedding_level}.txt')
        get_vector_or_embedding_and_save_to_txt(
            folder_file_path,
            categories_data,
            column_id_name="business_id",
            column_to_tranform=text_name,
            method=method,
            embedding_level=embedding_level,
        )

        # Only the id/vector table is returned here (no merge with train_data).
        return read_vector_or_embedding_txt_return_df(
            folder_file_path,
            data_columns=["business_id", f"{text_name}_feature_vector"],
        )

    print("Text name has not handled yet or wrong text name !")

Augmented text to BERT Embedding

In [None]:
# argumented text to vector function

def argumented_data_to_vector_or_embedding(
    folder_path,
    argumented_results_data,
    method="bert-base-uncased", 
    embedding_level="sentence_cls_embedding",
    text_name="argumented_text_result"):
    """Embed the "argumented" text column of a DataFrame and attach the vectors.

    The id column is the first column whose name contains ``"id"`` and the text
    column is the first whose name contains ``"argument"`` (both
    case-insensitive). Vectors are written to
    ``<folder_path>/<text column>_<method>_<embedding_level>.txt``, read back,
    and merged onto ``argumented_results_data`` by the id column as
    ``<text column>_feature_vector``.

    Returns the merged DataFrame, or ``None`` (after printing a message) when
    ``text_name`` is not ``"argumented_text_result"``.
    """
    column_names = list(argumented_results_data.columns)

    # Column discovery happens before the text_name check, so a frame without
    # a matching column raises IndexError regardless of text_name.
    id_candidates = [name for name in column_names if 'id' in name.lower()]
    id_column = id_candidates[0]
    text_candidates = [name for name in column_names if "argument" in name.lower()]
    forwhat_column = text_candidates[0]

    if text_name != "argumented_text_result":
        print("Text name has not handled yet or wrong text name !")
        return None

    # Get BERT embedding
    folder_file_path = os.path.join(folder_path, f'{forwhat_column}_{method}_{embedding_level}.txt')
    get_vector_or_embedding_and_save_to_txt(
        folder_file_path,
        argumented_results_data,
        column_id_name=id_column,
        column_to_tranform=forwhat_column,
        method=method,
        embedding_level=embedding_level,
    )
    feature_vectors_df = read_vector_or_embedding_txt_return_df(
        folder_file_path,
        data_columns=[id_column, f"{forwhat_column}_feature_vector"],
    )

    # Merge data
    return pd.merge(argumented_results_data, feature_vectors_df, on=id_column)

Text to TF-IDF embedding

In [None]:
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

# Lazily-built helpers shared by every normalize_text call (lemmatizer,
# stop-word set, punctuation-stripping table).
_NORMALIZE_TEXT_RESOURCES = {}


def normalize_text(text):
    """Lower-case, strip punctuation, lemmatize and remove English stop words.

    Parameters
    ----------
    text : str, float or None
        Text to normalize. Missing values — ``None`` or any float, which
        includes NaN — yield an empty string.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.

    Notes
    -----
    Uses NLTK's ``word_tokenize``, ``WordNetLemmatizer`` and the English
    stop-word list, so the corresponding NLTK data must already be downloaded.
    """
    # NaN is a float, so the isinstance check covers it; an `== np.nan`
    # comparison would never match because NaN is not equal to anything.
    if text is None or isinstance(text, float):
        return ""

    # Build the helpers once: they do not depend on the input, and re-reading
    # the stop-word list for every text is wasted work.
    if "helpers" not in _NORMALIZE_TEXT_RESOURCES:
        _NORMALIZE_TEXT_RESOURCES["helpers"] = (
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
            str.maketrans('', '', string.punctuation),
        )
    lemmatizer, stop_words, punctuation_table = _NORMALIZE_TEXT_RESOURCES["helpers"]

    # Convert text to lowercase and remove punctuation
    text = text.lower().translate(punctuation_table)

    # Tokenize
    tokens = word_tokenize(text)

    # Lemmatize tokens first, then drop stop words (filtering is applied to
    # the lemmatized form).
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    filtered_tokens = [word for word in lemmatized_tokens if word not in stop_words]

    # Reconstruct text
    return ' '.join(filtered_tokens)

def compute_tfidf_vectors(texts, max_feature_num=None):
    """Fit a TF-IDF vectorizer on ``texts`` and return the dense vectors with their vocabulary.

    Parameters
    ----------
    texts : iterable of str
        Documents to fit and transform.
    max_feature_num : int or None
        Forwarded to ``TfidfVectorizer`` as ``max_features``.

    Returns
    -------
    (vectors, feature_names)
        ``vectors`` is the dense (``toarray()``) form of the fitted matrix, one
        row per text; ``feature_names`` is ``get_feature_names_out()`` of the
        same vectorizer.
    """
    tfidf = TfidfVectorizer(max_features=max_feature_num)
    dense_vectors = tfidf.fit_transform(texts).toarray()
    vocabulary = tfidf.get_feature_names_out()
    return dense_vectors, vocabulary

def _tfidf_locate_columns(data_df):
    """Return ``(id column, text column)`` of ``data_df``, chosen by column name.

    The id column is the first whose name contains "id"; the text column is the
    first whose name contains "review" or "categor" (all case-insensitive).
    """
    cols = list(data_df.columns)
    id_col = [col for col in cols if "id" in col.lower()][0]
    forwhat_col = [col for col in cols if "review" in col.lower() or "categor" in col.lower()][0]
    return id_col, forwhat_col


def _tfidf_pair_ids_with_vectors(data_df, id_col, vector_col, vectors):
    """Build a two-column frame pairing each row's id with its vector, by position."""
    return pd.DataFrame({
        id_col: pd.Series(data_df[id_col].tolist(), dtype=object),
        # list(...) yields one 1-D vector per row; each is stored as one cell.
        vector_col: pd.Series(list(vectors), dtype=object),
    })


def _tfidf_write_dictionary(file_name, feature_names):
    """Write the vocabulary to ``file_name``, one term per line."""
    with open(file_name, 'w') as file:
        for word in feature_names:
            file.write(word + '\n')


def text_to_tfidf_vector(data_df_list, max_feature_num=None, normalize=True, data_dict_seperate=False):
    """Turn the text column of each DataFrame into TF-IDF vectors.

    For every frame the id column and text column are picked by name (see
    ``_tfidf_locate_columns``). Ids and texts are both read by row position,
    so the frames may carry any index.

    Parameters
    ----------
    data_df_list : list of pandas.DataFrame
        Frames to vectorize.
    max_feature_num : int or None
        Vocabulary size limit passed to ``compute_tfidf_vectors``.
    normalize : bool
        When true each text goes through ``normalize_text`` first.
    data_dict_seperate : bool
        * ``True`` — a separate vocabulary is fitted per frame and written to
          ``<text column>_normalize_<normalize>_dictionary.txt``.
        * ``False`` — one vocabulary is fitted on the texts of all frames
          together and written to ``normalize_<normalize>_dictionary.txt``.

    Returns
    -------
    (tfidf_vectors_df_list, vocabulary)
        ``tfidf_vectors_df_list`` has one frame per input frame with the id
        column and ``<text column>_tfidf_feature_vector``. ``vocabulary`` is a
        dict ``{"<text column>_normalize_<normalize>_dictionary": feature_names}``
        when ``data_dict_seperate`` is true, otherwise the shared
        ``feature_names``.
    """
    if data_dict_seperate:

        tfidf_vectors_df_list = []
        dictionaries = {}

        for data_df in data_df_list:
            id_col, forwhat_col = _tfidf_locate_columns(data_df)

            text_list = data_df[forwhat_col].values.tolist()
            if normalize:
                text_list = [normalize_text(text) for text in text_list]

            tfidf_vectors, feature_names = compute_tfidf_vectors(text_list, max_feature_num)

            tfidf_vectors_df_list.append(
                _tfidf_pair_ids_with_vectors(
                    data_df, id_col, f"{forwhat_col}_tfidf_feature_vector", tfidf_vectors
                )
            )

            dictionaries[f'{forwhat_col}_normalize_{normalize}_dictionary'] = feature_names
            _tfidf_write_dictionary(f'{forwhat_col}_normalize_{normalize}_dictionary.txt', feature_names)

        return tfidf_vectors_df_list, dictionaries

    # Shared vocabulary: fit once on the concatenated texts of every frame.
    located_columns = [_tfidf_locate_columns(data_df) for data_df in data_df_list]

    text_list = []
    for data_df, (_, forwhat_col) in zip(data_df_list, located_columns):
        text_list += data_df[forwhat_col].values.tolist()

    if normalize:
        text_list = [normalize_text(text) for text in text_list]

    tfidf_vectors, feature_names = compute_tfidf_vectors(text_list, max_feature_num)

    # Hand each frame back its own slice of the stacked vectors, in input order.
    tfidf_vectors_df_list = []
    start_index = 0
    for data_df, (id_col, forwhat_col) in zip(data_df_list, located_columns):
        end_index = start_index + len(data_df)
        tfidf_vectors_df_list.append(
            _tfidf_pair_ids_with_vectors(
                data_df,
                id_col,
                f"{forwhat_col}_tfidf_feature_vector",
                tfidf_vectors[start_index:end_index],
            )
        )
        start_index = end_index

    _tfidf_write_dictionary(f'normalize_{normalize}_dictionary.txt', feature_names)

    return tfidf_vectors_df_list, feature_names