In [7]:
from pathlib import Path
import nbformat

def load_notebook(notebook_path):
    """Execute every code cell of a notebook inside this module's globals.

    The notebook at ``notebook_path`` is parsed as nbformat version 4, the
    sources of all cells whose type is ``'code'`` are concatenated with
    newlines, and the resulting text is run with ``exec`` against
    ``globals()``, so whatever those cells define becomes available here.

    Args:
        notebook_path: Path (str or path-like) of the ``.ipynb`` file to load.

    Returns:
        None. The effect is the names the executed cells bind in ``globals()``.
    """
    with open(notebook_path, 'r', encoding='utf-8') as handle:
        notebook = nbformat.read(handle, as_version=4)

    code_sources = []
    for cell in notebook.cells:
        if cell.cell_type == 'code':
            code_sources.append(cell.source)

    # NOTE(review): exec runs the notebook's code verbatim -- only point this
    # at trusted notebooks. The cells are executed as one plain-Python script,
    # so a syntax error in any single cell prevents all of them from running.
    combined_source = '\n'.join(code_sources)
    exec(combined_source, globals())

# Bring the functions and variables defined in the shared utility notebooks
# into this notebook's namespace.

parent_directory = Path('../../../../')
data_preprocessing_utils_path = parent_directory.joinpath('data_preprocessing_utils.ipynb')
experiment_related_utils_path = parent_directory.joinpath('experiment_related_utils.ipynb')
# Load order is kept as written: preprocessing helpers first, then the
# experiment helpers.
load_notebook(data_preprocessing_utils_path)
load_notebook(experiment_related_utils_path)

In [3]:
import os
import pandas as pd
import numpy as np

folder_name = '../../../original/text_format4/BERT_base_sentence_embedding/'

# os.listdir returns entries in arbitrary order; sorting makes the file chosen
# for each split the same on every run and on every platform.
files = sorted(os.listdir(folder_name))


def _read_split_csv(folder, candidates, split_keyword):
    """Read the CSV for one dataset split.

    Picks the first name in ``candidates`` that contains both
    ``split_keyword`` and the substring ``'set'`` and loads it from ``folder``
    with ``pd.read_csv``.

    Args:
        folder: Directory that holds the split files.
        candidates: File names found in ``folder`` (already sorted by the caller).
        split_keyword: Substring identifying the split, e.g. ``'train'``.

    Returns:
        The DataFrame read from the matching CSV file.

    Raises:
        FileNotFoundError: If no candidate name matches, instead of the
            uninformative ``IndexError`` an empty ``[...][0]`` lookup gives.
    """
    matches = [name for name in candidates if split_keyword in name and 'set' in name]
    if not matches:
        raise FileNotFoundError(
            f"No file containing '{split_keyword}' and 'set' found in {folder!r}"
        )
    return pd.read_csv(os.path.join(folder, matches[0]))


train_df = _read_split_csv(folder_name, files, 'train')
print(train_df.shape)
valid_df = _read_split_csv(folder_name, files, 'valid')
print(valid_df.shape)
test_df = _read_split_csv(folder_name, files, 'test')
print(test_df.shape)

(11196, 48)
(1400, 48)
(1400, 48)


In [4]:
# Derive the per-user and per-business frames from the training split.
# `calcuate_reviews_num` (spelling as defined in the utility notebook) is
# presumably a per-entity review counter -- confirm against its definition.
user_df, business_df = calcuate_reviews_num(train_df)

# Summarise the "reviews_num" column of each frame (user first, then business).
user_distribution, business_distribution = (
    calculate_distribution(user_df["reviews_num"]),
    calculate_distribution(business_df["reviews_num"]),
)

# Report: user summary, a blank separator, then the business summary.
print("User reviews num distribution:\n")
print_nested_dict(user_distribution)

print('\n')

print("Business reviews num distribution:\n")
print_nested_dict(business_distribution)

User reviews num distribution:

min: 1
max: 13
mean: 5.6301211771494515
std: 2.1744341079634166
mode: 5.0
percentiles:
    5th: 3.0
    10th: 3.0
    15th: 4.0
    20th: 4.0
    25th: 4.0
    30th: 4.0
    35th: 5.0
    40th: 5.0
    45th: 5.0
    50th: 5.0
    55th: 5.0
    60th: 6.0
    65th: 6.0
    70th: 6.0
    75th: 7.0
    80th: 7.0
    85th: 8.0
    90th: 9.0
    95th: 10.0
    100th: 13.0


Business reviews num distribution:

min: 1
max: 15
mean: 6.371983914209116
std: 2.7195239056128435
mode: 5.0
percentiles:
    5th: 3.0
    10th: 4.0
    15th: 4.0
    20th: 4.0
    25th: 4.0
    30th: 5.0
    35th: 5.0
    40th: 5.0
    45th: 5.0
    50th: 6.0
    55th: 6.0
    60th: 6.0
    65th: 7.0
    70th: 7.0
    75th: 8.0
    80th: 9.0
    85th: 9.0
    90th: 10.0
    95th: 12.0
    100th: 15.0


In [8]:
# Filter the training and validation splits with the same thresholds
# (user: 7, business: 9, both compared with "max").
# NOTE(review): the exact meaning of "max" lives in
# return_filtered_train_test_data -- presumably "at most N reviews"; confirm.
# NOTE(review): the recorded output of this cell shows a pandas
# SettingWithCopyWarning, presumably raised inside the helper; fixing it there
# (assign via .loc or work on an explicit .copy()) would silence it.
train_data, valid_data, cal_results = return_filtered_train_test_data(
    train_df,
    valid_df,
    user_reviews_num=7,
    user_comparison="max",
    user_reviews_num_range=None,
    business_reviews_num=9,
    business_comparison="max",
    business_reviews_num_range=None,
)
print_nested_dict(cal_results)

Percent: 100 %
User threshold reviews num: 7
User reviews num comparison method: max
User threshold reviews num range: None
Business threshold reviews num: 9
Business reviews num comparison method: max
Business threshold reviews num range: None
Generative AI model: GPT-3.5 Turbo
Sampling_method: random
Column: None


-----------------------------------------------------------


User reviews: 7 max or None
Business reviews: 9 max or None
Training data num:
Before: 11196 After: 4351 

Training data user count:
Before: 1808 After: 1395 

Training data business count:
Before: 1573 After: 1249 

Validation (Test) data num:
Before: 1400 After: 626 

Validation (Test) data user count:
Before: 944 After: 514 

Validation (Test) data business count:
Before: 918 After: 507
Train_data_num_before: 11196
Train_data_num_after: 4351
Train_data_user_count_before: 1808
Train_data_user_count_after: 1395
Train_data_business_count_before: 1573
Train_data_business_count_after: 1249
Test_data_num_before: 14

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
# Filter the training and test splits with the same thresholds
# (user: 7, business: 9, both compared with "max").
# Note that `train_data` and `cal_results` are reassigned here, replacing the
# values produced by any earlier call with the validation split.
train_data, test_data, cal_results = return_filtered_train_test_data(
    train_df,
    test_df,
    user_reviews_num=7,
    user_comparison="max",
    user_reviews_num_range=None,
    business_reviews_num=9,
    business_comparison="max",
    business_reviews_num_range=None,
)
print_nested_dict(cal_results)

Percent: 100 %
User threshold reviews num: 7
User reviews num comparison method: max
User threshold reviews num range: None
Business threshold reviews num: 9
Business reviews num comparison method: max
Business threshold reviews num range: None
Generative AI model: GPT-3.5 Turbo
Sampling_method: random
Column: None


-----------------------------------------------------------


User reviews: 7 max or None
Business reviews: 9 max or None
Training data num:
Before: 11196 After: 4351 

Training data user count:
Before: 1808 After: 1395 

Training data business count:
Before: 1573 After: 1249 

Validation (Test) data num:
Before: 1400 After: 621 

Validation (Test) data user count:
Before: 958 After: 510 

Validation (Test) data business count:
Before: 872 After: 484
Train_data_num_before: 11196
Train_data_num_after: 4351
Train_data_user_count_before: 1808
Train_data_user_count_after: 1395
Train_data_business_count_before: 1573
Train_data_business_count_after: 1249
Test_data_num_before: 14

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [10]:
# Persist the three filtered splits as CSV files in the current working
# directory, without writing the DataFrame index as a column.
for _split_frame, _csv_name in (
    (train_data, "research_training_set.csv"),
    (valid_data, "research_validation_set.csv"),
    (test_data, "research_test_set.csv"),
):
    _split_frame.to_csv(_csv_name, index=False)