## Basic Setting & Installation

In [1]:
from google.colab import drive
from google.colab import userdata
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install tiktoken

## Data Load

In [8]:
import pandas as pd
import os

base_path = "/content/drive/MyDrive/Profect SAE/test_data"
input_file = os.path.join(base_path, "test_page_data.csv")
output_file = os.path.join(base_path, "test_page_data_preprocessed.csv")

if os.path.exists(input_file):
  print("input: good to go")
else:
  print("input: not good")

if os.path.exists(output_file):
  print("output: good to go")
else:
  print("output: not good")


input: good to go
output: not good


## Data Preprocessing

In [9]:
df = pd.read_csv(input_file)

### Text Cleaning

In [10]:
import re

def text_cleaning(text):
  if isinstance(text, str):
    # lower case
    text = text.lower()
    # empty space removal
    text = text.strip()
    # multiple empty space removal
    text = re.sub(r'\s+', ' ', text)
    # special character removal
    text = re.sub(r'[^\w\s.,!?$%&@-]', '', text)
  return text

In [20]:
# Text Cleaning
df['cleaned_innerText'] = df['innerText'].apply(text_cleaning)

# Other preprocessing tactics....

# Saving
df.to_csv(output_file, index=False)

## Test

In [21]:
import tiktoken

def cleaning_result_comparison(df):
  tokenizer = tiktoken.get_encoding("cl100k_base")

  avg_legnth_decrease = 0
  avg_token_decrease = 0

  for index, row in df.iterrows():
    before = str(row['innerText'])
    after = str(row['cleaned_innerText'])

    before_length = len(before)
    after_length = len(after)

    before_tokens = len(tokenizer.encode(before))
    after_tokens = len(tokenizer.encode(after))

    avg_legnth_decrease += (before_length - after_length)
    avg_token_decrease += (before_tokens - after_tokens)

    print(f"Row {index + 1}")
    print(f" - Original Text Length: {before_length}, Cleaned Text Length: {after_length}")
    print(f" - Original Text Tokens: {before_tokens}, Cleaned Text Tokens: {after_tokens}")
    print("---")

  print("")
  print("Result")
  print(f"Number of length decreased after cleaning: {avg_legnth_decrease/len(df)}")
  print(f"Number of tokens decreased after cleaning: {avg_token_decrease/len(df)}")

In [19]:
cleaning_result_comparison(df)

Row 1
 - Original Text Length: 4383, Cleaned Text Length: 4208
 - Original Text Tokens: 4674, Cleaned Text Tokens: 4471
---
Row 2
 - Original Text Length: 7123, Cleaned Text Length: 6635
 - Original Text Tokens: 7423, Cleaned Text Tokens: 6926
---
Row 3
 - Original Text Length: 2215, Cleaned Text Length: 2080
 - Original Text Tokens: 2095, Cleaned Text Tokens: 1864
---
Row 4
 - Original Text Length: 1012, Cleaned Text Length: 932
 - Original Text Tokens: 945, Cleaned Text Tokens: 838
---
Row 5
 - Original Text Length: 930, Cleaned Text Length: 882
 - Original Text Tokens: 888, Cleaned Text Tokens: 796
---

Result
Number of length decreased after cleaning: 185.2
Number of tokens decreased after cleaning: 226.0
