# Clean and Chunk

**Setup & Import**

In [None]:
import sys
# Absolute path of the project checkout (machine-specific). It is placed at
# the front of the import search path so that `src.*` modules resolve first.
current_dir = "/Users/josephtolsma/Documents/dev/yelp_rag"
sys.path[:0] = [current_dir]

In [None]:
from src.config import DATA_DIR_SAMP,DATA_DIR_PROC,COL_TEXT, \
                       MIN_REVIEW_CHARS, CHUNK_CHARS,OVERLAP_CHARS, \
                       MIN_CHUNK_CHARS
import pandas as pd
import re
import os
import unicodedata

In [None]:
# Lift pandas' display limits: show every column and never truncate cell text.
for display_option in ("display.max_columns", "display.max_colwidth"):
    pd.set_option(display_option, None)

In [None]:
# Load the reviews CSV from the sample data directory.
reviews_csv_path = os.path.join(DATA_DIR_SAMP, "reviews_df.csv")
reviews_df = pd.read_csv(reviews_csv_path)

**Cleaning Functions**

In [None]:
def clean_review_text(df, text_col=None, min_chars=None):
    """Normalise the review text column and drop rows that are too short.

    The text is cast to pandas' ``string`` dtype, a handful of nonstandard
    characters are replaced, runs of whitespace are collapsed to a single
    space and the result is stripped.  Rows whose cleaned text is missing or
    shorter than ``min_chars`` are removed.  The input frame is not modified;
    a new frame is returned.

    Args:
        df: DataFrame holding the review text column.
        text_col: Name of the text column; defaults to ``COL_TEXT``.
        min_chars: Minimum number of characters a cleaned review must have to
            be kept; defaults to ``MIN_REVIEW_CHARS``.

    Returns:
        A new DataFrame with the cleaned text column and short/missing
        reviews removed.  The original index labels are preserved.
    """
    text_col = COL_TEXT if text_col is None else text_col
    min_chars = MIN_REVIEW_CHARS if min_chars is None else min_chars

    before = len(df)

    # Single-pass character replacement table.
    char_map = str.maketrans({
        "\u00a0": " ",   # non-breaking space
        "\x0b": " ",     # vertical tab
        # NOTE(review): U+002B is the ASCII plus sign, so "5+ stars" becomes
        # "5 stars".  Kept as in the original -- confirm this is intended
        # (a zero-width space, U+200B, may have been meant).
        "\u002b": "",
        "\u201c": '"',   # left curly double quote
        "\u201d": '"',   # right curly double quote
        "\u2018": "'",   # left curly single quote
        "\u2019": "'",   # right curly single quote
    })

    # Build the cleaned text as a separate Series so the caller's frame is
    # never written to.
    text = (
        df[text_col]
        .astype("string")
        .str.translate(char_map)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

    # Filter on the *cleaned* length: collapsing whitespace can shrink a
    # review below the minimum.  Missing values compare as <NA> -> dropped.
    keep = (text.str.len() >= min_chars).fillna(False).astype(bool)

    cleaned = df.loc[keep].copy()
    cleaned[text_col] = text[keep]

    print(f"{before - len(cleaned)} reviews dropped in cleaning step.")
    return cleaned

In [None]:
def normalize_unicode(text):
    """Return ``text`` converted to Unicode normalization form NFKC."""
    target_form = "NFKC"
    normalized = unicodedata.normalize(target_form, text)
    return normalized

In [None]:
def deduplicate_reviews(df):
    """Keep only the first row for each distinct review text.

    Prints how many rows were removed and returns the filtered frame.
    """
    n_before = len(df)
    # True for every row whose text already appeared in an earlier row.
    is_repeat = df.duplicated(subset=[COL_TEXT])
    unique_df = df[~is_repeat]
    n_dropped = n_before - len(unique_df)
    print(f"{n_dropped} reviews dropped in deduplicating step.")
    return unique_df

In [None]:
# Cleaning pipeline: character/whitespace cleanup, then Unicode
# normalization of each review, then removal of repeated review texts.
cleaned_reviews = clean_review_text(reviews_df)
cleaned_reviews[COL_TEXT] = cleaned_reviews[COL_TEXT].apply(normalize_unicode)
reviews_df = deduplicate_reviews(cleaned_reviews)

**Chunking Functions**

In [None]:
# test_review = reviews_df[reviews_df[COL_TEXT].str.len() > 1100][COL_TEXT].sample(1,random_state=42).values[0]

In [None]:
# chunk_dict = {}
# char_idx = 0
# chunk_idx = 0
# while char_idx < len(test_review):
#     upcoming_chunk_length = len(test_review) - char_idx + OVERLAP_CHARS
#     if len(test_review) <= CHUNK_CHARS:
#         chunk_dict.update(
#             {"_0":test_review}
#             )
#     elif char_idx == 0:
#         chunk_dict.update(
#             {"_"+str(chunk_idx):test_review[0:CHUNK_CHARS]}
#             )
#     elif upcoming_chunk_length > MIN_CHUNK_CHARS:
#         chunk_dict.update(
#             {"_"+str(chunk_idx):test_review[(char_idx - OVERLAP_CHARS):(char_idx+CHUNK_CHARS-OVERLAP_CHARS)]}
#             )
#     else:
#         print(f"Chunk dropped because it was smaller than minimum size of {MIN_CHUNK_CHARS} characters.")
#     chunk_idx +=1
#     char_idx+=(CHUNK_CHARS - OVERLAP_CHARS)

# df = pd.DataFrame(chunk_dict,index = ["chunk"]).T.reset_index(names = ["chunk_id"])
        
    
    

In [None]:
def divide_reviews_into_chunks(text, chunk_chars=None, overlap_chars=None,
                               min_chunk_chars=None):
    """Split one review into overlapping, fixed-size character chunks.

    A window of ``chunk_chars`` characters slides over ``text`` in steps of
    ``chunk_chars - overlap_chars``, so consecutive chunks share exactly
    ``overlap_chars`` characters.  A trailing chunk shorter than
    ``min_chunk_chars`` is dropped.  The first chunk is always kept, so a
    review that fits in a single window is returned whole.

    Args:
        text: Review text to split.
        chunk_chars: Maximum characters per chunk; defaults to
            ``CHUNK_CHARS``.
        overlap_chars: Characters shared by consecutive chunks; defaults to
            ``OVERLAP_CHARS``.
        min_chunk_chars: Minimum length for a trailing chunk to be kept;
            defaults to ``MIN_CHUNK_CHARS``.

    Returns:
        DataFrame with one row per chunk and the columns ``chunk_index``
        (0-based position of the chunk in the review) and ``chunk`` (the
        chunk text).  Empty ``text`` yields an empty frame with the same
        columns.

    Raises:
        ValueError: If ``overlap_chars`` is not smaller than ``chunk_chars``
            (the window would never advance).
    """
    chunk_chars = CHUNK_CHARS if chunk_chars is None else chunk_chars
    overlap_chars = OVERLAP_CHARS if overlap_chars is None else overlap_chars
    min_chunk_chars = (
        MIN_CHUNK_CHARS if min_chunk_chars is None else min_chunk_chars
    )

    step = chunk_chars - overlap_chars
    if step <= 0:
        raise ValueError(
            "overlap_chars must be smaller than chunk_chars "
            f"(got chunk_chars={chunk_chars}, overlap_chars={overlap_chars})"
        )

    chunks = []
    for start in range(0, len(text), step):
        piece = text[start:start + chunk_chars]
        # Only the final window can be short; drop it if it is below the
        # minimum, but never drop the first window of a review.
        if start == 0 or len(piece) >= min_chunk_chars:
            chunks.append(piece)
        # Once a window reaches the end of the text, any further window
        # would contain nothing but already-covered overlap.
        if start + chunk_chars >= len(text):
            break

    # Build the frame only after *all* chunks are collected.
    return pd.DataFrame(
        {"chunk_index": range(len(chunks)), "chunk": chunks}
    ).astype({"chunk_index": "int64"})

In [None]:
def generate_chunk_df(reviews_df):
    """Build one table of text chunks for every review in ``reviews_df``.

    Each review's text is split with ``divide_reviews_into_chunks`` and the
    resulting rows are annotated with metadata copied from the review.

    Args:
        reviews_df: DataFrame whose rows expose the attributes ``text``,
            ``business_id``, ``review_id``, ``name``, ``stars_restaurant``
            and ``date``.

    Returns:
        DataFrame with a fresh 0..n-1 index and the columns ``chunk_index``,
        ``chunk``, ``business_id``, ``review_id``, ``restaurant_name``,
        ``chunk_id`` (``"<review_id>_<chunk_index>"``, unique per chunk),
        ``n_chars``, ``stars`` and ``date``.  An empty DataFrame is returned
        when no chunks are produced.
    """
    frames = []
    for row in reviews_df.itertuples():
        # NOTE(review): the text column is read as `row.text`; this presumes
        # COL_TEXT == "text" -- confirm against src.config.
        chunks = divide_reviews_into_chunks(row.text)
        if chunks is None or chunks.empty:
            # Nothing to annotate for this review.
            continue
        frames.append(
            chunks.assign(
                business_id=row.business_id,
                review_id=row.review_id,
                restaurant_name=row.name,
                # Per-row id: each chunk gets its own index suffix instead
                # of reusing the first chunk's index for the whole review.
                chunk_id=f"{row.review_id}_" + chunks["chunk_index"].astype(str),
                n_chars=chunks["chunk"].str.len(),
                stars=row.stars_restaurant,
                date=row.date,
            )
        )

    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end; concatenating inside the loop re-copies
    # the accumulated frame on every iteration.
    return pd.concat(frames, axis=0, ignore_index=True)

In [13]:
# Turn the cleaned reviews into the chunk table.
review_chunks = generate_chunk_df(reviews_df=reviews_df)

In [14]:
# Write the chunk table to the processed-data directory as Parquet.
chunks_parquet_path = os.path.join(DATA_DIR_PROC, "review_chunks.parquet")
review_chunks.to_parquet(chunks_parquet_path, engine="pyarrow")