# Clean and Explore

**Setup & Import**

In [1]:
import sys
current_dir = "/Users/josephtolsma/Documents/dev/yelp_rag"
sys.path.insert(0,current_dir)

In [2]:
from src.config import DATA_DIR_SAMP,COL_TEXT,MIN_REVIEW_CHARS,CHUNK_CHARS,OVERLAP_CHARS,MIN_CHUNK_CHARS
import pandas as pd
import re
import os
import unicodedata

In [3]:
pd.set_option("display.max_columns",None)
pd.set_option("display.max_colwidth",None)

In [4]:
reviews_df = pd.read_csv(os.path.join(DATA_DIR_SAMP,"reviews_df.csv"))

**Cleaning Functions**

In [5]:
def clean_review_text(df):
    "for dataframe df, remove nonstandard characters and drop unusable (too short) values"

    before = len(df)
    # basic cleaning    
    df[COL_TEXT] = df[COL_TEXT].astype("string").str.strip()
    df = df.dropna(subset = [COL_TEXT])

    # remove ultra-short reviews
    df = df[df[COL_TEXT].str.len() >= MIN_REVIEW_CHARS]

    # remove nonstandard characters
    invalid_chars = {
        "\u00a0":" ",
        "\u002b":"",
        "\xa0":" ",
        "\x0b":" ",
        "“":'"',
        "’": "'",
    }
    
    for char,rep_str in invalid_chars.items():
        df[COL_TEXT] = df[COL_TEXT].str.replace(char,rep_str,regex = False)

    df[COL_TEXT] = df[COL_TEXT].str.replace(r"\s+", " ", regex=True).str.strip()
    
    print(f"{before - len(df)} reviews dropped in cleaning step.")
    return df

In [6]:
def normalize_unicode(text):
    "convert all characters to standard unicode"
    return unicodedata.normalize("NFKC",text)

In [7]:
def deduplicate_reviews(df):
    "remove duplicated review texts from the dataset"
    before = len(df)
    df = df.drop_duplicates(subset = [COL_TEXT])
    print(f"{before - len(df)} reviews dropped in deduplicating step.")
    return df

In [8]:
reviews_df = clean_review_text(reviews_df)
reviews_df[COL_TEXT] = reviews_df[COL_TEXT].apply(normalize_unicode)
reviews_df = deduplicate_reviews(reviews_df)

0 reviews dropped in cleaning step.
3 reviews dropped in deduplicating step.


**Chunking Functions**

In [9]:
test_review = reviews_df[reviews_df[COL_TEXT].str.len() > 1100][COL_TEXT].sample(1,random_state=42).values[0]

In [15]:
chunk_dict = {}
char_idx = 0
chunk_idx = 0
while char_idx < len(test_review):
    upcoming_chunk_length = len(test_review) - char_idx + OVERLAP_CHARS
    if len(test_review) <= CHUNK_CHARS:
        chunk_dict.update(
            {"_0":test_review}
            )
    elif char_idx == 0:
        chunk_dict.update(
            {"_"+str(chunk_idx):test_review[0:CHUNK_CHARS]}
            )
    elif upcoming_chunk_length > MIN_CHUNK_CHARS:
        chunk_dict.update(
            {"_"+str(chunk_idx):test_review[(char_idx - OVERLAP_CHARS):(char_idx+CHUNK_CHARS-OVERLAP_CHARS)]}
            )
    else:
        print(f"Chunk dropped because it was smaller than minimum size of {MIN_CHUNK_CHARS} characters.")
    chunk_idx +=1
    char_idx+=(CHUNK_CHARS - OVERLAP_CHARS)
        
    
    

In [16]:
df = pd.DataFrame(chunk_dict,index = ["chunk"]).T.reset_index(names = ["chunk_id"])

In [12]:
# reviews_df has restaurant_id, restaurant_name, text

# Step 1: ensure review_id exists
# Step 2: set chunk params
# Step 3: define chunk_text(text) -> list[str]
# Step 4: iterate rows and expand
# Step 5: add chunk_index + chunk_id
# Step 6: sanity checks
# Step 7: save chunks_df


In [14]:
df

Unnamed: 0,chunk_id,chunk
0,_0,"Welcome to the first of 29 reviews I'm writing in February as part of Yelp Tucson's Spread The Love Review Challenge 2012. (read more here http://www.yelp.com/topic/tucson-2012-yelpolutions-challenge-february) Color me impressed Tanque Verde & Sabino Canyon. Outside of having one of the most craptastic intersections in all of Tucson, your instantly gaining points with me for this little gem of an Italian restaurant! At this point I've only had one dish, but it's incredible... smoked salmon and greens, tossed with olives, parmesan, feta and bow-tie pasta. If I'm being completely honest (which I'm going to be) the pasta doesn't do anything for me except make me feel like I am 5, so I ditch that. I also don't cheese it up because I'm lactarded and don't want to suffer the consequences of devouring dairy. With that said, this salad is topped with some of the best smoked fish I have ever eaten. And I usually make fun of people who say things like ""the best thing I ever put in my mouth"" and"
1,_1,"'m going to be) the pasta doesn't do anything for me except make me feel like I am 5, so I ditch that. I also don't cheese it up because I'm lactarded and don't want to suffer the consequences of devouring dairy. With that said, this salad is topped with some of the best smoked fish I have ever eaten. And I usually make fun of people who say things like ""the best thing I ever put in my mouth"" and ""so good I could die,"" so trust me when I say it's good. I'm told they smoke it there and I've never come up for air long enough while mauling the thing to ask what that means. I'll find out though and tell you what I come up with. I don't LOVE the seating style (see I just used the word love in case you were looking for it as part of February's review challenge), it just doesn't feel as warm and friendly as I'd expect from an Osteria, not that I know what an Osteria is. Gus... the G-U-S in Gusto tells me that it's sort of like a wine bar with food. He's a pretty cool guy, owner and overseer o"
2,_2,"friendly as I'd expect from an Osteria, not that I know what an Osteria is. Gus... the G-U-S in Gusto tells me that it's sort of like a wine bar with food. He's a pretty cool guy, owner and overseer of operations. OOO[1]. Check them out, they're super tucked in the Safeway shopping center, and worth the drive and that stupid intersection for the smoked salmon. [1] Completely made up term, but I think if I owned a company I'd rather be an OOO than a CEO or CFO. How cool would it be to hear people say, that's Corey, he's our OOO as I walked down the hall! AND if I happened to be a magician and an OOO people could say, Corey, OOO he's magic."
