# Full Data Preparation Pipeline

This notebook consolidates all preprocessing steps used to create `review_business_5up_5aspect_3sentiment_vectorized_clean.json`.

In [None]:
import json
import re
from collections import defaultdict

import pandas as pd
from tqdm import tqdm
from transformers import DebertaV2Tokenizer

In [None]:
# --- Step 1: business.json preprocessing ---
# Load the raw business records (one JSON object per line).
df_B = pd.read_json("data/raw/yelp_academic_dataset_business.json", lines=True)

# Work on a copy so df_B keeps the untouched raw records, and discard the
# columns that are not used further down in this cell.
drop_cols = ['postal_code', 'latitude', 'longitude', 'attributes', 'hours']
business_df = df_B.copy().drop(columns=drop_cols)

# Collapse every city value that contains "philadelphia" (case-insensitive)
# into the single label "Philadelphia"; missing cities are left alone.
is_philadelphia = business_df['city'].str.lower().str.contains("philadelphia", na=False)
business_df.loc[is_philadelphia, 'city'] = "Philadelphia"


def load_categories(fp):
    """Return the non-blank lines of the text file ``fp`` as a set.

    Each line is stripped of surrounding whitespace and lower-cased before it
    is added, so the result is suitable for case-insensitive matching.
    """
    names = set()
    with open(fp, 'r', encoding='utf-8') as f:
        for raw_line in f:
            if raw_line.strip():
                names.add(raw_line.strip().lower())
    return names


food_categories = load_categories('data/raw/food.txt')
restaurant_categories = load_categories('data/raw/restaurant.txt')
target_categories = food_categories | restaurant_categories


def category_match(row):
    """Tell whether a comma-separated category string hits a target category.

    ``row`` is a single value of the ``categories`` column. Non-string values
    (for example missing entries) never match.
    """
    if not isinstance(row, str):
        return False
    biz_categories = {cat.strip().lower() for cat in row.split(',')}
    return not biz_categories.isdisjoint(target_categories)


# Keep only businesses with at least one food/restaurant category.
business_food_df = business_df[business_df['categories'].apply(category_match)]

# Narrow to the state with the most such businesses, then to Philadelphia.
# NOTE(review): the `_pa_` naming suggests the top state is expected to be PA,
# but it is computed from the data rather than fixed - confirm.
top_state = business_food_df['state'].value_counts().idxmax()
business_pa_df = business_food_df[business_food_df['state'] == top_state]
business_paph_df = business_pa_df[business_pa_df['city'] == "Philadelphia"]


def _is_blank(x):
    """True for a missing value or a string that is empty after stripping."""
    return pd.isna(x) or (isinstance(x, str) and x.strip() == "")


# Drop every row that has a missing or blank value in any remaining column.
mask = business_paph_df.apply(lambda col: col.map(_is_blank)).any(axis=1)
business_paph_df_2 = business_paph_df[~mask].reset_index(drop=True)

business_paph_df_2.to_json(
    "data/output/business.json", orient="records", lines=True, force_ascii=False
)
print(f"✅ 총 {len(business_paph_df_2)}개 항목이 'data/output/business.json'에 저장되었습니다.")

In [None]:
# --- Step 1: review.json preprocessing ---
chunk_size = 100000

# IDs of the businesses kept by the business preprocessing cell.
business_ids = set(business_paph_df_2['business_id'])

# Filter every chunk while it is being read instead of concatenating the whole
# raw file first: the resulting rows are the same, but only the reviews of the
# selected businesses are ever held in memory at once. The `with` block closes
# the underlying file handle once all chunks have been consumed.
with pd.read_json(
    "data/raw/yelp_academic_dataset_review.json", lines=True, chunksize=chunk_size
) as chunks:
    df_review = pd.concat(
        chunk[chunk['business_id'].isin(business_ids)] for chunk in chunks
    )

df_review = df_review.drop(columns=['funny', 'cool'])

# Register `progress_apply` on pandas objects so tokenization shows a bar.
tqdm.pandas()
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")

# Number of tokens the DeBERTa-v3 tokenizer produces for each review text.
df_review['token_length'] = df_review['text'].progress_apply(
    lambda x: len(tokenizer.tokenize(x))
)

df_review.to_json("data/output/review.json", orient="records", lines=True, force_ascii=False)

In [None]:
# --- Step 1: user.json preprocessing ---
drop_columns = [
    'yelping_since', 'funny', 'cool', 'elite', 'friends', 'fans',
    'compliment_hot', 'compliment_more', 'compliment_profile',
    'compliment_cute', 'compliment_list', 'compliment_note',
    'compliment_plain', 'compliment_cool', 'compliment_funny',
    'compliment_writer', 'compliment_photos',
]

# Users who wrote at least 5 of the reviews kept in df_review.
review_counts = df_review['user_id'].value_counts()
user_ids_5plus = review_counts[review_counts >= 5].index

# Filter and trim every chunk while it is being read, so the full raw user
# table (including the dropped columns) is never materialised in memory. The
# selected rows and columns are the same as filtering after a full concat.
# The `with` block closes the file handle once all chunks are consumed.
with pd.read_json(
    "data/raw/yelp_academic_dataset_user.json", lines=True, chunksize=100000
) as chunks:
    df_user = pd.concat(
        chunk[chunk['user_id'].isin(user_ids_5plus)].drop(columns=drop_columns)
        for chunk in chunks
    )

df_user.to_json("data/output/user.json", orient="records", lines=True, force_ascii=False)

In [None]:
# --- Step 2: merge review, user and business ---
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is parsed with ``json.loads``; blank lines (such as a
    trailing empty line) are skipped instead of raising a decode error.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


reviews = load_jsonl("data/output/review.json")
users = load_jsonl("data/output/user.json")
businesses = load_jsonl("data/output/business.json")

# Lookup tables keyed by id for constant-time joins.
user_dict = {u['user_id']: u for u in users}
business_dict = {b['business_id']: b for b in businesses}

merged_data = []
for r in tqdm(reviews, desc='병합 중'):
    uid = r['user_id']
    bid = r['business_id']
    # Inner join: a review is kept only if both its user and business exist.
    if uid in user_dict and bid in business_dict:
        # Review fields keep their original names; user and business fields
        # are added with a 'user_' / 'business_' prefix.
        m = r.copy()
        for k, v in user_dict[uid].items():
            m[f'user_{k}'] = v
        for k, v in business_dict[bid].items():
            m[f'business_{k}'] = v
        merged_data.append(m)

with open("merged_dataset.json", "w", encoding='utf-8') as f:
    for row in merged_data:
        json.dump(row, f, ensure_ascii=False)
        # One record per line: without the newline every record would be
        # fused onto a single line and could not be read back line by line.
        f.write("\n")
print("✅ 병합 완료: merged_dataset.json")

In [None]:
# --- Step 3: attach IDs to ABSA results and vectorize ---
def _iter_jsonl(path):
    """Yield one parsed object per non-blank line of a JSON Lines file."""
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                yield json.loads(line)


def _write_jsonl(path, records):
    """Write ``records`` to ``path`` as JSON Lines (one object per line)."""
    with open(path, 'w', encoding='utf-8') as f:
        for obj in records:
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")


def _review_field(obj, name):
    """Return a review-level field stored as 'review_<name>' or '<name>'.

    The merged records may carry review fields with or without the 'review_'
    prefix; the prefixed key wins when both are present. Raises ``KeyError``
    if neither key exists.
    """
    prefixed = f'review_{name}'
    return obj[prefixed] if prefixed in obj else obj[name]


def sentiment_to_vector(sentiment_dict):
    """Flatten per-aspect sentiment scores into a 15-element list.

    The output is ordered aspect by aspect (food, service, price, ambience,
    location), and within each aspect Negative, Neutral, Positive. A missing
    or null aspect, 'scores' mapping, or polarity contributes 0.0.
    """
    aspects = ['food', 'service', 'price', 'ambience', 'location']
    polarities = ['Negative', 'Neutral', 'Positive']
    vector = []
    for asp in aspects:
        scores = (sentiment_dict.get(asp) or {}).get('scores') or {}
        for pol in polarities:
            vector.append(scores.get(pol, 0.0))
    return vector


# filter merged dataset for users with >=5 reviews
# First pass: count reviews per user without holding the records in memory.
user_review_counts = defaultdict(int)
for obj in _iter_jsonl("merged_dataset.json"):
    user_review_counts[obj['user_id']] += 1
qualified_users = {u for u, c in user_review_counts.items() if c >= 5}

# Second pass: keep only the reviews written by qualified users.
filtered_reviews = [
    obj for obj in _iter_jsonl("merged_dataset.json")
    if obj['user_id'] in qualified_users
]
_write_jsonl("merged_dataset_5up_users_only.json", filtered_reviews)
print("✅ 필터링 완료:", len(filtered_reviews), '개 리뷰 저장 → merged_dataset_5up_users_only.json')

# review_id -> identifying fields to copy onto the ABSA results.
id_map = {}
for obj in filtered_reviews:
    rid = obj['review_id']
    id_map[rid] = {
        'user_id': obj['user_id'],
        'business_id': obj['business_id'],
        'stars': _review_field(obj, 'stars'),
        'review_useful': _review_field(obj, 'useful'),
        'review_date': _review_field(obj, 'date'),
    }

# Attach the ids to every ABSA record whose review survived the filter;
# records with an unknown review_id are dropped.
updated = []
for obj in tqdm(_iter_jsonl("review_5up_5aspect_3sentiment.json"), desc='ID 및 평점 추가 중'):
    rid = obj.get('review_id')
    if rid in id_map:
        obj.update(id_map[rid])
        updated.append(obj)
_write_jsonl("review_5up_5aspect_3sentiment_with_ids.json", updated)
print("✅ 저장 완료:", len(updated), '건 → review_5up_5aspect_3sentiment_with_ids.json')

input_file = "review_5up_5aspect_3sentiment_with_ids.json"
output_file = "review_5up_5aspect_3sentiment_vectorized_clean.json"

# Stream the records, replacing the nested sentiment with its flat vector and
# keeping only the id/rating/date fields.
with open(output_file, 'w', encoding='utf-8') as fout:
    for obj in _iter_jsonl(input_file):
        vec = sentiment_to_vector(obj.get('sentiment') or {})
        cleaned = {
            'review_id': obj.get('review_id'),
            'user_id': obj.get('user_id'),
            'business_id': obj.get('business_id'),
            'stars': obj.get('stars'),
            'review_date': obj.get('review_date'),
            'sentiment_vector': vec,
        }
        fout.write(json.dumps(cleaned, ensure_ascii=False) + "\n")
print("✅ 완료: text와 sentiment 제거 후 저장 →", output_file)

In [None]:
# --- Step 4: filter users with <5 unique businesses ---
input_file = "review_5up_5aspect_3sentiment_vectorized_clean.json"
output_file = "review_business_5up_5aspect_3sentiment_vectorized_clean.json"

# First pass: collect the distinct businesses each user has reviewed.
user_biz_ids = defaultdict(set)
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines such as a trailing newline
        obj = json.loads(line)
        user_biz_ids[obj['user_id']].add(obj['business_id'])

# Second pass: keep only records of users with at least 5 distinct businesses.
with open(input_file, 'r', encoding='utf-8') as fin, \
        open(output_file, 'w', encoding='utf-8') as fout:
    for line in fin:
        if not line.strip():
            continue
        obj = json.loads(line)
        if len(user_biz_ids[obj['user_id']]) >= 5:
            # The newline keeps the output a valid one-record-per-line file.
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
print("✅ 완료: business_id가 5개 미만인 사용자 제거 후 저장 →", output_file)