In [2]:
## features
# - Description, MISC and Category
# - Sentiment analysis of user reviews
import pandas as pd
import json

# assuming your JSON is loaded into a variable called `data`
data = json.load(open("datasets\data_for_gpt-oss.json"))

df = pd.json_normalize(data, sep='_')

# Convert review_time (ms) to readable datetime
df['review_time'] = pd.to_datetime(df['review_time'], unit='ms')

print(df.head())
print(df.columns)

df = df[['meta_name', 'meta_address', 'meta_gmap_id', 'meta_description', 'meta_category', 'meta_avg_rating',
       'meta_num_of_reviews','review_user_id',
       'review_name', 'review_rating', 'review_text',
       'review_pics', 'review_resp', 'review_gmap_id',
       'meta_MISC_Service options', 'meta_MISC_Accessibility',
       'meta_MISC_Amenities', 'meta_MISC_Planning', 'review_resp_time',
       'review_resp_text', 'meta_MISC_Offerings', 'meta_MISC_Payments',
       'meta_MISC_From the business', 'meta_MISC_Health & safety',
       'meta_MISC_Highlights', 'meta_MISC_Popular for',
       'meta_MISC_Dining options', 'meta_MISC_Atmosphere', 'meta_MISC_Crowd',
       'meta_MISC_Lodging options', 'meta_MISC_Health and safety',
       'meta_MISC_Recycling']]

print(df.head())
print(df.columns)
print(df.info())

# # Select all MISC columns
# misc_cols = [col for col in df.columns if col.startswith("meta_MISC_")]

# # Replace NaN with "No" and flatten lists to comma-separated strings
# for col in misc_cols:
#     df[col] = df[col].fillna("No").apply(
#         lambda x: ", ".join(x) if isinstance(x, list) else x
#     )


misc_cols = [col for col in df.columns if col.startswith("meta_MISC_")]
# for col in misc_cols:
#     df[col] = df[col].fillna("").apply(
#         lambda x: ", ".join([c.strip() for c in x]) if isinstance(x, list) else str(x).strip()
#     )

# # When combining, also strip the final result
# df['business_features'] = (
#     df['meta_description'].fillna("").str.strip() + ", " +
#     df['meta_category'] + ", " +
#     df[misc_cols].apply(lambda row: " ".join(row.values.astype(str)), axis=1)
# ).str.strip()

# print(df['business_features'].unique())

def build_features(row):
    parts = []
    
    # Description
    if pd.notna(row["meta_description"]) and str(row["meta_description"]).strip() != "":
        parts.append(str(row["meta_description"]).strip())
    
    # Category (can be list)
    cat = row["meta_category"]
    if isinstance(cat, list):
        cat_str = ", ".join([str(x).strip() for x in cat if str(x).strip() not in ("", "nan")])
        if cat_str:
            parts.append(cat_str)
    elif pd.notna(cat) and str(cat).strip() not in ("", "nan"):
        parts.append(str(cat).strip())
    
    # MISC fields
    misc_texts = []
    for v in row[misc_cols]:
        if isinstance(v, list):
            misc_texts.append(", ".join([str(x).strip() for x in v if str(x).strip() not in ("", "nan")]))
        elif pd.notna(v) and str(v).strip() not in ("", "nan"):
            misc_texts.append(str(v).strip())
    
    if misc_texts:
        parts.append(" ".join(misc_texts))
    
    # Join all non-empty parts with ", "
    return ", ".join(parts)

# Apply to build business_features
df["business_features"] = df.apply(build_features, axis=1)




  data = json.load(open("datasets\data_for_gpt-oss.json"))


                     meta_name  \
0  Bear Creek Cabins & RV Park   
1  Bear Creek Cabins & RV Park   
2  Bear Creek Cabins & RV Park   
3  Bear Creek Cabins & RV Park   
4  Bear Creek Cabins & RV Park   

                                        meta_address  \
0  Bear Creek Cabins & RV Park, 3181 Richardson H...   
1  Bear Creek Cabins & RV Park, 3181 Richardson H...   
2  Bear Creek Cabins & RV Park, 3181 Richardson H...   
3  Bear Creek Cabins & RV Park, 3181 Richardson H...   
4  Bear Creek Cabins & RV Park, 3181 Richardson H...   

                            meta_gmap_id meta_description  meta_latitude  \
0  0x56b646ed2220b77f:0xd8975e316de80952             None      61.100644   
1  0x56b646ed2220b77f:0xd8975e316de80952             None      61.100644   
2  0x56b646ed2220b77f:0xd8975e316de80952             None      61.100644   
3  0x56b646ed2220b77f:0xd8975e316de80952             None      61.100644   
4  0x56b646ed2220b77f:0xd8975e316de80952             None      61.100644   

 

In [8]:
from transformers import pipeline

# Initialize sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]



In [9]:
def get_sentiment_label(text):
    if not isinstance(text, str) or text.strip() == "":
        return "NEUTRAL"
    result = sentiment_pipeline(text[:512])[0]  # truncate to 512 chars for efficiency
    # result['label'] can be e.g., 'POSITIVE', 'NEGATIVE', 'NEUTRAL' (depends on model)
    return result['label'].upper()


In [10]:
df['review_sentiment'] = df['review_text'].apply(get_sentiment_label)
print(df[['review_text', 'review_sentiment']].head(10))

                                         review_text review_sentiment
0  We always stay here when in Valdez for silver ...         POSITIVE
1  Great campground for the price. Nice hot unlim...         POSITIVE
2  We tent camped here for 2 nights while explori...         NEGATIVE
3  This place is just a few miles outside Valdez,...         NEGATIVE
4  Probably the nicest and cleanest campground we...         POSITIVE
5  Great, slept like a bear. Clean, convenient, c...         POSITIVE
6  It is always a treat to visit this rain or shi...         POSITIVE
7  Only 3 booths with mainly vegetables. More exp...         NEGATIVE
8  Not a lot going on here since it moved from do...         NEGATIVE
9                                      It's a market         POSITIVE


In [11]:
def rating_to_sentiment(rating):
    if rating <= 2:
        return "NEGATIVE"
    elif rating == 3:
        return "NEUTRAL"
    elif rating >= 4:
        return "POSITIVE"
    else:
        return "NEUTRAL"

df['rating_sentiment'] = df['review_rating'].apply(rating_to_sentiment)

In [12]:
# Example: flag matches / mismatches
df['sentiment_match'] = df['review_sentiment'] == df['rating_sentiment']

# Summary statistics
match_rate = df['sentiment_match'].mean()
print(f"Percentage of review sentiments matching user ratings: {match_rate*100:.2f}%")

Percentage of review sentiments matching user ratings: 86.34%


In [13]:
df.to_csv("datasets/processed_reviews_with_sentiment.csv", index=False)

In [15]:
# Convert to JSON file
df.to_json("datasets/processed_reviews_with_sentiment.json", 
           orient="records",  # list of dicts
           lines=False,       # False → entire file is one JSON array
           force_ascii=False) # keep non-ASCII characters

# # Example: to get JSON string in memory
# json_str = df.to_json(orient="records", force_ascii=False)
