<a href="https://colab.research.google.com/github/lisa11323/CSR_yelp/blob/main/ECRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Prepare dataset

In [None]:
# Download open dataset
# https://business.yelp.com/data/resources/open-dataset/

In [None]:
from google.colab import files

# Upload files manually from local machine
uploaded = files.upload()

In [None]:
# Load the raw Yelp JSON dumps into DataFrames (this cell performs no CSV conversion)

import pandas as pd

# Load Yelp dataset JSON files
# lines=True: each file is read as newline-delimited JSON, one record per line.
# NOTE(review): each file is materialized fully in memory; the review file is presumably
# the largest — confirm it fits in the runtime's RAM.
df_review = pd.read_json("yelp_academic_dataset_review.json", lines=True)
df_user = pd.read_json("yelp_academic_dataset_user.json", lines=True)
df_biz = pd.read_json("yelp_academic_dataset_business.json", lines=True)

# 1. Sampling Dataset

In [None]:
# Categorize businesses

import re

# Category labels that define each credence-service sector.
# They are normalized (trimmed, lower-cased) further down and compared against each
# business's individual category tokens by exact membership, not substring search.
healthcare_cats = [
    'Health & Medical', 'Medical Centers', 'Doctors', 'Dentists', 'General Dentistry',
    'Cosmetic Dentists', 'Oral Surgeons', 'Endodontists', 'Orthodontists', 'Pediatric Dentists',
    'Veterinarians', 'Chiropractors', 'Physical Therapy', 'Urgent Care'
    ]

# Repair / home-and-auto maintenance sector.
repair_cats = [
    'Auto Repair', 'Heating & Air Conditioning/HVAC', 'Plumbing', 'Appliances & Repair',
    'Auto Glass Services', 'Transmission Repair', 'Water Heater Installation/Repair'
    ]

# Professional (financial / real-estate / legal-adjacent) services sector.
professional_cats = [
    'Financial Services', 'Banks & Credit Unions', 'Real Estate Services',
    'Property Management', 'Real Estate Agents', 'Notaries'
    ]

def norm_token(x):
  """Return `x` as a trimmed, lower-cased string; missing values map to ""."""
  return "" if pd.isna(x) else str(x).strip().lower()

# Normalized lookup sets, one per credence sector.
health_set = set(map(norm_token, healthcare_cats))
repair_set = set(map(norm_token, repair_cats))
prof_set = set(map(norm_token, professional_cats))

# Work on a copy and split the comma-separated category string into normalized,
# non-empty tokens (missing category strings yield an empty token list).
df_biz = df_biz.copy()
df_biz["categories_token"] = [
    [tok for tok in map(norm_token, raw.split(",")) if tok]
    for raw in df_biz["categories"].fillna("").astype(str)
]

# 0/1 sector flags: 1 when at least one token is an exact member of the sector set.
df_biz["is_healthcare"] = df_biz["categories_token"].map(lambda toks: int(not health_set.isdisjoint(toks)))
df_biz["is_repair"] = df_biz["categories_token"].map(lambda toks: int(not repair_set.isdisjoint(toks)))
df_biz["is_professional"] = df_biz["categories_token"].map(lambda toks: int(not prof_set.isdisjoint(toks)))

# Per-sector business tables, de-duplicated on business_id.
biz_healthcare = df_biz[df_biz["is_healthcare"].eq(1)].drop_duplicates(subset="business_id").copy()
biz_repair = df_biz[df_biz["is_repair"].eq(1)].drop_duplicates(subset="business_id").copy()
biz_professional = df_biz[df_biz["is_professional"].eq(1)].drop_duplicates(subset="business_id").copy()

# A business is a credence business when it belongs to at least one sector.
mask_credence = df_biz[["is_healthcare", "is_repair", "is_professional"]].eq(1).any(axis=1)
biz_credence = df_biz[mask_credence].drop_duplicates(subset="business_id").copy()

# Keep only businesses with a five-digit postal code: anything else is blanked to NA
# and removed together with rows lacking categories or a business_id.
biz_credence['postal_code'] = biz_credence['postal_code'].astype(str).str.strip()
mask_postal = biz_credence['postal_code'].str.fullmatch(r'\d{5}')
biz_credence['postal_code'] = biz_credence['postal_code'].where(mask_postal, pd.NA)
biz_credence = biz_credence.dropna(subset=['categories', 'business_id', 'postal_code'])

print("biz_credence:", biz_credence.shape)

In [None]:
# Extract reviews of credence goods and services

# Cast the join key to str on both sides so isin() compares like with like.
df_review["business_id"] = df_review["business_id"].astype(str)
biz_credence["business_id"] = biz_credence["business_id"].astype(str)

cred_biz_ids = biz_credence["business_id"].dropna().astype(str).unique()
rev_credence = df_review[df_review["business_id"].isin(cred_biz_ids)].copy()

# Unparseable timestamps become NaT instead of raising.
rev_credence['date'] = pd.to_datetime(rev_credence['date'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

# Trim the review text while keeping missing values missing. A bare astype(str) turns
# NaN/None into the literal strings "nan"/"None", which every downstream text feature
# would then treat as a real one-word review.
text_is_missing = rev_credence['text'].isna()
rev_credence['text'] = rev_credence['text'].astype(str).str.strip()
rev_credence.loc[text_is_missing | (rev_credence['text'] == ''), 'text'] = pd.NA

print("rev_credence:", rev_credence.shape)

In [None]:
# Identify reviewers of credence goods and services

# Imported in this cell because np.nan is needed below, before the notebook's later
# `import numpy as np`; without it a fresh Restart & Run All stops here with NameError.
import numpy as np

# Cast the join key to str on both sides so isin() compares like with like.
rev_credence["user_id"] = rev_credence["user_id"].astype(str)
df_user["user_id"] = df_user["user_id"].astype(str)

cred_user_ids = rev_credence["user_id"].dropna().astype(str).unique()
user_credence = df_user[df_user["user_id"].isin(cred_user_ids)].copy()

# Unparseable timestamps become NaT and are removed by the dropna below.
user_credence['yelping_since'] = pd.to_datetime(user_credence['yelping_since'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

# astype(str) stringifies missing names, so the placeholder spellings are mapped back to NaN.
user_credence["name"] = user_credence["name"].astype(str)
user_credence["name"] = user_credence["name"].str.strip()
user_credence.loc[user_credence["name"].isin(["", "nan", "None", "NA", "N/A"]), "name"] = np.nan

# Require complete data in every column except elite and friends.
exclude_cols = ["elite", "friends"]
cols_to_check = [c for c in user_credence.columns if c not in exclude_cols]
user_credence = user_credence.dropna(subset=cols_to_check, how="any").copy()

print("user_credence", user_credence.shape)

In [None]:
# Drop reviews whose author was removed from user_credence (incomplete user data)

rev_credence["user_id"] = rev_credence["user_id"].astype(str)
user_credence["user_id"] = user_credence["user_id"].astype(str)

# Distinct ids on each side of the relationship.
rev_user_ids = pd.Index(pd.unique(rev_credence["user_id"].dropna()))
user_ids = pd.Index(pd.unique(user_credence["user_id"].dropna()))

# Ids that wrote reviews but have no surviving user row.
missing_in_user = rev_user_ids.difference(user_ids)

has_user_row = ~rev_credence["user_id"].isin(missing_in_user)
rev_credence = rev_credence.loc[has_user_row].copy()

print("rev_credence", rev_credence.shape)

In [None]:
# Add a prefix and aggregate columns

def preprocess_reviews(rev_credence, user_credence, biz_credence):
  """Attach reviewer and business attributes to every review row.

  Vote columns that exist on both the review and the user table are disambiguated
  with a `rev_` / `user_` prefix, and the business `name` / `stars` columns become
  `biz_name` / `biz_stars`, so the merged frame has no clashing column names.

  Args:
    rev_credence: review table; must contain `user_id` and `business_id`.
    user_credence: user table keyed by `user_id`.
    biz_credence: business table keyed by `business_id`.

  Returns:
    A new DataFrame with one row per review (left joins), extended with whichever
    of the expected user and business columns are present in the inputs.
  """
  reviews = rev_credence.rename(columns={
      'useful': 'rev_useful',
      'funny': 'rev_funny',
      'cool': 'rev_cool'
      })

  user_columns = [
      'user_id', 'name', 'review_count', 'yelping_since', 'useful', 'elite',
      'friends', 'fans', 'average_stars', 'compliment_hot', 'compliment_more',
      'compliment_profile', 'compliment_cute', 'compliment_list',
      'compliment_note', 'compliment_plain', 'compliment_cool',
      'compliment_funny', 'compliment_writer', 'compliment_photos',
      'funny', 'cool'
      ]
  # Select only the columns that are really there; indexing with the full wish-list
  # raises KeyError as soon as a single expected column is absent.
  user_columns_exist = [c for c in user_columns if c in user_credence.columns]
  user_data = user_credence[user_columns_exist].rename(columns={
      'useful': 'user_useful',
      'funny': 'user_funny',
      'cool': 'user_cool'
      })

  biz_columns = [
      'business_id', 'name', 'city', 'state', 'postal_code', 'stars',
      "categories", "categories_token", "is_healthcare", "is_repair", "is_professional"
      ]
  biz_columns_exist = [c for c in biz_columns if c in biz_credence.columns]
  biz_data = biz_credence[biz_columns_exist].rename(columns={
      'name': 'biz_name',
      'stars': 'biz_stars'
      })

  out = reviews.merge(user_data, on="user_id", how="left")
  out = out.merge(biz_data, on="business_id", how="left")
  return out

# Rebinds rev_credence to the merged frame (review + user + business columns).
# NOTE: not idempotent — the input name is overwritten, so re-running this cell feeds
# the already-merged frame back in; re-run the upstream cells first.
rev_credence = preprocess_reviews(rev_credence, user_credence, biz_credence)

print(rev_credence.shape)

In [None]:
# Give the standalone user and business tables the same prefixed column names
# that the merged review table uses.
user_credence.rename(
    columns=dict(useful="user_useful", funny="user_funny", cool="user_cool"),
    inplace=True,
)

biz_credence.rename(
    columns=dict(name="biz_name", stars="biz_stars"),
    inplace=True,
)

In [None]:
# Sanity check: (rows, columns) of the three working tables after sampling.
print("rev_credence:", rev_credence.shape)
print("biz_credence:", biz_credence.shape)
print("user_credence", user_credence.shape)

# 2. Relational social capital variables

In [None]:
# Compliment count
def add_compliment_num(df):
  """Add `compliment_num`, the total number of compliments a user received.

  Sums every raw `compliment_*` column after coercing it to numeric (unparseable or
  missing values count as 0). Returns a modified copy; `df` itself is not changed.
  """
  df = df.copy()
  # Columns this notebook derives share the `compliment_` prefix. Excluding them keeps
  # a re-run from adding the previous total (or the entropy score) into the new total.
  derived_cols = {'compliment_num', 'compliment_diversity'}
  compliment_cols = [
      col for col in df.columns
      if col.startswith('compliment_') and col not in derived_cols
  ]
  if compliment_cols:
    df[compliment_cols] = df[compliment_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
  df['compliment_num'] = df[compliment_cols].sum(axis=1)
  return df

user_credence = add_compliment_num(user_credence)

In [None]:
# Compliment diversity (Shannon entropy)

from scipy.stats import entropy
import numpy as np

def add_compliment_diversity(df):
  """Add `compliment_diversity`, the Shannon entropy (natural log) of compliment types.

  For each row the raw `compliment_*` counts are treated as a distribution over
  compliment types; rows with no compliments at all get a diversity of 0.0.
  Returns a modified copy; `df` itself is not changed.
  """
  df = df.copy()
  # `compliment_num` (the row total) and this function's own output share the prefix.
  # Leaving them in would add the total as an extra "type" and distort every entropy.
  derived_cols = {'compliment_num', 'compliment_diversity'}
  compliment_cols = [
      col for col in df.columns
      if col.startswith('compliment_') and col not in derived_cols
  ]
  if compliment_cols:
    df[compliment_cols] = df[compliment_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

  counts = df[compliment_cols].to_numpy(dtype=float)
  totals = counts.sum(axis=1)

  diversity = np.zeros(len(df), dtype=float)
  has_compliments = totals != 0
  if has_compliments.any():
    # entropy() normalizes each row to proportions itself; axis=1 scores all rows
    # in one vectorized call instead of a Python-level apply per row.
    diversity[has_compliments] = entropy(counts[has_compliments], base=np.e, axis=1)

  df['compliment_diversity'] = diversity
  return df

user_credence = add_compliment_diversity(user_credence)

In [None]:
# Social feedback

def add_social_feedback(df):
  """Return a copy of `df` with `social_feedback` = funny votes + cool votes (int64)."""
  out = df.copy()
  funny_votes = out["user_funny"].astype("int64")
  cool_votes = out["user_cool"].astype("int64")
  out["social_feedback"] = funny_votes + cool_votes
  return out

user_credence = add_social_feedback(user_credence)

# 3. Structural social capital variables

In [None]:
!pip install python-igraph

In [None]:
def build_user_network(df, missing_tokens=("", "none", "nan")):
  """Build an undirected friendship network from the `user_id` / `friends` columns.

  Args:
    df: user table with a `user_id` column and a comma-separated `friends` column.
    missing_tokens: strings (compared case-insensitively after trimming) that stand
      for "no friend" rather than a real id. NOTE(review): the Yelp dump is believed
      to encode an empty friend list as the literal string "None" — confirm against
      the dataset. Without this filter such a token becomes a hub node linked to
      every friendless user, which distorts all centrality measures.

  Returns:
    (edges_df, nodes_df): `edges_df` has columns `node1`, `node2`, one row per unique
    undirected edge with the two ids in sorted order; `nodes_df` has a single
    `user_id` column holding every user in `df` plus every friend id mentioned.
    Both are sorted so repeated runs give the same ordering.
  """
  df_net = df[['user_id', 'friends']].drop_duplicates().reset_index(drop=True)
  placeholders = {str(tok).strip().lower() for tok in missing_tokens}

  nodes = set(df_net['user_id'].astype(str))
  edges = set()  # a set de-duplicates while building, no intermediate list needed

  for row in df_net.itertuples(index=False):
    uid = str(row.user_id).strip()
    friends_val = row.friends
    if pd.isna(friends_val):
      continue
    for raw in str(friends_val).split(','):
      friend = raw.strip()
      # Skip blanks/placeholders and self-loops.
      if friend.lower() in placeholders or friend == uid:
        continue
      # Store each undirected edge once, smaller id first.
      edges.add((uid, friend) if uid <= friend else (friend, uid))
      nodes.add(friend)

  edges_df = pd.DataFrame(sorted(edges), columns=["node1", "node2"])
  nodes_df = pd.DataFrame({"user_id": sorted(nodes)})

  return edges_df, nodes_df

edges_credence, nodes_credence = build_user_network(user_credence)

In [None]:
import igraph as ig

def build_graph(nodes_df, edges_df):
  """Create an igraph Graph with one vertex per row of `nodes_df`, in row order.

  Edges whose endpoints are not strings, are not known node ids (after trimming),
  or are identical are ignored. Each vertex carries its id in the `name` attribute.
  """
  node_ids = nodes_df["user_id"].astype(str).tolist()
  position_of = dict(zip(node_ids, range(len(node_ids))))

  edge_list = []
  for a, b in edges_df[["node1", "node2"]].values:
    if not (isinstance(a, str) and isinstance(b, str)):
      continue
    src, dst = a.strip(), b.strip()
    if src == dst or src not in position_of or dst not in position_of:
      continue
    edge_list.append((position_of[src], position_of[dst]))

  g = ig.Graph()
  g.add_vertices(len(node_ids))
  if edge_list:
    g.add_edges(edge_list)
  g.vs["name"] = node_ids
  return g

g_credence = build_graph(nodes_credence, edges_credence)

In [None]:
# Degree centrality

def add_degree_centrality(user_df, nodes_df, g):
  """Left-join each user's vertex degree onto `user_df` as column `degree`.

  `g.degree()` is assigned to `nodes_df` positionally, so `nodes_df` must list the
  vertices in the same order as `g`. Users without a matching node get NaN.
  Safe to re-run: an existing `degree` column is replaced rather than duplicated.
  """
  nodes_tmp = nodes_df[["user_id"]].copy()
  nodes_tmp["degree"] = g.degree()
  # Without this, a second run would merge onto an existing column and pandas would
  # silently emit degree_x / degree_y instead of degree.
  user_df = user_df.drop(columns=["degree"], errors="ignore")
  return user_df.merge(nodes_tmp, on="user_id", how="left")

user_credence = add_degree_centrality(user_credence, nodes_credence, g_credence)

In [None]:
# Pagerank centrality

def add_pagerank_centrality(user_df, nodes_df, g):
  """Left-join each user's PageRank score onto `user_df` as column `pagerank`.

  PageRank is computed on `g` as an undirected, unweighted graph with damping 0.85.
  Scores are assigned to `nodes_df` positionally, so `nodes_df` must list the
  vertices in the same order as `g`. Users without a matching node get NaN.
  Safe to re-run: an existing `pagerank` column is replaced rather than duplicated.
  """
  nodes_tmp = nodes_df[["user_id"]].copy()
  nodes_tmp["pagerank"] = g.pagerank(damping=0.85, weights=None, directed=False)
  # Avoid pagerank_x / pagerank_y when the cell is executed more than once.
  user_df = user_df.drop(columns=["pagerank"], errors="ignore")
  return user_df.merge(nodes_tmp, on="user_id", how="left")

user_credence = add_pagerank_centrality(user_credence, nodes_credence, g_credence)

In [None]:
# k-core centrality

def add_kcore_centrality(user_df, nodes_df, g):
  """Left-join each user's coreness (k-core index) onto `user_df` as column `kcore`.

  `g.coreness()` is assigned to `nodes_df` positionally, so `nodes_df` must list the
  vertices in the same order as `g`. Users without a matching node get NaN.
  Safe to re-run: an existing `kcore` column is replaced rather than duplicated.
  """
  nodes_tmp = nodes_df[["user_id"]].copy()
  nodes_tmp["kcore"] = g.coreness()
  # Avoid kcore_x / kcore_y when the cell is executed more than once.
  user_df = user_df.drop(columns=["kcore"], errors="ignore")
  return user_df.merge(nodes_tmp, on="user_id", how="left")

user_credence = add_kcore_centrality(user_credence, nodes_credence, g_credence)

# 4. Cognitive social capital variables

In [None]:
!pip install textblob nltk==3.8.1 textstat

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Fetch the NLTK resources used by the text-processing cells below:
# the stopword list, WordNet (lemmatizer), the perceptron POS tagger (pos_tag)
# and the Punkt models (word_tokenize).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

In [None]:
# Stopwords: standard + platform name + sentiment words
sentiment_words = [
    "excellent", "love", "like", "awesome", "good", "great", "best", "perfect",
    "nice", "super", "wish", "thanks", "thank", "lot"
    ]
# One set for O(1) membership tests during token filtering.
stop = set(stopwords.words('english')) | {"yelp"} | set(sentiment_words)

lemmatizer = WordNetLemmatizer()

In [None]:
def clean_tokens(text):
  """Turn a raw review into a list of lemmatized content tokens.

  Steps: lower-case and tokenize, keep alphabetic tokens only, drop adverbs (RB*),
  prepositions (IN) and conjunctions (CC), drop stopwords, then lemmatize — as a
  verb when the token is tagged V*, otherwise as a noun. Missing text yields [].

  NOTE(review): the surviving tokens are POS-tagged a second time, without their
  original sentence context, to choose the lemmatizer mode; confirm this is intended
  rather than reusing the tags from the first pass.
  """
  if pd.isna(text):
    return []

  words = [tok for tok in word_tokenize(text.lower()) if tok.isalpha()]

  kept = []
  for word, tag in pos_tag(words):
    if tag.startswith('RB') or tag in ('IN', 'CC'):
      continue
    if word in stop:
      continue
    kept.append(word)

  lemmas = []
  for word, tag in pos_tag(kept):
    wordnet_pos = 'v' if tag.startswith('V') else 'n'
    lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))

  return lemmas

rev_credence['processed_text'] = rev_credence['text'].apply(clean_tokens)

In [None]:
# Narrativity

# Small lexicons of narrative cues (who / when / where / what happened).
pronouns = {
    "i", "we", "you", "he", "she", "they",
    "me", "us", "him", "her", "them",
}
time_words = {
    "today", "yesterday", "tomorrow", "year", "month",
    "day", "week", "season", "hour", "minute",
}
place_words = {
    "home", "school", "office", "hospital", "restaurant", "city",
    "country", "room", "house", "street", "park",
}
event_verbs = {
    "go", "come", "say", "tell", "make", "do",
    "give", "take", "see", "meet", "leave", "arrive",
}

def narrativity_score(text):
  """Share of tokens in `text` that are narrative cues.

  A token counts when it is a listed pronoun, time word or place word, or when it is
  a listed event verb that the POS tagger also labels as a verb (tag starting with V).
  The denominator is the number of tokens returned by the tokenizer. Non-string or
  blank input, and text that tokenizes to nothing, score 0.
  """
  if not isinstance(text, str) or not text.strip():
    return 0

  tokens = nltk.word_tokenize(text.lower())
  if not tokens:
    return 0

  hits = sum(
      1
      for word, pos in nltk.pos_tag(tokens)
      if word in pronouns
      or word in time_words
      or word in place_words
      or (word in event_verbs and pos.startswith("V"))
  )
  return hits / len(tokens)

def add_narrativity(df):
  """Add a `narrativity` column scored from `text` (missing text is scored as "").

  Writes the column onto `df` itself and returns the same object.
  """
  review_text = df['text'].fillna("")
  df['narrativity'] = review_text.apply(narrativity_score)
  return df

rev_credence = add_narrativity(rev_credence)

In [None]:
# Semantic similarity

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def sbert_semantic_similarity_processed_text(
    df,
    model_name="paraphrase-MiniLM-L3-v2",
    text_col="processed_text",
    out_col="semantic_similarity",
    batch_size=128,
    dtype=np.float32
):
    """Score each document by cosine similarity to the corpus-wide mean embedding.

    Args:
        df: input frame; `text_col` may hold strings or token lists.
        model_name: SentenceTransformer model to load.
        text_col: column containing the text (or tokens) to embed.
        out_col: name of the similarity column added to the result.
        batch_size: encoding batch size; similarities are computed in slices of
            max(2048, batch_size) rows.
        dtype: dtype the embedding matrix is cast to.

    Returns:
        A copy of `df` with `out_col` added (float32). An empty frame is returned
        with an empty `out_col` and without loading the model.
    """
    out = df.copy()

    def _as_sentence(value):
        # Token lists must be joined back into text. Casting a list with str() would
        # embed its Python repr ("['a', 'b']", brackets and quotes included).
        if isinstance(value, (list, tuple, np.ndarray)):
            return " ".join(map(str, value))
        if isinstance(value, str):
            return value
        return "" if pd.isna(value) else str(value)

    texts = out[text_col].apply(_as_sentence).tolist()

    if not texts:
        # Nothing to embed: a mean over zero rows would only yield NaN and warnings.
        out[out_col] = np.empty(0, dtype=np.float32)
        return out

    model = SentenceTransformer(model_name)

    emb = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True
    ).astype(dtype, copy=False)

    # Reference point: the mean of all (unit-normalized) document embeddings.
    global_centroid = np.asarray(emb.mean(axis=0, keepdims=True))
    sims = np.empty(len(texts), dtype=np.float32)

    # Slice the similarity computation to bound peak memory.
    step = max(2048, batch_size)
    for i in range(0, len(texts), step):
        Xb = emb[i:i + step, :]
        sims[i:i + Xb.shape[0]] = cosine_similarity(Xb, global_centroid).ravel().astype(np.float32, copy=False)

    out[out_col] = sims
    return out

rev_credence = sbert_semantic_similarity_processed_text(
    rev_credence,
    model_name="paraphrase-MiniLM-L3-v2",
    text_col="processed_text",
    out_col="semantic_similarity",
    batch_size=128
)

In [None]:
# Concept overlap

from sklearn.feature_extraction.text import TfidfVectorizer

def to_text(x):
    """Return a string for vectorizing: strings pass through, lists are space-joined, anything else becomes ""."""
    if isinstance(x, str):
        return x
    return " ".join(map(str, x)) if isinstance(x, list) else ""

def add_concept_overlap(df, text_col="processed_text", max_features=5000, out_col="concept_overlap"):
    """Add a TF-IDF based overlap score between each document and its sector(s).

    One TF-IDF model is fitted on all documents. For every sector flag
    (`is_healthcare`, `is_repair`, `is_professional`) the documents flagged 1 are
    compared, by cosine similarity, with the mean TF-IDF vector of that sector.
    `out_col` is the mean of a row's available sector scores; rows flagged in no
    sector get NaN. Returns a copy of `df`.
    """
    out = df.copy()

    docs = out[text_col].apply(to_text).fillna("").astype(str).tolist()
    tfidf = TfidfVectorizer(max_features=max_features).fit_transform(docs)

    # temporary score column -> sector flag column
    sector_flags = {
        "concept_overlap_healthcare_tmp": "is_healthcare",
        "concept_overlap_repair_tmp": "is_repair",
        "concept_overlap_professional_tmp": "is_professional",
    }
    tmp_cols = list(sector_flags)
    for tmp_col in tmp_cols:
        out[tmp_col] = np.nan

    for tmp_col, flag_col in sector_flags.items():
        in_sector = (pd.to_numeric(out[flag_col], errors="coerce") == 1).to_numpy()
        rows = np.flatnonzero(in_sector)
        if rows.size == 0:
            continue
        sector_matrix = tfidf[rows]
        # .mean() on a sparse matrix returns a matrix; convert to a plain (1, n) array.
        centroid = np.asarray(sector_matrix.mean(axis=0)).reshape(1, -1)
        out.iloc[rows, out.columns.get_loc(tmp_col)] = cosine_similarity(sector_matrix, centroid).ravel()

    out[out_col] = out[tmp_cols].mean(axis=1, skipna=True)

    return out.drop(columns=tmp_cols, errors="ignore")

rev_credence = add_concept_overlap(
    rev_credence,
    text_col="processed_text",
    max_features=5000,
    out_col="concept_overlap"
)

In [None]:
# Review length (Information load)

def add_review_length(df):
  """Add `review_length`: the number of whitespace-separated words in `text`.

  Missing text counts as 0 words. Writes onto `df` itself and returns the same object.
  """
  words_per_review = df['text'].fillna("").str.split()
  df['review_length'] = words_per_review.str.len()
  return df

rev_credence = add_review_length(rev_credence)

# 5. Moderator

In [None]:
# Platform-driven recognition (elite count)
def count_elite_years(val):
  """Count the distinct elite years in a comma-separated `elite` value.

  Blank entries (e.g. from a trailing comma) and placeholder tokens ("none", "nan")
  are ignored, and a repeated token is counted once. Missing or empty input gives 0.

  NOTE(review): some Yelp dumps are reported to write the year 2020 as "20,20";
  counting distinct tokens makes that contribute one year instead of two — confirm
  against the dataset in use.
  """
  if pd.isna(val):
    return 0
  years = {tok.strip().lower() for tok in str(val).split(',')}
  years -= {'', 'none', 'nan'}
  return len(years)

def add_elite_count(df):
  """Add `elite_count`, derived from the `elite` column via count_elite_years.

  Writes onto `df` itself and returns the same object.
  """
  elite_values = df['elite']
  df['elite_count'] = elite_values.apply(count_elite_years)
  return df

user_credence = add_elite_count(user_credence)

# 6. Control variables

In [None]:
# Reference date: most recent review date (time-unit normalized)
# .normalize() resets the time of day to midnight, so tenure is measured in whole days.
reference_date = rev_credence['date'].max().normalize()
print(reference_date)  # 2022-01-19 00:00:00 (presumably the value from the author's run; depends on the data)

def add_tenure(df, reference_date):
  """Add `tenure`: years between `yelping_since` and `reference_date`.

  Computed as whole elapsed days divided by 365.25. Writes onto `df` itself and
  returns the same object.
  """
  elapsed_days = (reference_date - df['yelping_since']).dt.days
  df['tenure'] = elapsed_days / 365.25
  return df

user_credence = add_tenure(user_credence, reference_date)

In [None]:
# Geographic breadth: number of distinct cities among the businesses each user reviewed

user_city = (
    rev_credence
    .groupby("user_id")["city"]
    .nunique()
    .rename("n_city")
    .reset_index()
)

# Broadcast the per-user count back onto every review row of that user.
rev_credence = rev_credence.merge(user_city, on="user_id", how="left")

# 7. Aggregate dataset

In [None]:
# Review-level variables to average per user (user_id is the grouping key)
keep_cols = [
    "user_id",
    "narrativity",
    "semantic_similarity",
    "concept_overlap",
    "review_length",
    "n_city",
]

def aggregate_user_level(user_df, review_df):
  """Average the `keep_cols` review variables per user and left-join them onto `user_df`.

  Users with no rows in `review_df` keep their row and get NaN for the averaged columns.
  """
  per_user_means = (
      review_df.loc[:, keep_cols]
      .groupby("user_id")
      .mean(numeric_only=True)
      .reset_index()
  )
  return user_df.merge(per_user_means, on="user_id", how="left")

data_credence = aggregate_user_level(user_credence, rev_credence)

In [None]:
from google.colab import files
# Save final dataset
# index=False: the row index is not written as a column.
data_credence.to_csv("data_credence.csv", index=False)

# Colab-only: hand the CSV to the browser as a download.
files.download("data_credence.csv")