<a href="https://colab.research.google.com/github/lisa11323/CSR_yelp/blob/main/Variable_Construction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Prepare dataset

In [None]:
# Download open dataset
# https://business.yelp.com/data/resources/open-dataset/

In [None]:
from google.colab import files

# Upload files manually from local machine.
# The conversion cell that follows reads the three Yelp JSON files by name from
# the working directory, so those are the files to upload here.
uploaded = files.upload()

In [None]:
# Convert the raw Yelp JSON dumps into DataFrames and CSV copies

import pandas as pd

# Load each dump (read with lines=True, i.e. one JSON record per line)
df_review = pd.read_json("yelp_academic_dataset_review.json", lines=True)
df_user = pd.read_json("yelp_academic_dataset_user.json", lines=True)
df_biz = pd.read_json("yelp_academic_dataset_business.json", lines=True)

# Write every frame to its CSV file, leaving out the positional index
for frame, csv_name in (
    (df_review, "review.csv"),
    (df_user, "user.csv"),
    (df_biz, "business.csv"),
):
    frame.to_csv(csv_name, index=False)

In [None]:
# Download CSV files to local machine.
# The names must match the files written by to_csv() in the conversion cell
# ("review.csv", "user.csv", "business.csv"); no "yelp_academic_dataset_*.csv"
# file is ever created, so requesting those names fails.
files.download("review.csv")
files.download("user.csv")
files.download("business.csv")

# 1. Sampling Dataset

In [None]:
import pandas as pd

# Businesses whose category string mentions "Health & Medical"
# (rows with a missing category are treated as non-matching)
is_health_medical = df_biz['categories'].str.contains("Health & Medical", na=False)
hm_biz = df_biz[is_health_medical]
hm_business_ids = hm_biz['business_id'].tolist()

# Reviews of those businesses, restricted to calendar years 2020-2022 inclusive
df_hm = df_review[df_review['business_id'].isin(hm_business_ids)].copy()
df_hm['date'] = pd.to_datetime(df_hm['date'])
review_year = df_hm['date'].dt.year
df_hm = df_hm[review_year.between(2020, 2022)]

def preprocess_reviews(df, df_user):
    """Attach user profile fields to a review table.

    Review-level vote columns (useful/funny/cool) are prefixed with ``rev_``,
    the ``business_id`` column is dropped if present, and the selected user
    columns are left-joined on ``user_id`` with the user-level vote columns
    prefixed with ``user_``.

    Parameters
    ----------
    df : pandas.DataFrame
        Review rows; must contain ``user_id``.
    df_user : pandas.DataFrame
        User rows; must contain every column selected below.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per input review (assuming ``user_id`` is
        unique in ``df_user``); the inputs are not modified.
    """
    vote_fields = ('useful', 'funny', 'cool')

    # Review side: prefix the vote counters and discard the business key
    reviews = df.rename(columns={field: 'rev_' + field for field in vote_fields})
    reviews = reviews.drop(columns=['business_id'], errors='ignore')

    # User side: profile fields, then compliment counters, then funny/cool votes
    profile_fields = ['user_id', 'name', 'review_count', 'yelping_since', 'useful',
                      'elite', 'friends', 'fans', 'average_stars']
    compliment_kinds = ['hot', 'more', 'profile', 'cute', 'list', 'note',
                        'plain', 'cool', 'funny', 'writer', 'photos']
    user_columns = (profile_fields
                    + ['compliment_' + kind for kind in compliment_kinds]
                    + ['funny', 'cool'])
    users = df_user[user_columns].rename(
        columns={field: 'user_' + field for field in vote_fields}
    )

    # Keep every review, even when its author is absent from the user table
    return reviews.merge(users, on='user_id', how='left')

# Rebind df_hm to the merged review + user-profile frame returned by preprocess_reviews.
df_hm = preprocess_reviews(df_hm, df_user)

In [None]:
# User-level columns: fixed profile fields followed by every compliment_* column in df_hm
profile_cols = ['user_id', 'name', 'review_count', 'yelping_since', 'user_useful',
                'elite', 'friends', 'fans', 'average_stars', 'user_funny', 'user_cool']
user_cols = [*profile_cols,
             *(col for col in df_hm.columns if col.startswith('compliment_'))]

# One row per user (first occurrence kept), restricted to the user-level columns
hm_user = df_hm.drop_duplicates(subset='user_id')[user_cols].copy()

# 2. Relational social capital variables

In [None]:
# Convert compliment columns to numeric and create compliment count.
# The derived columns below also start with "compliment_", so they are excluded
# explicitly; otherwise re-running this cell would fold compliment_num (and
# compliment_diversity) into the total.
derived_compliment_cols = {'compliment_num', 'compliment_diversity'}
compliment_cols = [
    col for col in hm_user.columns
    if col.startswith('compliment_') and col not in derived_compliment_cols
]
# Unparseable entries become NaN under errors='coerce' and are then counted as 0
hm_user[compliment_cols] = hm_user[compliment_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
hm_user['compliment_num'] = hm_user[compliment_cols].sum(axis=1)

In [None]:
from scipy.stats import entropy

# Calculate compliment diversity (Shannon entropy)
def shannon_diversity(row):
    """Return the Shannon entropy (natural log) of a user's compliment mix.

    Parameters
    ----------
    row : pandas.Series
        A user row containing every column named in the module-level
        ``compliment_cols`` list.

    Returns
    -------
    float or int
        Entropy of the compliment-type proportions; ``0`` when the user has
        received no compliments at all.
    """
    values = row[compliment_cols].astype(float).values
    total = values.sum()
    if total == 0:
        # No compliments: proportions are undefined, so diversity is reported as 0
        return 0
    proportions = values / total
    # scipy's entropy uses the natural logarithm when `base` is omitted, which
    # avoids depending on numpy being imported before this cell runs.
    return entropy(proportions)

# Row-wise application: each user row is passed to shannon_diversity as a Series.
hm_user['compliment_diversity'] = hm_user.apply(shannon_diversity, axis=1)

In [None]:
# Social feedback = user-level funny votes + cool votes.
# Both counters are coerced to numbers first; unparseable entries count as 0.
for vote_col in ('user_funny', 'user_cool'):
    hm_user[vote_col] = pd.to_numeric(hm_user[vote_col], errors='coerce').fillna(0)
hm_user['social_feedback'] = hm_user['user_funny'] + hm_user['user_cool']

# 3. Structural social capital variables

In [None]:
!pip install python-igraph

In [None]:
# One (user_id, friends) row per sampled reviewer
hm_network = df_hm[['user_id', 'friends']].drop_duplicates().reset_index(drop=True)

# Build edge list
edges = []
nodes = set(hm_network['user_id'].astype(str))

for row in hm_network.itertuples(index=False):
    uid = str(row.user_id).strip()
    # Missing (NaN/None) or otherwise non-string friend lists contribute no edges
    if not isinstance(row.friends, str):
        continue
    for f in (part.strip() for part in row.friends.split(',')):
        # Skip blanks and self-loops. The literal "None" is skipped as well:
        # NOTE(review): the Yelp user dump appears to use the string "None" for
        # users without friends - confirm; if kept it would become a hub vertex.
        if not f or f == uid or f == "None":
            continue
        edges.append((uid, f))
        nodes.add(f)

# Unique undirected edges. Sorting fixes the row/vertex order, which would
# otherwise follow set iteration order and differ between kernel sessions.
edge_set = set(tuple(sorted([a, b])) for a, b in edges)
edges_df = pd.DataFrame(sorted(edge_set), columns=["node1", "node2"])
nodes_df = pd.DataFrame({"user_id": sorted(nodes)})

In [None]:
# Build igraph graph
import igraph as ig

# Vertex i of the graph corresponds to node_ids[i]
node_ids = nodes_df["user_id"].astype(str).tolist()
node_to_idx = dict(zip(node_ids, range(len(node_ids))))

# Translate edge endpoints from user ids to vertex indices, dropping
# self-loops and any endpoint that is not a known node
edges = []
for raw_left, raw_right in edges_df[["node1", "node2"]].values:
    left, right = raw_left.strip(), raw_right.strip()
    if left == right or left not in node_to_idx or right not in node_to_idx:
        continue
    edges.append((node_to_idx[left], node_to_idx[right]))

g = ig.Graph()
g.add_vertices(len(node_ids))
g.add_edges(edges)
g.vs["name"] = node_ids

In [None]:
# Degree centrality
# g.degree() returns one value per vertex in vertex order; vertices were
# created from nodes_df["user_id"] in row order, so the lists line up.
nodes_df["degree"] = g.degree()
# Drop any "degree" column left by an earlier run of this cell so that
# re-running does not create degree_x / degree_y duplicates.
hm_user = hm_user.drop(columns="degree", errors="ignore").merge(
    nodes_df[["user_id", "degree"]], on="user_id", how="left"
)

In [None]:
# Pagerank centrality
# Unweighted, undirected PageRank with damping 0.85; values come back in
# vertex order, which follows the row order of nodes_df.
nodes_df["pagerank"] = g.pagerank(damping=0.85, weights=None, directed=False)
# Drop any "pagerank" column left by an earlier run of this cell so that
# re-running does not create pagerank_x / pagerank_y duplicates.
hm_user = hm_user.drop(columns="pagerank", errors="ignore").merge(
    nodes_df[["user_id", "pagerank"]], on="user_id", how="left"
)

In [None]:
# k-core
# g.coreness() returns one value per vertex in vertex order, which follows
# the row order of nodes_df.
nodes_df["kcore"] = g.coreness()
# Drop any "kcore" column left by an earlier run of this cell so that
# re-running does not create kcore_x / kcore_y duplicates.
hm_user = hm_user.drop(columns="kcore", errors="ignore").merge(
    nodes_df[["user_id", "kcore"]], on="user_id", how="left"
)

# 4. Cognitive social capital variables

In [None]:
!pip install textblob nltk==3.8.1 textstat

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import pandas as pd

# Fetch the NLTK data packages needed by the text-processing cells below.
nltk.download('stopwords')                    # English stopword list
nltk.download('wordnet')                      # data for WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')   # model used by pos_tag
nltk.download('punkt')                        # model used by word_tokenize

In [None]:
# Stopword set = NLTK English stopwords + the platform name + generic sentiment words
sentiment_words = [
    "excellent", "love", "like", "awesome", "good", "great", "best", "perfect",
    "nice", "super", "wish", "thanks", "thank", "lot"
]
stop = set(stopwords.words('english')) | {"yelp"} | set(sentiment_words)

lemmatizer = WordNetLemmatizer()

In [None]:
# Preprocessing function
def clean_tokens(text):
    """Tokenize, filter and lemmatize one review text.

    Steps: lowercase and tokenize; keep alphabetic tokens only (drops numbers
    and punctuation); POS-tag once; drop adverbs (RB*), prepositions (IN),
    conjunctions (CC) and anything in the module-level ``stop`` set; lemmatize
    verbs as verbs and everything else as nouns.

    Parameters
    ----------
    text : str or missing value
        Raw review text.

    Returns
    -------
    list of str
        Cleaned lemmas; an empty list for missing or non-string input.
    """
    # Missing values (None/NaN) and any other non-string input yield no tokens
    if not isinstance(text, str):
        return []

    tokens = [w for w in word_tokenize(text.lower()) if w.isalpha()]

    cleaned = []
    # Tag the full token sequence once and reuse those tags for lemmatization.
    # Re-tagging after function words are removed would strip the context the
    # tagger relies on and would double the cost of this step.
    for w, pos in pos_tag(tokens):
        if pos.startswith('RB') or pos in ('IN', 'CC'):
            continue
        if w in stop:
            continue
        wordnet_pos = 'v' if pos.startswith('V') else 'n'
        cleaned.append(lemmatizer.lemmatize(w, wordnet_pos))

    return cleaned

# processed_text holds, for each review, the list of tokens returned by clean_tokens.
df_hm['processed_text'] = df_hm['text'].apply(clean_tokens)

In [None]:
# Narrativity
# Word lists used by narrativity_score; they are matched against lowercased tokens.

pronouns = {"i","we","you","he","she","they","me","us","him","her","them"}
time_words = {"today","yesterday","tomorrow","year","month","day","week","season","hour","minute"}
place_words = {"home","school","office","hospital","restaurant","city","country","room","house","street","park"}
# Event verbs only count when the token is also POS-tagged as a verb (tag starts with "V").
event_verbs = {"go","come","say","tell","make","do","give","take","see","meet","leave","arrive"}

def narrativity_score(text):
    """Share of tokens in ``text`` that are narrative markers.

    A token counts as a marker when it is in the pronoun, time-word or
    place-word lists, or when it is in the event-verb list and its POS tag
    starts with "V". The count is divided by the total number of tokens.

    Returns 0 for non-string, blank, or token-less input.
    """
    if not isinstance(text, str) or not text.strip():
        return 0

    tokens = nltk.word_tokenize(text.lower())
    if not tokens:
        return 0

    # Words that count regardless of their part of speech
    marker_words = pronouns | time_words | place_words

    hits = sum(
        1
        for word, pos in nltk.pos_tag(tokens)
        if word in marker_words or (word in event_verbs and pos.startswith("V"))
    )
    return hits / len(tokens)

# Score the raw review text (not processed_text); missing texts become "" and score 0.
df_hm['narrativity'] = df_hm['text'].fillna("").apply(narrativity_score)

In [None]:
# Semantic Similarity

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

model = SentenceTransformer('paraphrase-MiniLM-L3-v2')

# processed_text holds token lists, but the encoder takes one string per
# document, so join each list back into space-separated text. Anything that is
# neither a string nor a token list (e.g. NaN) becomes an empty document.
semantic_docs = [
    doc if isinstance(doc, str)
    else " ".join(doc) if isinstance(doc, (list, tuple))
    else ""
    for doc in df_hm['processed_text']
]

embeddings = model.encode(
    semantic_docs,
    batch_size=128,
    show_progress_bar=True,
    convert_to_numpy=True
)
# Centroid of all review embeddings, shaped (1, dim) for cosine_similarity
corpus_mean = np.mean(embeddings, axis=0).reshape(1, -1)

# Similarity of each review to the corpus centroid
df_hm['semantic_similarity'] = cosine_similarity(embeddings, corpus_mean).flatten()

In [None]:
# Concept overlap

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# processed_text holds token lists, but TfidfVectorizer's default analyzer
# expects raw strings, so join each list back into space-separated text.
# Anything that is neither a string nor a token list becomes an empty document.
tfidf_docs = [
    doc if isinstance(doc, str)
    else " ".join(doc) if isinstance(doc, (list, tuple))
    else ""
    for doc in df_hm['processed_text']
]

vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(tfidf_docs)

# Mean TF-IDF vector of the corpus as a plain (1, n_features) ndarray
corpus_mean = np.asarray(X_tfidf.mean(axis=0))

# Similarity of each review's TF-IDF vector to the corpus mean
df_hm['concept_overlap'] = cosine_similarity(X_tfidf, corpus_mean).flatten()

# 5. Moderator

In [None]:
# Elite years count

def count_elite_years(val):
    """Count the elite years listed in a comma-separated ``elite`` value.

    Parameters
    ----------
    val : str or missing value
        Comma-separated years, e.g. ``"2015,2016"``; empty or missing means
        the user was never elite.

    Returns
    -------
    int
        Number of elite years.
    """
    if pd.isna(val):
        return 0

    # Ignore blank fragments (e.g. from a trailing comma) and the literal "None"
    tokens = [tok.strip() for tok in str(val).split(',')]
    tokens = [tok for tok in tokens if tok and tok != 'None']

    count = 0
    i = 0
    while i < len(tokens):
        # NOTE(review): recent Yelp user dumps are reported to write the year
        # 2020 as "20,20" - confirm on the data in use. Two adjacent "20"
        # tokens are counted as a single year. This is done per token, because
        # a plain substring replace would corrupt "2020,2021".
        if tokens[i] == '20' and i + 1 < len(tokens) and tokens[i + 1] == '20':
            i += 2
        else:
            i += 1
        count += 1
    return count
# Moderator variable: number of elite years per user, derived from the 'elite' column.
hm_user['elite_count'] = hm_user['elite'].apply(count_elite_years)

# 6. Aggregate dataset

In [None]:
# Review-level text measures to roll up to the user level
keep_cols = ["user_id", "semantic_similarity", "narrativity", "concept_overlap"]

# Per-user mean of each measure; user_id is returned as a regular column
agg_df = (
    df_hm[keep_cols]
    .groupby("user_id", as_index=False)
    .mean(numeric_only=True)
)

# Left-join onto hm_user (which already carries the user-level variables) so
# that every user row is kept
data = hm_user.merge(agg_df, on="user_id", how="left")

In [None]:
# Display the column names of the final merged dataset as a quick check.
data.columns

In [None]:
from google.colab import files

# Write the final dataset to CSV (without the index), then download it
output_csv = "data.csv"
data.to_csv(output_csv, index=False)
files.download(output_csv)