<a href="https://colab.research.google.com/github/lisa11323/GPT/blob/main/GPT_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Preprocessing

1.1 Preprocessing Code for Zero-Shot Classification

In [None]:
# Remove non-English characters and emojis, preprocess syllables, then apply filtering

!pip install -U pandas numpy
!pip install nltk emoji

import pandas as pd
import re
import emoji
from tqdm import tqdm
from google.colab import files

# Ask the user for the Excel workbook and read the first uploaded file.
uploaded = files.upload()
uploaded_names = list(uploaded)
file_name = uploaded_names[0]

# Register DataFrame/Series.progress_apply before it is used on the reviews.
tqdm.pandas()

df = pd.read_excel(file_name)

def clean_text(text):
    """Reduce one review to ASCII letters and whitespace.

    Args:
        text: Raw cell value from the 'Review' column. Missing values
            (None/NaN) yield an empty string; any other non-string value is
            converted with ``str`` first.

    Returns:
        The text with emojis and every character other than a-z, A-Z and
        whitespace removed, stripped of leading/trailing whitespace.
    """
    if not isinstance(text, str):
        # A missing review must not turn into the literal word "nan".
        if pd.isna(text):
            return ""
        text = str(text)
    text = emoji.replace_emoji(text, replace="")
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.strip()

# Overwrite the 'Review' column with its cleaned version (progress bar via tqdm.pandas()).
df['Review'] = df['Review'].progress_apply(clean_text)

def get_review_length(text):
    """Count the whitespace-separated words in a review.

    Args:
        text: Any value; it is converted with ``str`` before splitting.

    Returns:
        The number of whitespace-separated tokens, or 0 if the value cannot
        be converted to a string.
    """
    try:
        return len(str(text).split())
    except Exception:
        # Only ordinary errors fall back to 0; KeyboardInterrupt and
        # SystemExit must still be able to stop a long-running apply.
        return 0

# Word count of every cleaned review.
df['Review_length'] = df['Review'].progress_apply(get_review_length)

# Filtering switches: drop reviews shorter than `min_length` words when enabled.
min_length = 10            # standard length
filter_by_length = True

if filter_by_length:
    long_enough = df['Review_length'].ge(min_length)
    df = df.loc[long_enough].reset_index(drop=True)

# Persist the result and hand it to the browser for download.
preprocessed_file_path = 'preprocessed_comments_with_length.xlsx'
df.to_excel(preprocessed_file_path, index=False)
print(f"Saved preprocessed and review length filtered ({filter_by_length}) data to '{preprocessed_file_path}'")

files.download(preprocessed_file_path)

1.2 Preprocessing code for BERTopic

In [None]:
!pip install tqdm
!pip install nltk
!pip install sentence-transformers
!pip install hdbscan
!pip install plotly

import pandas as pd
from tqdm import tqdm
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from google.colab import files
import io

import plotly.io as pio
pio.renderers.default = "colab"

# Fetch the NLTK corpora needed for stopword removal and lemmatization.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Upload file
uploaded = files.upload()

# When several files are uploaded, the last one wins.
for fn in uploaded:
    file_path = fn

df = pd.read_excel(file_path)
print(f"File loaded: {file_path}")

# Preparing for stopword removal and lemmatization
lemmatizer = WordNetLemmatizer()
default_stopwords = set(stopwords.words('english'))

# Define preprocessing function (modified to retain English only)
def preprocess_text(text):
    """Normalise one review for topic modelling.

    The text is lower-cased, every character other than a-z and whitespace is
    replaced by a space, English stopwords are removed and the remaining
    words are lemmatized with the module-level ``lemmatizer``.

    Args:
        text: Raw cell value. Missing values (None/NaN) yield ""; other
            non-string values (e.g. a numeric cell) are converted with
            ``str`` instead of failing on ``.lower()``.

    Returns:
        The processed words joined by single spaces.
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in default_stopwords]
    return " ".join(tokens)

if "Review" in df.columns:
    tqdm.pandas()
    # Keep the untouched text as 'Review_Raw'; later cells read that column.
    df = df.rename(columns={"Review": "Review_Raw"})
    df["Review_preprocessed"] = df["Review_Raw"].progress_apply(preprocess_text)
    print("Preprocessing complete")

    # Export only when preprocessing actually ran, so an unprocessed frame is
    # never saved under the "preprocessed" file name.
    output_file = "preprocessed_Review.xlsx"
    df.to_excel(output_file, index=False)
    files.download(output_file)
else:
    print("The 'Review' column does not exist.")

# 2. Topic modeling using BERTopic with Louvain clustering

2.1 Install required packages

In [None]:
!pip install pandas openpyxl bertopic tqdm hdbscan plotly umap-learn matplotlib python-louvain

2.2 Load the preprocessed data and train the BERTopic model

In [None]:
import pandas as pd
from bertopic import BERTopic
from tqdm import tqdm
import re
import string
from google.colab import files
import hdbscan
from umap import UMAP
import tensorflow_hub as hub
import plotly.io as pio
pio.renderers.default = "colab"

# Upload file
uploaded = files.upload()

# When several files are uploaded, the last one wins.
for fn in uploaded:
    file_path = fn

df = pd.read_excel(file_path)
print(f"File loaded: {file_path}")

# Quick look at the text column that will be fed to BERTopic.
print("Preview the 'Review_preprocessed' column:")
preview = df["Review_preprocessed"].head()
print(preview)

# Define text preprocessing function
def clean_text(text):
    """Lower-case a string, drop ASCII punctuation and collapse whitespace.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string with no leading or trailing whitespace.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = text.lower().translate(punctuation_table)
    return re.sub(r"\s+", " ", without_punctuation).strip()

# Keep only rows that have preprocessed text, and keep `df` aligned with the
# documents sent to BERTopic. Dropping missing values from the document list
# alone would make `topics` shorter than the frame and break the
# "topic_id" assignment below.
df = df[df["Review_preprocessed"].notna()].reset_index(drop=True)
comments = df["Review_preprocessed"].astype(str).tolist()
comments_cleaned = [clean_text(text) for text in tqdm(comments)]

# Configure hDBSCAN clustering
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=50,
    min_samples=5,
    metric='euclidean',
    prediction_data=True
)

# Train BERTopic model
# NOTE(review): no random seed is set here, so topic assignments may differ between runs.
topic_model = BERTopic(hdbscan_model=hdbscan_model, language="english", verbose=True)
topics, probs = topic_model.fit_transform(comments_cleaned)

# Visualize
fig = topic_model.visualize_topics()
fig.show()

# One topic id per remaining row, in the same order as `comments_cleaned`.
df_result = df.copy()
df_result["topic_id"] = topics

output_file = "bertopic_result.xlsx"
df_result.to_excel(output_file, index=False)

files.download(output_file)

Extract topic information

In [None]:
import pandas as pd
from google.colab import files

# Retrieve topic information
topic_info = topic_model.get_topic_info()

# Lift pandas' display limits so the whole table is shown.
display_options = {
    "display.max_rows": None,
    "display.max_columns": None,
    "display.width": None,
    "display.colheader_justify": "center",
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Display as a table in Colab
from IPython.display import display
display(topic_info)

# Save the table and offer it for download.
output_path = "topic_info.xlsx"
topic_info.to_excel(output_path, index=False)
print(f"Saved topic information: {output_path}")

files.download(output_path)

2.3 Extract representative keywords for each topic

In [None]:
# Print the ten highest-weighted keywords of every topic except -1.
print("Representative keywords for each topic:")
for topic_id in topic_model.get_topics():
    if topic_id != -1:
        print(f"\nTopic {topic_id}:")
        top_terms = topic_model.get_topic(topic_id)[:10]
        for word, weight in top_terms:
            print(f"  - {word} ({weight:.4f})")

# Compute similarity matrix using topic embeddings
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

embeddings = topic_model.topic_embeddings_
similarity_matrix = cosine_similarity(embeddings)

2.4 Visualize sentence clusters

In [None]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every cleaned comment so it can be projected to 2-D.
embeddings = embedding_model.encode(comments_cleaned, show_progress_bar=True)

# Reduce dimensions using UMAP
from umap import UMAP
import matplotlib.pyplot as plt
import numpy as np

umap_model = UMAP(n_components=2, random_state=42)
embeddings_2d = umap_model.fit_transform(embeddings)

# Visualize
unique_topics = sorted(set(topics))
# plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap accepts the
# same (name, lut) arguments.
colors = plt.get_cmap('tab20', len(unique_topics))
topic_array = np.asarray(topics)

plt.figure(figsize=(10, 8))
# Colour by position in `unique_topics`, not by raw topic id: the ids include
# -1 for noise, which is outside the colormap's index range and would not get
# a colour of its own.
for color_index, topic in enumerate(unique_topics):
    in_topic = topic_array == topic
    plt.scatter(embeddings_2d[in_topic, 0], embeddings_2d[in_topic, 1],
                label=f"Topic {topic}", alpha=0.5, s=30, color=colors(color_index))

plt.title("Visualize sentence clusters (UMAP + Topic Color)")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.grid(True)
plt.show()

2.5 Visualize hierarchical_topics

In [None]:
# Build the topic hierarchy from the documents the model was fitted on;
# `hierarchical_topics` is not defined anywhere else in the notebook.
hierarchical_topics = topic_model.hierarchical_topics(comments_cleaned)
fig_hierarchy = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

fig_hierarchy.show()

2.6 Visualize BERTopic

In [None]:
# Render BERTopic's topic overview figure (the same call is made right after training).
fig_topics = topic_model.visualize_topics()

fig_topics.show()

2.7 Visualize Barchart

In [None]:
# Bar charts of the top keywords per topic.
# NOTE(review): 137 is hard-coded — presumably the topic count of one particular run; confirm against topic_info.
fig_barchart = topic_model.visualize_barchart(top_n_topics=137)

fig_barchart.show()

2.8 Visualize heatmap

In [None]:
# Heatmap view of the fitted topics, produced by BERTopic's visualize_heatmap.
fig_heatmap = topic_model.visualize_heatmap()

fig_heatmap.show()

2.9 Group topics into meta-topics with the Louvain algorithm

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import community.community_louvain as community_louvain
from tqdm import tqdm
import numpy as np

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# 1. Compute topic-wise average embedding vectors
topic_ids = sorted(set(topics))

# Collect the sentences of every topic in one pass instead of rescanning the
# full topic list once per topic.
sentences_by_topic = {tid: [] for tid in topic_ids}
for doc_index, tid in enumerate(topics):
    sentences_by_topic[tid].append(comments_cleaned[doc_index])

topic_vectors = []
for tid in tqdm(topic_ids, desc="Embedding topics"):
    # Every id comes from `topics`, so each topic has at least one sentence.
    topic_embedding = embedding_model.encode(sentences_by_topic[tid])
    topic_vectors.append(np.mean(topic_embedding, axis=0))

# 2. Compute similarity matrix
similarity_matrix = cosine_similarity(topic_vectors)

# 3. Connect topics whose mean embeddings are more similar than the threshold.
SIMILARITY_THRESHOLD = 0.7
G = nx.Graph()
for i in tqdm(range(len(topic_ids)), desc="Generating graph edges"):
    for j in range(i + 1, len(topic_ids)):
        weight = similarity_matrix[i][j]

        if weight > SIMILARITY_THRESHOLD:
            G.add_edge(topic_ids[i], topic_ids[j], weight=weight)

# Louvain community detection (seeded so the meta-topics are repeatable)
partition = community_louvain.best_partition(G, random_state=42)

# Assign isolated topics to meta-topics: topics without any edge are not in
# the graph, so each one gets a fresh meta-topic id of its own.
isolated_topics = list(set(topic_ids) - set(partition.keys()))
if isolated_topics:
    next_meta_topic = max(partition.values()) + 1 if partition else 0
    for iso_tid in isolated_topics:
        partition[iso_tid] = next_meta_topic
        next_meta_topic += 1

# Invert the mapping: meta-topic id -> list of member topic ids.
meta_topic_dict = {}
for topic_id, community_id in partition.items():
    meta_topic_dict.setdefault(community_id, []).append(topic_id)

# Result
for group, topics_in_group in meta_topic_dict.items():
    print(f"\n Meta-topic {group}:")
    print(f"Topic IDs: {topics_in_group}")
    for tid in topics_in_group:
        words = topic_model.get_topic(tid)[:5]
        keywords = ", ".join([w[0] for w in words])
        print(f"  - Topic {tid}: {keywords}")

In [None]:
import pandas as pd

# Only rows with preprocessed text were passed to BERTopic, so `topics` has
# one entry per such row. Select the same rows before attaching the ids.
df_with_topic = df[df["Review_preprocessed"].notna()].copy()
if len(df_with_topic) != len(topics):
    raise ValueError(
        f"Got {len(topics)} topic assignments for {len(df_with_topic)} reviews with text; "
        "re-run the BERTopic cell on the currently loaded data."
    )
df_with_topic["topic_id"] = topics

# Mapping from topic_id to meta_topic_id
topic_to_meta = partition  # Results from Louvain clustering

df_with_topic["meta_topic_id"] = df_with_topic["topic_id"].map(topic_to_meta)

# Exclude -1 labels considered as noise
df_with_topic = df_with_topic[df_with_topic["topic_id"] != -1]

df_with_topic = df_with_topic.sort_values(by=["meta_topic_id", "topic_id"]).reset_index(drop=True)

print("Review data grouped by meta-topic:")
print(df_with_topic[["meta_topic_id", "topic_id", "Review_preprocessed"]].head())

output_file = "meta_topic_grouped_reviews.xlsx"
df_with_topic.to_excel(output_file, index=False)

from google.colab import files
files.download(output_file)

In [None]:
# Create table of topic–meta-topic–keywords (+frequency)
keyword_columns = ["meta_topic_id", "topic_id", "keyword", "score"]
rows = []

for topic_id, meta_topic in partition.items():
    # Topic -1 is the noise label and is left out of the table.
    if topic_id == -1:
        continue

    keywords = topic_model.get_topic(topic_id)

    # Keep the ten highest-ranked keywords of each topic.
    for word, weight in keywords[:10]:
        rows.append({
            "meta_topic_id": meta_topic,
            "topic_id": topic_id,
            "keyword": word,
            "score": round(weight, 4)
        })

# Naming the columns explicitly keeps the sort below valid even when no
# topic produced any rows.
df_keywords = pd.DataFrame(rows, columns=keyword_columns)

df_keywords = df_keywords.sort_values(by=["meta_topic_id", "topic_id", "score"], ascending=[True, True, False]).reset_index(drop=True)

output_keywords_file = "meta_topic_keywords_with_scores.xlsx"
df_keywords.to_excel(output_keywords_file, index=False)

from google.colab import files
files.download(output_keywords_file)

# 3. Code for measuring independent variables

In [None]:
# Install required packages
!pip install scipy transformers pandas openpyxl tqdm matplotlib datasets scikit-learn --quiet

3.1 App experience

In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from google.colab import files

# Upload file
uploaded = files.upload()
filename = list(uploaded.keys())[0]
df = pd.read_excel(filename)

# The scoring step reads 'Review_Raw', so the error names that column.
if 'Review_Raw' not in df.columns:
    raise ValueError("The 'Review_Raw' column does not exist.")

# Load the zero-shot classification model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define relevance labels for App Experience: (candidate label, weight on the 0-1 scale)
app_experience_labels = [
    ("Completely Unrelated", 0.0),
    ("Slightly Unrelated", 0.25),
    ("Neutral", 0.5),
    ("Somewhat Related", 0.75),
    ("Strongly Related", 1.0)
]

def classify_app_experience_score(comment):
    """Score how strongly one review relates to the app experience.

    Runs the module-level zero-shot ``classifier`` over the labels in
    ``app_experience_labels`` and returns the score-weighted average of the
    label weights.

    Args:
        comment: Review text. Missing or blank values score 0.0.

    Returns:
        A float between 0 and 1, rounded to four decimals.

    Raises:
        ValueError: If the classifier output has no 'labels'/'scores'.
    """
    if pd.isna(comment) or not str(comment).strip():
        return 0.0

    label_weights = dict(app_experience_labels)

    # `return_all_scores` is not a zero-shot pipeline option, so it is not passed.
    output = classifier(
        str(comment),
        candidate_labels=list(label_weights),
        hypothesis_template="This comment is {} to the app experience."
    )

    # A single input normally yields one dict; unwrap a one-element list.
    if isinstance(output, list) and len(output) == 1:
        output = output[0]
    if not (isinstance(output, dict) and "labels" in output and "scores" in output):
        raise ValueError("Unexpected model output format.")

    # The pipeline returns labels sorted by score, not in candidate order, so
    # each weight is looked up by label name rather than by position.
    weighted = sum(
        score * label_weights[label]
        for label, score in zip(output["labels"], output["scores"])
    )
    return round(weighted, 4)

import os  # local import: used only to derive the output file name

tqdm.pandas()
df["AppExperienceRelevance_score"] = df["Review_Raw"].progress_apply(classify_app_experience_score)

# Build the name from the file stem so the export can never collide with the
# uploaded file, whatever its extension or letter case.
output_file = os.path.splitext(filename)[0] + "_app_experience_relevance_scored.xlsx"
df.to_excel(output_file, index=False)
files.download(output_file)

print("Finished: AppExperienceRelevance_score (0~1).")

3.2 Chatbot interaction

In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from google.colab import files

# Upload file
uploaded = files.upload()
filename = list(uploaded.keys())[0]
df = pd.read_excel(filename)

# The scoring step reads 'Review_Raw', so the error names that column.
if 'Review_Raw' not in df.columns:
    raise ValueError("The 'Review_Raw' column does not exist.")

# Load the zero-shot classification model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define relevance labels for Chatbot Interaction: (candidate label, weight on the 0-1 scale)
chatbot_labels = [
    ("Completely Unrelated", 0.0),
    ("Slightly Unrelated", 0.25),
    ("Neutral", 0.5),
    ("Somewhat Related", 0.75),
    ("Strongly Related", 1.0)
]

def classify_chatbot_interaction_score(comment):
    """Score how strongly one review relates to the chatbot interaction.

    Runs the module-level zero-shot ``classifier`` over the labels in
    ``chatbot_labels`` and returns the score-weighted average of the label
    weights.

    Args:
        comment: Review text. Missing or blank values score 0.0.

    Returns:
        A float between 0 and 1, rounded to four decimals.

    Raises:
        ValueError: If the classifier output has no 'labels'/'scores'.
    """
    if pd.isna(comment) or not str(comment).strip():
        return 0.0

    label_weights = dict(chatbot_labels)

    # `return_all_scores` is not a zero-shot pipeline option, so it is not passed.
    output = classifier(
        str(comment),
        candidate_labels=list(label_weights),
        hypothesis_template="This comment is {} to the chatbot interaction."
    )

    # A single input normally yields one dict; unwrap a one-element list.
    if isinstance(output, list) and len(output) == 1:
        output = output[0]
    if not (isinstance(output, dict) and "labels" in output and "scores" in output):
        raise ValueError("Unexpected model output format.")

    # The pipeline returns labels sorted by score, not in candidate order, so
    # each weight is looked up by label name rather than by position.
    weighted = sum(
        score * label_weights[label]
        for label, score in zip(output["labels"], output["scores"])
    )
    return round(weighted, 4)

import os  # local import: used only to derive the output file name

tqdm.pandas()
df["ChatbotInteractionRelevance_score"] = df["Review_Raw"].progress_apply(classify_chatbot_interaction_score)

# Build the name from the file stem so the export can never collide with the
# uploaded file, whatever its extension or letter case.
output_file = os.path.splitext(filename)[0] + "_chatbot_interaction_relevance_scored.xlsx"
df.to_excel(output_file, index=False)
files.download(output_file)

print("Finished: ChatbotInteractionRelevance_score (0~1).")

3.3 Learning support

In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from google.colab import files

# Upload file
uploaded = files.upload()
filename = list(uploaded.keys())[0]
df = pd.read_excel(filename)

# The scoring step reads 'Review_Raw', so the error names that column.
if 'Review_Raw' not in df.columns:
    raise ValueError("The 'Review_Raw' column does not exist.")

# Load the zero-shot classification model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define relevance labels for Learning Support: (candidate label, weight on the 0-1 scale)
learning_support_labels = [
    ("Completely Unrelated", 0.0),
    ("Slightly Unrelated", 0.25),
    ("Neutral", 0.5),
    ("Somewhat Related", 0.75),
    ("Strongly Related", 1.0)
]

def classify_learning_support_score(comment):
    """Score how strongly one review relates to learning support.

    Runs the module-level zero-shot ``classifier`` over the labels in
    ``learning_support_labels`` and returns the score-weighted average of the
    label weights.

    Args:
        comment: Review text. Missing or blank values score 0.0.

    Returns:
        A float between 0 and 1, rounded to four decimals.

    Raises:
        ValueError: If the classifier output has no 'labels'/'scores'.
    """
    if pd.isna(comment) or not str(comment).strip():
        return 0.0

    label_weights = dict(learning_support_labels)

    # `return_all_scores` is not a zero-shot pipeline option, so it is not passed.
    output = classifier(
        str(comment),
        candidate_labels=list(label_weights),
        hypothesis_template="This comment is {} to the learning support."
    )

    # A single input normally yields one dict; unwrap a one-element list.
    if isinstance(output, list) and len(output) == 1:
        output = output[0]
    if not (isinstance(output, dict) and "labels" in output and "scores" in output):
        raise ValueError("Unexpected model output format.")

    # The pipeline returns labels sorted by score, not in candidate order, so
    # each weight is looked up by label name rather than by position.
    weighted = sum(
        score * label_weights[label]
        for label, score in zip(output["labels"], output["scores"])
    )
    return round(weighted, 4)

import os  # local import: used only to derive the output file name

tqdm.pandas()
df["LearningSupportRelevance_score"] = df["Review_Raw"].progress_apply(classify_learning_support_score)

# Build the name from the file stem so the export can never collide with the
# uploaded file, whatever its extension or letter case.
output_file = os.path.splitext(filename)[0] + "_learning_support_relevance_scored.xlsx"
df.to_excel(output_file, index=False)
files.download(output_file)

print("Finished: LearningSupportRelevance_score (0~1).")

3.4 Feature request

In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from google.colab import files

# Upload file
uploaded = files.upload()
filename = list(uploaded.keys())[0]
df = pd.read_excel(filename)

# The scoring step reads 'Review_Raw', so the error names that column.
if 'Review_Raw' not in df.columns:
    raise ValueError("The 'Review_Raw' column does not exist.")

# Load the zero-shot classification model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define relevance labels for Feature Request: (candidate label, weight on the 0-1 scale)
feature_request_labels = [
    ("Completely Unrelated", 0.0),
    ("Slightly Unrelated", 0.25),
    ("Neutral", 0.5),
    ("Somewhat Related", 0.75),
    ("Strongly Related", 1.0)
]

def classify_feature_request_score(comment):
    """Score how strongly one review relates to a feature request.

    Runs the module-level zero-shot ``classifier`` over the labels in
    ``feature_request_labels`` and returns the score-weighted average of the
    label weights.

    Args:
        comment: Review text. Missing or blank values score 0.0.

    Returns:
        A float between 0 and 1, rounded to four decimals.

    Raises:
        ValueError: If the classifier output has no 'labels'/'scores'.
    """
    if pd.isna(comment) or not str(comment).strip():
        return 0.0

    label_weights = dict(feature_request_labels)

    # `return_all_scores` is not a zero-shot pipeline option, so it is not passed.
    output = classifier(
        str(comment),
        candidate_labels=list(label_weights),
        hypothesis_template="This comment is {} to the feature request."
    )

    # A single input normally yields one dict; unwrap a one-element list.
    if isinstance(output, list) and len(output) == 1:
        output = output[0]
    if not (isinstance(output, dict) and "labels" in output and "scores" in output):
        raise ValueError("Unexpected model output format.")

    # The pipeline returns labels sorted by score, not in candidate order, so
    # each weight is looked up by label name rather than by position.
    weighted = sum(
        score * label_weights[label]
        for label, score in zip(output["labels"], output["scores"])
    )
    return round(weighted, 4)

import os  # local import: used only to derive the output file name

tqdm.pandas()
df["FeatureRequestRelevance_score"] = df["Review_Raw"].progress_apply(classify_feature_request_score)

# Build the name from the file stem so the export can never collide with the
# uploaded file, whatever its extension or letter case.
output_file = os.path.splitext(filename)[0] + "_feature_request_relevance_scored.xlsx"
df.to_excel(output_file, index=False)
files.download(output_file)

print("Finished: FeatureRequestRelevance_score (0~1).")

3.5 Bias concerns

In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from google.colab import files

# Upload file
uploaded = files.upload()
filename = list(uploaded.keys())[0]
df = pd.read_excel(filename)

# The scoring step reads 'Review_Raw', so the error names that column.
if 'Review_Raw' not in df.columns:
    raise ValueError("The 'Review_Raw' column does not exist.")

# Load the zero-shot classification model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define relevance labels for Bias Concerns: (candidate label, weight on the 0-1 scale)
bias_concerns_labels = [
    ("Completely Unrelated", 0.0),
    ("Slightly Unrelated", 0.25),
    ("Neutral", 0.5),
    ("Somewhat Related", 0.75),
    ("Strongly Related", 1.0)
]

def classify_bias_concerns_score(comment):
    """Score how strongly one review relates to bias concerns.

    Runs the module-level zero-shot ``classifier`` over the labels in
    ``bias_concerns_labels`` and returns the score-weighted average of the
    label weights.

    Args:
        comment: Review text. Missing or blank values score 0.0.

    Returns:
        A float between 0 and 1, rounded to four decimals.

    Raises:
        ValueError: If the classifier output has no 'labels'/'scores'.
    """
    if pd.isna(comment) or not str(comment).strip():
        return 0.0

    label_weights = dict(bias_concerns_labels)

    # `return_all_scores` is not a zero-shot pipeline option, so it is not passed.
    output = classifier(
        str(comment),
        candidate_labels=list(label_weights),
        hypothesis_template="This comment is {} to the bias concerns."
    )

    # A single input normally yields one dict; unwrap a one-element list.
    if isinstance(output, list) and len(output) == 1:
        output = output[0]
    if not (isinstance(output, dict) and "labels" in output and "scores" in output):
        raise ValueError("Unexpected model output format.")

    # The pipeline returns labels sorted by score, not in candidate order, so
    # each weight is looked up by label name rather than by position.
    weighted = sum(
        score * label_weights[label]
        for label, score in zip(output["labels"], output["scores"])
    )
    return round(weighted, 4)

import os  # local import: used only to derive the output file name

tqdm.pandas()
df["BiasConcernsRelevance_score"] = df["Review_Raw"].progress_apply(classify_bias_concerns_score)

# Build the name from the file stem so the export can never collide with the
# uploaded file, whatever its extension or letter case.
output_file = os.path.splitext(filename)[0] + "_bias_concerns_relevance_scored.xlsx"
df.to_excel(output_file, index=False)
files.download(output_file)

print("Finished: BiasConcernsRelevance_score (0~1).")

# 4. Code for measuring moderators

4.1 Human-likeness

In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from google.colab import files

# Upload file
uploaded = files.upload()
filename = list(uploaded.keys())[0]

df = pd.read_excel(filename)

# The scoring step reads 'Review_Raw', so the error names that column.
if 'Review_Raw' not in df.columns:
    raise ValueError("The 'Review_Raw' column does not exist.")

# Load the zero-shot classification model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define labels for Humanlikeness: (candidate label, weight on the 0-1 scale).
# The machine-side labels were misspelled "Machianlike"; they are sent to the
# model as text, so the spelling is corrected here.
humanlikeness_labels = [
    ("Very Machinelike", 0.0),
    ("Somewhat Machinelike", 0.25),
    ("Neutral", 0.5),
    ("Somewhat Humanlike", 0.75),
    ("Very Humanlike", 1.0)
]

def classify_humanlikeness_score(comment):
    """Score how human-like one review describes the AI to be.

    Runs the module-level zero-shot ``classifier`` over the labels in
    ``humanlikeness_labels`` and returns the score-weighted average of the
    label weights (0 = very machinelike, 1 = very humanlike).

    Args:
        comment: Review text. Missing or blank values score 0.0.

    Returns:
        A float between 0 and 1, rounded to four decimals.

    Raises:
        ValueError: If the classifier output has no 'labels'/'scores'.
    """
    if pd.isna(comment) or not str(comment).strip():
        return 0.0

    label_weights = dict(humanlikeness_labels)

    # `return_all_scores` is not a zero-shot pipeline option, so it is not passed.
    output = classifier(
        str(comment),
        candidate_labels=list(label_weights),
        hypothesis_template="This comment mention to {} in terms of uncanny valley theory in conversational AI."
    )

    # A single input normally yields one dict; unwrap a one-element list.
    if isinstance(output, list) and len(output) == 1:
        output = output[0]
    if not (isinstance(output, dict) and "labels" in output and "scores" in output):
        raise ValueError("Unexpected model output format.")

    # The pipeline returns labels sorted by score, not in candidate order, so
    # each weight is looked up by label name rather than by position.
    weighted = sum(
        score * label_weights[label]
        for label, score in zip(output["labels"], output["scores"])
    )
    return round(weighted, 4)

import os  # local import: used only to derive the output file name

tqdm.pandas()
df["Humanlikeness_score"] = df["Review_Raw"].progress_apply(classify_humanlikeness_score)

# Build the name from the file stem so the export can never collide with the
# uploaded file, whatever its extension or letter case.
output_file = os.path.splitext(filename)[0] + "_humanlikeness_scored.xlsx"
df.to_excel(output_file, index=False)
files.download(output_file)

print("Finished: Humanlikeness_score (0~1).")

4.2 Familiarity

In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from google.colab import files

# Upload file
uploaded = files.upload()
filename = list(uploaded.keys())[0]

df = pd.read_excel(filename)

# The scoring step reads 'Review_Raw', so the error names that column.
if 'Review_Raw' not in df.columns:
    raise ValueError("The 'Review_Raw' column does not exist.")

# Load the zero-shot classification model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define labels for Familiarity: (candidate label, weight on the 0-1 scale)
familiarity_labels = [
    ("Very Unfamiliar", 0.0),
    ("Somewhat Unfamiliar", 0.25),
    ("Neutral", 0.5),
    ("Somewhat Familiar", 0.75),
    ("Very Familiar", 1.0)
]

def classify_familiarity_score(comment):
    """Score how familiar one review describes the AI to be.

    Runs the module-level zero-shot ``classifier`` over the labels in
    ``familiarity_labels`` and returns the score-weighted average of the
    label weights (0 = very unfamiliar, 1 = very familiar).

    Args:
        comment: Review text. Missing or blank values score 0.0.

    Returns:
        A float between 0 and 1, rounded to four decimals.

    Raises:
        ValueError: If the classifier output has no 'labels'/'scores'.
    """
    if pd.isna(comment) or not str(comment).strip():
        return 0.0

    label_weights = dict(familiarity_labels)

    # `return_all_scores` is not a zero-shot pipeline option, so it is not passed.
    output = classifier(
        str(comment),
        candidate_labels=list(label_weights),
        hypothesis_template="This comment mention to {} in terms of uncanny valley theory in conversational AI."
    )

    # A single input normally yields one dict; unwrap a one-element list.
    if isinstance(output, list) and len(output) == 1:
        output = output[0]
    if not (isinstance(output, dict) and "labels" in output and "scores" in output):
        raise ValueError("Unexpected model output format.")

    # The pipeline returns labels sorted by score, not in candidate order, so
    # each weight is looked up by label name rather than by position.
    weighted = sum(
        score * label_weights[label]
        for label, score in zip(output["labels"], output["scores"])
    )
    return round(weighted, 4)

import os  # local import: used only to derive the output file name

tqdm.pandas()
df["Familiarity_score"] = df["Review_Raw"].progress_apply(classify_familiarity_score)

# Build the name from the file stem so the export can never collide with the
# uploaded file, whatever its extension or letter case.
output_file = os.path.splitext(filename)[0] + "_familiarity_scored.xlsx"
df.to_excel(output_file, index=False)
files.download(output_file)

print("Finished: Familiarity_score (0~1).")

# 5. Code for measuring mediator

5.1 Subjectivity

In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from google.colab import files

# Upload file
uploaded = files.upload()
filename = list(uploaded.keys())[0]

df = pd.read_excel(filename)

# The scoring step reads 'Review_Raw', so the error names that column.
if 'Review_Raw' not in df.columns:
    raise ValueError("The 'Review_Raw' column does not exist.")

# Load the zero-shot classification model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define labels for Subjectivity: (candidate label, weight on the 0-1 scale)
subjectivity_labels = [
    ("Very Objective", 0.0),
    ("Somewhat Objective", 0.25),
    ("Neutral", 0.5),
    ("Somewhat Subjective", 0.75),
    ("Very Subjective", 1.0)
]

def classify_subjectivity_score(comment):
    """Score how subjective one review is.

    Runs the module-level zero-shot ``classifier`` over the labels in
    ``subjectivity_labels`` and returns the score-weighted average of the
    label weights (0 = very objective, 1 = very subjective).

    Args:
        comment: Review text. Missing or blank values score 0.0.

    Returns:
        A float between 0 and 1, rounded to four decimals.

    Raises:
        ValueError: If the classifier output has no 'labels'/'scores'.
    """
    if pd.isna(comment) or not str(comment).strip():
        return 0.0

    label_weights = dict(subjectivity_labels)

    # `return_all_scores` is not a zero-shot pipeline option, so it is not passed.
    output = classifier(
        str(comment),
        candidate_labels=list(label_weights),
        hypothesis_template="This sentence expresses a {} opinion."
    )

    # A single input normally yields one dict; unwrap a one-element list.
    if isinstance(output, list) and len(output) == 1:
        output = output[0]
    if not (isinstance(output, dict) and "labels" in output and "scores" in output):
        raise ValueError("Unexpected model output format.")

    # The pipeline returns labels sorted by score, not in candidate order, so
    # each weight is looked up by label name rather than by position.
    weighted = sum(
        score * label_weights[label]
        for label, score in zip(output["labels"], output["scores"])
    )
    return round(weighted, 4)

import os  # local import: used only to derive the output file name

tqdm.pandas()
df["Subjectivity_score"] = df["Review_Raw"].progress_apply(classify_subjectivity_score)

# Build the name from the file stem so the export can never collide with the
# uploaded file, whatever its extension or letter case.
output_file = os.path.splitext(filename)[0] + "_subjectivity_scored.xlsx"
df.to_excel(output_file, index=False)
files.download(output_file)

print("Finished: Subjectivity_score (0~1).")

# 6. Code for measuring dependent variables

6.1 Code for sentiment analysis based on RoBERTa model

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm import tqdm
from google.colab import files

# Use the GPU when Colab provides one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Upload file
uploaded = files.upload()
filename = list(uploaded.keys())[0]

df = pd.read_excel(filename)
# The analysis reads 'Review_Raw', so the error names that column.
if 'Review_Raw' not in df.columns:
    raise ValueError("The 'Review_Raw' column does not exist.")
# Missing reviews become empty strings and every other value is coerced to
# str, so a non-text cell cannot make the tokenizer fail for a whole batch
# (a failed batch is scored 0.0 by the loop below).
df['Review_Raw'] = df['Review_Raw'].fillna("").astype(str)

MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)

# Sentiment analysis function (batch mode)
def analyze_sentiment_batch(texts):
    """Return class probabilities for a batch of texts.

    The texts are tokenized (padded, truncated to 128 tokens), moved to the
    module-level ``device`` and run through ``model`` without gradients.

    Args:
        texts: List of strings to classify.

    Returns:
        Array of softmax probabilities, one row per input text.
    """
    encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)
    with torch.no_grad():
        logits = model(**on_device).logits
    return softmax(logits.cpu().numpy(), axis=1)  # shape: (batch_size, 3)

batch_size = 32
scores = []
batch_starts = range(0, len(df), batch_size)

# Score the reviews batch by batch; a failing batch is recorded as 0.0 for
# each of its rows so `scores` stays aligned with the frame.
for i in tqdm(batch_starts):
    batch_texts = df['Review_Raw'].iloc[i:i + batch_size].tolist()
    try:
        probs = analyze_sentiment_batch(batch_texts)
        # Columns are read as (negative, neutral, positive).
        for neg, neu, pos in probs:
            scores.append(round((pos - neg) * (1 - neu), 4))
    except Exception as e:
        print(f"An error occured at batch {i}: {e}")
        scores.extend([0.0] * len(batch_texts))

df['sentiment_score'] = scores

# Compute the 25th and 75th percentile values
quantile_cutoffs = df['sentiment_score'].quantile([0.25, 0.75])
q25 = quantile_cutoffs.loc[0.25]
q75 = quantile_cutoffs.loc[0.75]

print(f"Quantile cutoff values — Q25: {q25:.4f}, Q75: {q75:.4f}")

def label_sentiment(score):
    """Map a sentiment score to a label using the module-level quantile cutoffs.

    Args:
        score: Compound sentiment score of one review.

    Returns:
        2 (Negative) when the score is at or below ``q25``, otherwise
        1 (Positive) when it is at or above ``q75``, otherwise 3 (Neutral).
    """
    # The negative check comes first, so it wins if the two cutoffs coincide.
    if score <= q25:
        return 2  # Negative
    if score >= q75:
        return 1  # Positive
    return 3  # Neutral

import os  # local import: used only to derive the output file name

# Attach the quantile-based label (1 = Positive, 2 = Negative, 3 = Neutral).
df['sentiment_label'] = df['sentiment_score'].apply(label_sentiment)

# Build the name from the file stem so the export can never collide with the
# uploaded file, whatever its extension or letter case.
output_file = os.path.splitext(filename)[0] + "_roberta_sentiment_labeled.xlsx"
df.to_excel(output_file, index=False)
files.download(output_file)

print(f"Finished: RoBERTa sentiment analysis and quantile labeling completed (1 = Positive ≥ {q75:.2f}, 2 = Negative ≤ {q25:.2f}, 3 = Neutral)")