video url: https://youtu.be/qI2jD712b-I?si=3Rb0qCLzsdAzL4d2

In [74]:
import pandas as pd
from __future__ import annotations

### 1. Parameter-based experimetns

##### Confidence Threshold Analysis

In [50]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [51]:
df = pd.read_csv("youtube_comments.csv")

In [52]:
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
    tokenizer="cardiffnlp/twitter-xlm-roberta-base-sentiment"
)

sentiment_pipeline("Bu mahnƒ± √ßox g√∂z…ôldir")




Device set to use cpu


[{'label': 'positive', 'score': 0.8690898418426514}]

In [53]:
def classify_sentiment(text, threshold=0.6):
    result = sentiment_pipeline(text[:512])[0] 
    label = result["label"].lower()
    score = result["score"]

    if score < threshold:
        return "Neutral"

    if label == "positive":
        return "Positive"
    elif label == "negative":
        return "Negative"
    else:
        return "Neutral"

In [54]:
def sentiment_distribution(df, threshold):
    sentiments = df["comment_text"].apply(
        lambda x: classify_sentiment(x, threshold)
    )
    return sentiments.value_counts()

In [55]:
dist_06 = sentiment_distribution(df, threshold=0.6)
dist_08 = sentiment_distribution(df, threshold=0.8)

print("\nThreshold = 0.6")
print(dist_06)

print("\nThreshold = 0.8")
print(dist_08)


Threshold = 0.6
comment_text
Neutral     57
Positive    29
Name: count, dtype: int64

Threshold = 0.8
comment_text
Neutral     72
Positive    14
Name: count, dtype: int64


##### Comment Length Filtering

In [56]:
df = pd.read_csv("youtube_comments.csv")
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
    tokenizer="cardiffnlp/twitter-xlm-roberta-base-sentiment",
)

Device set to use cpu


In [57]:
def classify_sentiment(text: str, threshold: float) -> str:
    """
    Predict sentiment using the model.
    If model confidence < threshold -> Neutral.
    """
    if not isinstance(text, str) or not text.strip():
        return "Neutral"

    # Model input length safety
    result = sentiment_pipeline(text[:512])[0]
    label = result["label"].lower()
    score = float(result["score"])

    if score < threshold:
        return "Neutral"

    if label == "positive":
        return "Positive"
    if label == "negative":
        return "Negative"
    return "Neutral"

def word_count(text: str) -> int:
    if not isinstance(text, str):
        return 0
    return len(text.split())

def run_length_filter_experiment(df: pd.DataFrame, min_words: int, threshold: float) -> dict:
    """
    Filter by minimum word count, run sentiment classification,
    return counts for Positive/Negative/Neutral and total N.
    """
    filtered = df[df["comment_text"].apply(word_count) >= min_words].copy()

    sentiments = filtered["comment_text"].apply(lambda t: classify_sentiment(t, threshold))
    counts = sentiments.value_counts().to_dict()

    return {
        "min_words": min_words,
        "threshold": threshold,
        "N": len(filtered),
        "Positive": counts.get("Positive", 0),
        "Negative": counts.get("Negative", 0),
        "Neutral": counts.get("Neutral", 0),
    }

In [58]:
MIN_WORD_FILTERS = [5, 7]
THRESHOLDS = [0.6, 0.8]

results = []
for mw in MIN_WORD_FILTERS:
    for th in THRESHOLDS:
        results.append(run_length_filter_experiment(df, min_words=mw, threshold=th))

results_df = pd.DataFrame(results)

print("\n=== Results (Counts) ===")
print(results_df)


=== Results (Counts) ===
   min_words  threshold   N  Positive  Negative  Neutral
0          5        0.6  43         9         0       34
1          5        0.8  43         3         0       40
2          7        0.6  35         7         0       28
3          7        0.8  35         3         0       32


In [59]:
pct_df = results_df.copy()
for col in ["Positive", "Negative", "Neutral"]:
    pct_df[col] = (pct_df[col] / pct_df["N"] * 100).round(2)

print("\n=== Results (Percent %) ===")
print(pct_df)


=== Results (Percent %) ===
   min_words  threshold   N  Positive  Negative  Neutral
0          5        0.6  43     20.93       0.0    79.07
1          5        0.8  43      6.98       0.0    93.02
2          7        0.6  35     20.00       0.0    80.00
3          7        0.8  35      8.57       0.0    91.43


### 2. Analytical Tasks

1.	Emoji‚ÄìText Inconsistency. Identify comments where the sentiment expressed by emojis contradicts the sentiment of the textual content.

In [60]:
POSITIVE_EMOJIS = {"‚ù§", "ü´†", "üòç", "üëè", "üòä", "ü©µ", "üéâ", "üòÖ", "ü•∞", "üòá", "üòå", "ü´∂"}
NEGATIVE_EMOJIS = {"üò∂", "üò¢"}

In [61]:
def emoji_sentiment(emojis):
    if not isinstance(emojis, str) or not emojis.strip():
        return "Neutral"

    emoji_set = set(emojis.split())

    if emoji_set & POSITIVE_EMOJIS:
        return "Positive"
    if emoji_set & NEGATIVE_EMOJIS:
        return "Negative"
    return "Neutral"

In [62]:
df["text_sentiment"] = df["comment_text"].apply(lambda t: classify_sentiment(t, threshold=0.6))
df["emoji_sentiment"] = df["emojis"].apply(emoji_sentiment)

inconsistent = df[
    (df["text_sentiment"] != df["emoji_sentiment"]) & (df["emoji_sentiment"] != "Neutral")
]

inconsistent.to_csv("emojiText_inconsistency.csv", index=False, encoding="utf-8-sig")
inconsistent.head()

Unnamed: 0,comment_text,like_count,reply_count,emojis,text_sentiment,emoji_sentiment
0,"Dinl…ôy…ônd…ô m…ôn…ô d…ô x…ôb…ôr el…ôyin ""kims…ô ≈ü…ôrhini...",528,33,üò∂,Neutral,Negative
3,Bu g√ºn m…ôn…ô bu mahnƒ±nƒ± ≈û√∂vk…ôt ∆èl…ôkb…ôrova yazma...,2,1,‚ù§,Neutral,Positive
9,H√ºsn√ºn…ô he√ß bir s√∂z olmaz\r\nG√∂zl…ôrin yaman g√∂...,0,0,‚ù§,Neutral,Positive
11,"sanƒ±ram d√ºnya m…ônimdir, 'g√∂z√ºm…ô' -'g√∂z√ºn' d…ôy…ô...",4,0,ü´†,Neutral,Positive
13,Sizin mahnƒ±larƒ±nƒ±zƒ± √ßox sevir…ôm ancaq indi siz‚Ä¶,2,0,‚ù§,Neutral,Positive


2.	Semantically Similar Comments. Identify groups of comments with high semantic similarity (e.g., using cosine similarity).

In [63]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [64]:
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
texts = df["comment_text"].fillna("").astype(str).tolist()

embeddings = model.encode(texts, show_progress_bar=True)
sim_matrix = cosine_similarity(embeddings)

from collections import defaultdict, deque

THRESHOLD = 0.85
n = len(texts)

adj = defaultdict(list)
for i in range(n):
    for j in range(i + 1, n):
        if sim_matrix[i, j] >= THRESHOLD:
            adj[i].append(j)
            adj[j].append(i)

visited = set()
groups = []

for i in range(n):
    if i in visited:
        continue
    if i not in adj:
        visited.add(i)
        continue

    q = deque([i])
    visited.add(i)
    comp = [i]

    while q:
        u = q.popleft()
        for v in adj[u]:
            if v not in visited:
                visited.add(v)
                q.append(v)
                comp.append(v)

    groups.append(comp)

group_rows = []
for g_id, idxs in enumerate(sorted(groups, key=len, reverse=True), start=1):
    for idx in idxs:
        group_rows.append({
            "group_id": g_id,
            "comment_index": idx,
            "comment_text": texts[idx]
        })

groups_df = pd.DataFrame(group_rows)
print(groups_df.head(30))

groups_df.to_csv("semantic_similarity_groups.csv", index=False, encoding="utf-8-sig")
print("Saved: semantic_similarity_groups.csv")

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 040e70c4-954e-4719-80c8-e7cd57e0746b)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: a4ee8a6e-b63e-4e3c-934b-376a514429d9)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:01<00:00,  1.83it/s]

    group_id  comment_index                                       comment_text
0          1              5  Allah r…ôhm…ôt el…ôsin dahi Az…ôrbaycan m√ºƒü…ônnisi ...
1          1              8  M…ôn M…ôn Az…ôrbaycanƒ± √ßox sevir…ôm, ya≈üasƒ±n qarda...
2          1             10  Allah s…ôn…ô r…ôhm…ôt el…ôsin Dahi S…ôn…ôtkarƒ±mƒ±z ≈û√∂v...
3          1             14  Az…ôrbaycanƒ±n Musiqimizin Zeyn…ôb Xanlarova ≈û√∂vk...
4          1             36  Allah r…ôhm…ôt el…ôsin m…ôkan c…ônn…ôt olsun  g√∂z…ôl ...
5          1             19  Nec…ô g√∂z…ôl , s…ôlist , dialekt…ô , ≈üiv…ôy…ô maraql...
6          1             24  Normalda Azerbaycan mahnƒ±larƒ± sevm…ôr…ôm hel…ô in...
7          1             32  AZ∆èRBAYCANIN M∆èD∆èNƒ∞YY∆èTƒ∞N∆è E≈ûQ  OLSUN. USTADLA...
8          1             58  Beh beh.En √ßox dinl…ôdiyim …ôs…ôr.S…ôni m…ôn yaman ...
9          1              7  Musiqi, ifa, ifa√ßƒ± g√∂z…ôl t…ôbii biz n…ôl…ôr itirm...
10         2             31                               




3.	Semantic Outliers. How many comments significantly deviate from the overall semantic similarity distribution?

In [65]:
avg_similarity = sim_matrix.mean(axis=1)

threshold = avg_similarity.mean() - 2 * avg_similarity.std()
outliers = np.where(avg_similarity < threshold)[0]

len(outliers)

5

4.	Popular Comment Analysis. Identify common words or phrases used in comments with the highest number of likes and replies.

In [66]:
from collections import Counter

In [67]:
top_comments = df.sort_values(
    by=["like_count", "reply_count"],
    ascending=False
).head(20)

words = []
for text in top_comments["comment_text"]:
    words.extend(text.lower().split())

Counter(words).most_common(10)

[('bir', 7),
 ('g√∂z…ôl', 6),
 ('bu', 5),
 ('r…ôhm…ôt', 5),
 ('d…ô', 4),
 ('allah', 4),
 ('el…ôsin', 4),
 ('√ßox', 4),
 ('qulaq', 3),
 ('m…ôn…ô', 2)]

### 3. Semantic Category-Based Sentiment Analysis

In [71]:
def semantic_category(text, sentiment, emojis):
    t = text.lower()

    if sentiment == "Negative" and any(w in t for w in ["pis", "z…ôif", "b…ôy…ônm…ôdim", "s…ôhv"]):
        return "Criticism and dissatisfaction"

    if any(w in t for w in ["s…ôn", "siz", "kanal", "video", "s…ôsiniz", "mahnƒ±", "Mahnƒ±"]):
        return "Direct address to the author"

    if any(w in t for w in ["haha", "bo≈ü", "???"]):
        return "Troll / non-constructive comment"

    if sentiment == "Positive" and (emojis or any(w in t for w in ["…ôla", "super", "m√∂ht…ô≈ü…ôm", "≈üedevr", "bravo"])):
        return "Emotional reaction"

    if sentiment == "Positive":
        return "Rational positive feedback on content"

    return "Emotional reaction"

In [72]:
df["semantic_category"] = df.apply(
    lambda row: semantic_category(
        row["comment_text"],
        row["text_sentiment"],
        row["emojis"]
    ),
    axis=1
)

In [73]:
result_table = (
    df.groupby(["semantic_category", "text_sentiment"])
      .size()
      .unstack(fill_value=0)
)

print(result_table)

text_sentiment                Neutral  Positive
semantic_category                              
Direct address to the author       19         8
Emotional reaction                 38        21
