# Extract youtube video id form link

In [3]:
import re

API_KEY = ""
VIDEO_ID = ""
VIDEO_LINK = "https://youtu.be/-RJ0xGYUHBE?si=pPHpGjkauo_e6m68"
def extract_video_id(url):
    regex = r'(?:https?://)?(?:www\.)?(?:youtube\.com/(?:watch\?v=|shorts/)|youtu\.be/)([a-zA-Z0-9_-]{11})'
    match = re.match(regex, url)

    if match:
        return match.group(1)
    else:
        return None

VIDEO_ID = extract_video_id(VIDEO_LINK)

# gets all comments in video 

In [4]:
import requests
import time

comments = []
next_page_token = None
page_count = 0

while True:
    url = f"https://www.googleapis.com/youtube/v3/commentThreads?part=snippet,replies&videoId={VIDEO_ID}&maxResults=100&key={API_KEY}"
    if next_page_token:
        url += f"&pageToken={next_page_token}"

    response = requests.get(url)
    
    if response.status_code != 200:
        print(f"Error code: {response.status_code}")
        print(response.json())
        break

    data = response.json()
    page_count += 1

    for item in data.get("items", []):
        top_comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
        comments.append(top_comment)

        replies = item.get("replies", {}).get("comments", [])
        for reply in replies:
            reply_text = reply["snippet"]["textDisplay"]
            comments.append(reply_text)

    print(f"{page_count}. page processed. Total comments: {len(comments)}")

    next_page_token = data.get("nextPageToken")
    if not next_page_token:
        break

    time.sleep(0.5)

print(f"\n A total of {len(comments)} comments withdrawn.")

1. page processed. Total comments: 214
2. page processed. Total comments: 294

 A total of 294 comments withdrawn.


In [5]:
from turkishnlp import detector as tr_detector 

tr_lemmatizer = tr_detector.TurkishNLP()
tr_lemmatizer.download()

Download is successful


# we make comments clean according to english or turkish comments 

In [6]:
import re
import string
from nltk.corpus import stopwords as nltk_stopwords
from stop_words import get_stop_words
from nltk.stem import WordNetLemmatizer
import langid

def clean_text_multilingual(text):
    if not isinstance(text, str) or not text.strip():
        return ""
    
    word_count = len(text.split())
    char_count = len(text.replace(" ", ""))

    if word_count < 2 and char_count < 5: # ???
        return ""

    lang, _ = langid.classify(text)

    text = re.sub(r'http\S+', '', text) 
    text = re.sub(r'\d+', '', text) 
    text = text.lower().strip()

    if lang == "tr":
        text = re.sub(r'[^\wçğıöşüÇĞİÖŞÜ ]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        stop_words = set(get_stop_words('turkish'))
        tokens = tr_lemmatizer.list_words(text)
        cleaned_tokens = [word for word in tokens if word not in stop_words]

    else:  
        text = re.sub(r'[^\w]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        stop_words = set(nltk_stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        tokens = text.split()
        cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(cleaned_tokens)

cleaned_comments = [clean_text_multilingual(comment) for comment in comments]

print(cleaned_comments[0:5])

['matter anyone say neither hurrem anyone family brought death prince mustafa janissary brought mustafa death say turk government accept shadow suleyman afraid blow would come son blow father made father', 'a href iklim kıtanın sultanı', 'quot never dare quot used phrase many time br janissary honor liar coward', 'insan kafası değil mısır koçanı kesiyor herif karşısındaki adam profesyonel komutan adamın eli armut topluyor lan cellatlık kurumu var', 'suleyman mannificient could leave capable heir killed capable one left great empire imbecile']


# we extract negative comments according to language 

In [7]:
from transformers import pipeline
import langid 
import re

def detect_language(text):
    try:
        lang, confidence = langid.classify(text)
        return 'tr' if lang == 'tr' else 'en' if lang == 'en' else None
    except:
        return None

MODELS = {
    "en": "distilbert-base-uncased-finetuned-sst-2-english",
    "tr": "savasy/bert-base-turkish-sentiment-cased" 
}

pipelines = {
    "en": pipeline("sentiment-analysis", model=MODELS["en"]),
    "tr": pipeline("sentiment-analysis", model=MODELS["tr"])
}

def analyze_sentiment(comment):
    lang = detect_language(comment)
    if not lang:
        return None

    try:
        result = pipelines[lang](comment)[0]
        label = result["label"].upper()
        score = result["score"]
        
        if lang == "tr":
            label = "NEGATIVE" if label == "LABEL_0" else "POSITIVE"
            
        return {"label": label, "score": score}
    except Exception as e:
        print(f"Analysis error: {str(e)}")
        return None

bad_comments = []

for comment in cleaned_comments:
    analysis = analyze_sentiment(comment)
    if analysis and analysis["label"] == "NEGATIVE" and analysis["score"] > 0.80:
        bad_comments.append(comment)

print(f"Negative comments detected ({len(bad_comments)} unit):")
for com in enumerate(bad_comments[0:5], 1):
    print(com)

  from .autonotebook import tqdm as notebook_tqdm





Device set to use cpu
Device set to use cpu


Negative comments detected (103 unit):
(1, 'matter anyone say neither hurrem anyone family brought death prince mustafa janissary brought mustafa death say turk government accept shadow suleyman afraid blow would come son blow father made father')
(2, 'quot never dare quot used phrase many time br janissary honor liar coward')
(3, 'suleyman mannificient could leave capable heir killed capable one left great empire imbecile')
(4, 'true blinded deep love ruthenian girl hurrem imbecile died falling ground trying cath concubine running bath')
(5, 'even today people mourns prince mustafa death sultan ottoman great field marshall mustafa par sultan however selim lack capability started downfall ottoman empire')


# Define api key

In [8]:
apiKey = ""

# get suggestion in ai model

In [9]:
import numpy as np
import requests
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def generate_summary_with_gemini(comments, apiKey):
    prompt = f"""Summarize user comments below. What are the common complaints or issues? And can you draw a Roadmap for what to pay more attention to next time? Briefly and clearly

    Comments:
    {chr(10).join(f"- {comment}" for comment in comments)}
    """
    data = {
        "contents": [ { "parts": [ { "text": prompt } ] } ]
    }
    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={apiKey}"
    response = requests.post(url, headers={'Content-Type': 'application/json'}, json=data)
    if response.status_code == 200:
        response_data = response.json()
        if 'candidates' in response_data:
            return response_data['candidates'][0].get('content', {}).get('parts', [{}])[0].get('text', None)
    else:
        print(f"API hatası: {response.status_code}")
    return None

def get_representative_comments(comments, model, top_n=3): 
    embeddings = model.encode(comments) 
    centroid = np.mean(embeddings, axis=0) 
    distances = np.linalg.norm(embeddings - centroid, axis=1) 
    sorted_indices = np.argsort(distances) 
    return [comments[i] for i in sorted_indices[:top_n]]

def optimal_k_means(X, max_k=30):
    best_score = -1
    best_k = 2
    for k in range(2, min(max_k, len(X)) + 1):  
        kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42).fit(X)
        score = silhouette_score(X, kmeans.labels_) 
        if score > best_score:
            best_score = score
            best_k = k
    return best_k

def determine_max_k(n_comments):
    if n_comments < 10:
        return 1 
    elif n_comments < 20:
        return 2
    elif n_comments < 35:
        return 3
    elif n_comments < 50:
        return 4
    elif n_comments < 200:
        return 8
    elif n_comments < 500:
        return 12
    elif n_comments < 1000:
        return 20
    return 30


model = SentenceTransformer('paraphrase-MiniLM-L6-v2') 
comment_embeddings = model.encode(bad_comments)

max_clusters = determine_max_k(len(bad_comments)) 
num_clusters = max(1, optimal_k_means(comment_embeddings, max_k=max_clusters)) 

kmeans = KMeans(n_clusters=num_clusters)
clusters = kmeans.fit_predict(comment_embeddings)

grouped_comments = {} 
for idx, cluster_id in enumerate(clusters):
    grouped_comments.setdefault(cluster_id, []).append(bad_comments[idx])

summaries = {}
for cluster_id, comments in grouped_comments.items():
    reps = get_representative_comments(comments, model, top_n=3) 
    print(f"\nComments to send for the cluster {cluster_id}:")
    for r in reps:
        print(f"- {r}")
    summary = generate_summary_with_gemini(reps, apiKey) 
    summaries[cluster_id] = summary

for cluster_id, summary in summaries.items():
    print(f"\n📌 Cluster {cluster_id} Final Summary:\n{summary}\n")


Comments to send for the cluster 1:
- second time chop head janissary high command young phrase say quot younger self br br first one br href artist halit ergenc played magnificently sceenes nervous first janissary revolt confident middle janissary barrack one year rulled
- comment show ur understanding history zero middle age ended renaissance era selim fault tried control janisarries suleiman got sick got mad one listen know cause he fkingg drunk
- sultan osman han like none happened would happened needed show jenessaries place let know major sin islam disobey ruler whether unjust taken head could anything also trusted kosem sultan beginning none would happened

Comments to send for the cluster 0:
- ottoman sultan took white kid knew white bitter warrior like arab mamluks mainly white kid russian georgian greek armenian christian
- ultimate subjugator greek balkan mideast shall bow turk ccc
- problem crimean khanate many khazar turk jew region later destroyed ottoman empire founded 