# Extract youtube video id form link

In [1]:
import re

API_KEY = ""
VIDEO_ID = ""
VIDEO_LINK = "https://youtu.be/-RJ0xGYUHBE?si=pPHpGjkauo_e6m68"
def extract_video_id(url):
    regex = r'(?:https?://)?(?:www\.)?(?:youtube\.com/(?:watch\?v=|shorts/)|youtu\.be/)([a-zA-Z0-9_-]{11})'

    match = re.match(regex, url)
    
    if match:
        return match.group(1) 
    else:
        return None

VIDEO_ID = extract_video_id(VIDEO_LINK)

# Gets all comments in video 

In [2]:
import requests
import time

comments = []
next_page_token = None
page_count = 0

while True:
    url = f"https://www.googleapis.com/youtube/v3/commentThreads?part=snippet,replies&videoId={VIDEO_ID}&maxResults=100&key={API_KEY}"
    if next_page_token:
        url += f"&pageToken={next_page_token}"

    response = requests.get(url)
    
    if response.status_code != 200:
        print(f"Error code: {response.status_code}")
        print(response.json())
        break

    data = response.json()
    page_count += 1

    for item in data.get("items", []):
        top_comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
        comments.append(top_comment)

        replies = item.get("replies", {}).get("comments", [])
        for reply in replies:
            reply_text = reply["snippet"]["textDisplay"]
            comments.append(reply_text)

    print(f"{page_count}. page processed. Total comments: {len(comments)}")

    next_page_token = data.get("nextPageToken")
    if not next_page_token:
        break

    time.sleep(0.5)

print(f"\n A total of {len(comments)} comments withdrawn.")

1. page processed. Total comments: 214
2. page processed. Total comments: 294

 A total of 294 comments withdrawn.


# We make comments clean  

In [3]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
from contractions import fix
import re

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if not isinstance(text, str) or not text.strip():
        return ""
    
    try:
        text = fix(text)
    except Exception as e:
        print(f"Contraction error: {e} | Original text: {text}")
        return ""

    text = re.sub("[^a-zA-Z]", ' ', text)
    text = re.sub(r'http\S+', '', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = word_tokenize(text)
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    cleaned_text = ' '.join(cleaned_tokens)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text

cleaned_comments = [clean_text(comment) for comment in comments]

print(cleaned_comments[0:5])

['matter anyone say neither hurrem anyone family brought death prince mustafa janissary brought mustafa death say turk government accept shadow suleyman afraid blow would come son blow father made father', 'href www youtube com watch v rj xgyuhbe amp iklim k tan n sultan', 'quot never dare quot used phrase many time br janissary honor liar coward', 'insan kafas de il de sanki r ko kesiyor herif kar ndaki adam profesyonel komutan adam n eli armut mu topluyor lan cellatl k kurumu niye var acaba', 'suleyman mannificient could leave capable heir killed capable one left great empire imbecile']


# We predict negative comments  according with ready trained model 

In [4]:
import joblib

rf = joblib.load("trained_sentiment.pkl")
vectorizer = joblib.load("sentiment_vectorizer.pkl")

transformed_comments = vectorizer.transform(cleaned_comments).toarray()

predictions = rf.predict(transformed_comments)

bad_comment_list = [cleaned_comments[i] for i in range(len(predictions)) if predictions[i] == 0]

print(f"Negative comments detected {len(bad_comment_list)} unit.")
print(bad_comment_list[:5]) 

Negative comments detected 16 unit.
['selim alone reason ottoman lost central europe way complicated first took year suleiman lost central europe give selim much credit fall empire guy ruled year way quot begining end quot empire actually quite stable thanks pasha rule even actually got minor teritory gain br multiple factor played role fall ottoman empire br th century spain begin got shitload gold america starting inflation ottoman empire nothing america br starting reign murad huge amount coruption started within ottoman empire play huge role fall empire br coruption started army within next generation gained extreme amount power influence even year suleiman died killed sultan osman br br shortly said murad spain infuence fall empire selim murad decent sultan could easily continue suleimans work acting selims rule minor set back mehmed decent sultan would still take year trying fix problem murad caused could even think expanding', 'sultan suleyman cut janissary agha head shehzade pr

# Define api key

In [5]:
apiKey = "" 

# Get suggestion in ai model

In [6]:
import numpy as np
import requests
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def generate_summary_with_gemini(comments, apiKey):
    prompt = f"""Summarize user comments below. What are the common complaints or issues? And can you draw a Roadmap for what to pay more attention to next time? Briefly and clearly

Comments:
{chr(10).join(f"- {comment}" for comment in comments)}
"""
    data = {
        "contents": [ { "parts": [ { "text": prompt } ] } ]
    }
    url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={apiKey}"
    response = requests.post(url, headers={'Content-Type': 'application/json'}, json=data)
    if response.status_code == 200:
        response_data = response.json()
        if 'candidates' in response_data:
            return response_data['candidates'][0].get('content', {}).get('parts', [{}])[0].get('text', None)
    else:
        print(f"API error: {response.status_code}")
    return None

def get_representative_comments(comments, model, top_n=3):
    embeddings = model.encode(comments)
    centroid = np.mean(embeddings, axis=0)
    distances = np.linalg.norm(embeddings - centroid, axis=1)
    sorted_indices = np.argsort(distances)
    return [comments[i] for i in sorted_indices[:top_n]]

def optimal_k_means(X, max_k=30):
    best_score = -1
    best_k = 2
    for k in range(2, min(max_k, len(X)) + 1):
        kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42).fit(X)
        score = silhouette_score(X, kmeans.labels_)
        if score > best_score:
            best_score = score
            best_k = k
    return best_k

def determine_max_k(n_comments):
    if n_comments < 10:
        return 1 
    elif n_comments < 20:
        return 2
    elif n_comments < 30:
        return 3
    elif n_comments < 50:
        return 4
    elif n_comments < 200:
        return 8
    elif n_comments < 500:
        return 12
    elif n_comments < 1000:
        return 20
    return 30

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
comment_embeddings = model.encode(bad_comment_list)

max_clusters = determine_max_k(len(bad_comment_list))
num_clusters = max(1, optimal_k_means(comment_embeddings, max_k=max_clusters))

kmeans = KMeans(n_clusters=num_clusters)
clusters = kmeans.fit_predict(comment_embeddings)

grouped_comments = {}
for idx, cluster_id in enumerate(clusters):
    grouped_comments.setdefault(cluster_id, []).append(bad_comment_list[idx])

summaries = {}
for cluster_id, comments in grouped_comments.items():
    reps = get_representative_comments(comments, model, top_n=3) 
    print(f"\nComments to send for the cluster {cluster_id}:")
    for r in reps:
        print(f"- {r}")
    summary = generate_summary_with_gemini(reps, apiKey)
    summaries[cluster_id] = summary

for cluster_id, summary in summaries.items():
    print(f"\n📌 Cluster {cluster_id} Final Summary:\n{summary}\n")

  from .autonotebook import tqdm as notebook_tqdm




Comments to send for the cluster 0:
- selim alone reason ottoman lost central europe way complicated first took year suleiman lost central europe give selim much credit fall empire guy ruled year way quot begining end quot empire actually quite stable thanks pasha rule even actually got minor teritory gain br multiple factor played role fall ottoman empire br th century spain begin got shitload gold america starting inflation ottoman empire nothing america br starting reign murad huge amount coruption started within ottoman empire play huge role fall empire br coruption started army within next generation gained extreme amount power influence even year suleiman died killed sultan osman br br shortly said murad spain infuence fall empire selim murad decent sultan could easily continue suleimans work acting selims rule minor set back mehmed decent sultan would still take year trying fix problem murad caused could even think expanding
- sultan osman han like none happened would happened