In [None]:
import streamlit as st
from getComments import get_comments
from extract_data import extract_comments
from mine import (
    preprocess_text,
    analyze_sentiment,
    generate_wordcloud,
    perform_clustering,
    visualize_clusters,
    visualize_term_frequencies
)

# Streamlit front end: fetch the comments for one video, run the text-mining
# helpers on them, and render each result.

N_CLUSTERS = 3     # number of clusters requested from perform_clustering
TOP_N_TERMS = 20   # cap on the number of bars in the term-frequency chart


def _render_analysis(df):
    """Run every analysis stage on the fetched comments and render its output.

    Adds the ``cleaned_comment`` and ``sentiment`` columns to ``df`` in place.
    Each visualisation stage has its own error handling so that one failing
    stage is reported under its own name and does not hide the later stages
    or the download button. Errors raised while preprocessing or scoring
    sentiment propagate to the caller, because every later stage depends on
    those columns.
    """
    # Preprocess comments for text mining
    df["cleaned_comment"] = df["comment"].apply(preprocess_text)
    df["sentiment"] = df["cleaned_comment"].apply(analyze_sentiment)

    # Display sentiment analysis
    st.subheader("Sentiment Distribution")
    st.bar_chart(df["sentiment"].value_counts())

    # Generate and display word cloud
    st.subheader("Word Cloud")
    try:
        img = generate_wordcloud(" ".join(df["cleaned_comment"]))
        st.image(img, caption='Word Cloud', use_column_width=True)
    except Exception as e:
        st.error(f"An error occurred while generating the word cloud: {e}")

    # Perform clustering and visualize the clusters
    try:
        labels, _feature_names, tfidf_matrix = perform_clustering(df["cleaned_comment"], N_CLUSTERS)
        st.subheader("Cluster Visualization")
        visualize_clusters(labels, tfidf_matrix, N_CLUSTERS)
    except Exception as e:
        st.error(f"An error occurred during clustering: {e}")

    # Calculate and visualize term frequencies
    try:
        # value_counts() sorts by descending count, so head() keeps the most
        # frequent terms and the chart stays readable.
        term_frequencies = (
            df["cleaned_comment"]
            .str.split(expand=True)
            .stack()
            .value_counts()
            .head(TOP_N_TERMS)
        )
        st.subheader("Term Frequencies Visualization")
        visualize_term_frequencies(term_frequencies)
    except Exception as e:
        st.error(f"An error occurred during term frequency analysis: {e}")

    # Provide download button for processed data
    st.download_button(
        label="Download Processed Data",
        data=df.to_csv(index=False),
        file_name="processed_comments.csv",
        mime="text/csv"
    )


st.title("YouTube Comment Analyzer")
st.markdown("Enter a YouTube **Video ID** to fetch and analyze its comments.")

# Input field for YouTube video ID; surrounding whitespace from copy/paste is
# dropped so it is not sent as part of the ID.
video_id = st.text_input("Enter Video ID", "").strip()

if st.button("Fetch Comments"):
    if not video_id:
        st.error("Please enter a valid YouTube Video ID.")
    else:
        try:
            # Fetch comments using the YouTube API
            df = get_comments(video_id)
        except Exception as e:
            st.error(f"An error occurred while fetching comments: {e}")
        else:
            if df is None or df.empty:  # Check for None and empty DataFrame
                st.warning("No comments found for this video.")
            else:
                st.success(f"Fetched {len(df)} comments successfully!")
                st.dataframe(df)  # Display comments in a table
                try:
                    _render_analysis(df)
                except Exception as e:
                    # Analysis failures are reported separately from fetch
                    # failures so the message names the stage that broke.
                    st.error(f"An error occurred while analyzing comments: {e}")


In [None]:
import json
import pandas as pd

def extract_comments(path="data.json"):
    """Extract top-level comments from a saved JSON response into a DataFrame.

    Args:
        path: Location of the JSON file to read. Defaults to ``"data.json"``
            in the current working directory.

    Returns:
        pandas.DataFrame with one row per entry under the top-level
        ``"items"`` key and the columns ``author``, ``author_channel_url``,
        ``comment``, ``like_count``, ``published_at`` and ``updated_at``.
        When there are no items the frame is empty but still has these
        columns.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an item lacks one of the expected fields.
    """
    columns = [
        "author",
        "author_channel_url",
        "comment",
        "like_count",
        "published_at",
        "updated_at",
    ]

    # Decode explicitly as UTF-8 so comments containing emoji or other
    # non-ASCII text do not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    comments = []
    for item in data.get("items", []):
        comment_snippet = item["snippet"]["topLevelComment"]["snippet"]
        comments.append({
            "author": comment_snippet["authorDisplayName"],
            "author_channel_url": comment_snippet["authorChannelUrl"],
            "comment": comment_snippet["textOriginal"],
            "like_count": comment_snippet["likeCount"],
            "published_at": comment_snippet["publishedAt"],
            "updated_at": comment_snippet["updatedAt"],
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(comments, columns=columns)


In [None]:
import os
import json
import googleapiclient.discovery
import pandas as pd

def get_comments(video_id):
    """Fetch one page of top-level comments for a YouTube video.

    The API key is read from the ``YOUTUBE_API_KEY`` environment variable so
    that no credential is stored in the source.

    Args:
        video_id: ID of the video whose comment threads are requested.

    Returns:
        pandas.DataFrame with one row per returned comment thread and the
        columns ``author``, ``author_channel_url``, ``comment``,
        ``like_count``, ``published_at`` and ``updated_at``. Only a single
        request with ``maxResults=100`` is issued. An empty DataFrame is
        returned when the request itself fails (the error is printed).

    Raises:
        RuntimeError: If ``YOUTUBE_API_KEY`` is not set or is empty.
    """
    developer_key = os.environ.get("YOUTUBE_API_KEY")
    if not developer_key:
        raise RuntimeError(
            "YOUTUBE_API_KEY environment variable is not set; "
            "export your YouTube Data API key before fetching comments."
        )

    # NOTE(review): this flag looks relevant only to OAuth flows; the API-key
    # authentication used below presumably does not need it — confirm and remove.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
    api_service_name = "youtube"
    api_version = "v3"

    youtube = googleapiclient.discovery.build(api_service_name, api_version, developerKey=developer_key)

    request = youtube.commentThreads().list(
        part="snippet,replies",
        videoId=video_id,
        maxResults=100  # Adjust as necessary
    )
    try:
        response = request.execute()
    except Exception as e:
        print(f"Error fetching comments: {e}")
        return pd.DataFrame()  # Return an empty DataFrame on error

    # Extract comments and return as DataFrame
    comments = []
    for item in response.get("items", []):
        comment_snippet = item["snippet"]["topLevelComment"]["snippet"]
        comments.append({
            "author": comment_snippet["authorDisplayName"],
            "author_channel_url": comment_snippet["authorChannelUrl"],
            "comment": comment_snippet["textOriginal"],
            "like_count": comment_snippet["likeCount"],
            "published_at": comment_snippet["publishedAt"],
            "updated_at": comment_snippet["updatedAt"]
        })

    return pd.DataFrame(comments)


In [None]:
import re
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import io
import streamlit as st

def preprocess_text(text):
    """Normalise a raw comment string before analysis.

    Steps, in order: drop URLs, drop @mentions and '#' characters,
    lowercase, drop digit runs, then trim surrounding whitespace.
    """
    without_urls = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    without_tags = re.sub(r'\@\w+|\#', '', without_urls)  # '#' is removed, the tag word stays
    lowered = without_tags.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    return without_digits.strip()

def analyze_sentiment(text):
    """Label text as 'Positive', 'Negative' or 'Neutral' from its TextBlob polarity.

    A polarity above zero is positive, below zero is negative, and anything
    else is neutral.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'

def generate_wordcloud(text):
    """Render a word cloud for ``text`` and return it as an in-memory PNG.

    Returns:
        io.BytesIO positioned at the start of the PNG data.
    """
    cloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    # Draw on an explicit figure/axes pair and save that figure.
    fig = plt.figure(figsize=(10, 5))
    axes = fig.gca()
    axes.imshow(cloud, interpolation='bilinear')
    axes.axis('off')

    buffer = io.BytesIO()
    fig.savefig(buffer, format='png')
    plt.close(fig)  # Release the figure so it is not displayed or retained
    buffer.seek(0)  # Rewind so callers read the image from the beginning

    return buffer

def visualize_term_frequencies(term_frequencies):
    """Render a horizontal bar chart of term frequencies in the Streamlit app.

    Args:
        term_frequencies: Series-like object whose ``index`` holds the terms
            and whose ``values`` hold their counts.

    The chart is drawn on its own Figure, which is handed to ``st.pyplot``
    explicitly rather than through the global pyplot state, and the figure
    is always closed afterwards, even if plotting or rendering fails.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.barplot(
            x=term_frequencies.values,
            y=term_frequencies.index,
            palette='viridis',
            hue=term_frequencies.index,
            ax=ax,
        )
        ax.set_title("Term Frequencies")
        ax.set_xlabel("Frequency")
        ax.set_ylabel("Terms")
        ax.legend(title='Terms', loc='upper right', bbox_to_anchor=(1.2, 1))
        st.pyplot(fig)  # Pass the Figure itself, not the pyplot module
    finally:
        plt.close(fig)  # Avoid accumulating open figures across reruns

def perform_clustering(text_data, n_clusters):
    """Vectorise the texts with TF-IDF and group them with K-Means.

    Returns:
        A tuple ``(labels, feature_names, tfidf_matrix)``: the fitted
        model's ``labels_``, the vectorizer's output feature names, and the
        TF-IDF matrix the model was fitted on.
    """
    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Build the TF-IDF representation of the input texts
    tfidf = TfidfVectorizer()
    doc_term_matrix = tfidf.fit_transform(text_data)

    # Fixed random_state keeps the cluster assignment repeatable between runs
    model = KMeans(n_clusters=n_clusters, random_state=42).fit(doc_term_matrix)

    return model.labels_, tfidf.get_feature_names_out(), doc_term_matrix

def visualize_clusters(labels, tfidf_matrix, n_clusters):
    """Render a 2-D scatter plot of the clustered documents in the Streamlit app.

    Args:
        labels: Cluster label per document, used to colour the points.
        tfidf_matrix: Matrix exposing ``toarray()``; it is converted to a
            dense array before being projected onto two PCA components.
        n_clusters: Number of clusters, shown in the plot title only.

    The plot is drawn on its own Figure, which is handed to ``st.pyplot``
    explicitly rather than through the global pyplot state, and the figure
    is always closed afterwards, even if plotting or rendering fails.
    """
    from sklearn.decomposition import PCA

    # Reduce dimensions using PCA
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(tfidf_matrix.toarray())

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        scatter = ax.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels, cmap='viridis', alpha=0.6)
        ax.set_title(f"K-Means Clustering (n_clusters={n_clusters})")
        ax.set_xlabel("PCA Component 1")
        ax.set_ylabel("PCA Component 2")
        fig.colorbar(scatter, ax=ax, label='Cluster Label')
        st.pyplot(fig)  # Pass the Figure itself, not the pyplot module
    finally:
        plt.close(fig)  # Avoid accumulating open figures across reruns
