In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [24]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

In [25]:
import time
import re
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS
from textblob import Word
from slang_dict import slang_dict
import string
import emoji
import os

In [26]:
# Download necessary NLTK data
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hrish\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hrish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hrish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hrish\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
def scrape_youtube_comments(url):
    """Scrape comments from a YouTube video"""
    comments = []
    # chrome_options = Options()
    # chrome_options.add_argument("--headless")
    
    # with webdriver.Chrome(options=chrome_options) as driver:
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 20)
    driver.get(url)

    num_scrolls=5
    for _ in range(num_scrolls):
        wait.until(EC.visibility_of_element_located((By.TAG_NAME, "body"))).send_keys(Keys.END)
        time.sleep(4)

    comment_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#contents")))
    for element in comment_elements:
        comments.append(element.text)

    return comments

In [35]:
print(comments)

NameError: name 'comments' is not defined

In [28]:
scrape_youtube_comments('https://www.youtube.com/watch?v=V1hjSeSF4xg')

['Pinned by Tech With Tim\n@TechWithTim\n7 months ago\nAnother thing that gives you an unfair advantage is my software development course ;) https://techwithtim.net/dev\n17\nReply\n·\n5 replies\n@G_a_n_d_u\n7 months ago\n1. Automated trading bot\n2. Sentiment analysis \n3. Blockchain base voting system \n4. Online multi-player game\n5. Computer vision related application\nRead more\n469\nReply\n7 replies\n@mmaer\n7 months ago\nI developed a trading bot in 2017 and consistently include it in my resume. I agree with your point; it certainly catches the attention of hiring interviewers. They often ask, "How did it perform? Was it successful?" My response is always candid: the bot functioned as intended, but if it had been truly successful, I wouldn\'t be seeking employment right now. Interestingly, this experience has been a key factor in my being hired 100% of the time.\nRead more\n402\nReply\n9 replies\n@NoToBusinessCasual\n7 months ago\nThanks Tim. I just passed on your video to my dau

In [36]:
print(comments)

NameError: name 'comments' is not defined

In [29]:
def preprocess_text(text):
    
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)   # Remove URLs
    text = text.translate(str.maketrans('', '', string.punctuation))   # Remove punctuation
    text = re.sub(r'\d+', '', text) # Remove numbers
    
    text = emoji.demojize(text)  # Remove emojis
    text = re.sub(r':[a-z_]+:', '', text)

    tokens = word_tokenize(text)  # Tokenize
    
    stop_words = stopwords.words('english')  # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    tokens = [slang_dict.get(token, token) for token in tokens]  # Handle short forms
    
    # Spelling correction and lemmatization using TextBlob
    lemmatizer = WordNetLemmatizer()
    corrected_tokens = []
    for token in tokens:
        word = Word(token)
        corrected_word = word.correct()  # This will handle misspellings like 'gud', 'osm'
        lemmatized_word = lemmatizer.lemmatize(str(corrected_word))
        corrected_tokens.append(lemmatized_word)

    return ' '.join(corrected_tokens)

In [30]:
def analyze_sentiment(text):
    """Analyze sentiment of text using VADER"""
    # Initialize VADER
    sia = SentimentIntensityAnalyzer()
    preprocessed_text = preprocess_text(text)
    sentiment_scores = sia.polarity_scores(preprocessed_text)
    return sentiment_scores['compound']

In [31]:
def analyze_comments(url):
    """Scrape and analyze comments from a YouTube video"""
    # Scrape comments
    comments = scrape_youtube_comments(url)

    # Create DataFrame
    df = pd.DataFrame(comments, columns=['comment'])

    # Preprocess comments
    df['cleaned_comment'] = df['comment'].apply(preprocess_text)

    # Analyze sentiment
    df['sentiment'] = df['cleaned_comment'].apply(analyze_sentiment)

    # Categorize sentiment
    df['sentiment_category'] = pd.cut(df['sentiment'], 
                                      bins=[-1, -0.1, 0.1, 1], 
                                      labels=['Negative', 'Neutral', 'Positive'])

    return df

In [32]:
analyze_comments('https://www.youtube.com/watch?v=V1hjSeSF4xg')

Unnamed: 0,comment,cleaned_comment,sentiment,sentiment_category
0,Pinned by Tech With Tim\n@TechWithTim\n7 month...,pinned teach tim techwithtim month ago another...,0.9994,Positive
1,,,0.0,Neutral
2,,,0.0,Neutral
3,,,0.0,Neutral
4,,,0.0,Neutral
5,,,0.0,Neutral
6,,,0.0,Neutral
7,,,0.0,Neutral
8,,,0.0,Neutral
9,,,0.0,Neutral


In [33]:
df

NameError: name 'df' is not defined

In [18]:
def generate_visualizations(df):
    # Word cloud of most common words
    text = ' '.join(df['cleaned_comment'])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    # CleanCache(directory='static')
    plt.title('Most Common Words in Comments')
    plt.savefig('static/wordcloud.png')
    plt.close()

In [19]:
def get_summary(df):
    """Generate a summary of the sentiment analysis"""
    total_comments = len(df)
    sentiment_counts = df['sentiment_category'].value_counts()
    average_sentiment = df['sentiment'].mean()

    summary = {
        'total_comments': total_comments,
        'positive_comments': sentiment_counts.get('Positive', 0),
        'neutral_comments': sentiment_counts.get('Neutral', 0),
        'negative_comments': sentiment_counts.get('Negative', 0),
        'average_sentiment': average_sentiment
    }

    return summary

In [20]:
# The main function will be called from the Flask app
def main(url):
    df = analyze_comments(url)
    generate_visualizations(df)
    summary = get_summary(df)
    return summary

In [21]:
# class CleanCache:
# 	'''
# 	this class is responsible to clear any residual csv and image files
# 	present due to the past searches made.
# 	'''
# 	def __init__(self, directory=None):
# 		self.clean_path = directory
# 		# only proceed if directory is not empty
# 		if os.listdir(self.clean_path) != list():
# 			# iterate over the files and remove each file
# 			files = os.listdir(self.clean_path)
# 			for fileName in files:
# 				print(fileName)
# 				os.remove(os.path.join(self.clean_path, fileName))
# 		print("cleaned!")

In [22]:
# Test the functions
test_url = "https://www.youtube.com/watch?v=mWrg19Dc_uY" 
result = main(test_url)
print(result)

{'total_comments': 15, 'positive_comments': 1, 'neutral_comments': 14, 'negative_comments': 0, 'average_sentiment': 0.06644}
