# YouTube Comment Scraper

In [28]:
try:
    import selenium
except:
    %pip install selenium

import time    
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Apple Advertisements
# Your Verse: https://www.youtube.com/watch?v=Ep2_0WHogRQ
# Human Family: https://www.youtube.com/watch?v=fyzI2Lu9Qcw
# Official iPad Air Trailer: https://www.youtube.com/watch?v=XOPZBbbF9bA
# iPad Air Ad: https://www.youtube.com/watch?v=DZFIrMxaYqc

# Coca Cola Advertisements
# Open Like Never Before: https://www.youtube.com/watch?v=SukwNeHMMhQ
# The Wonder of Us: https://www.youtube.com/watch?v=-R-EEdvDrUU
# The Great Meal: https://www.youtube.com/watch?v=vUMQeNw2QDA
# Fifa Ad: https://www.youtube.com/watch?v=_ajONMjOC9s

# Nationwide Building Society Advertisements
# Hollie on Family: https://www.youtube.com/watch?v=I5jRp9bQL7w
# More than a Street: https://www.youtube.com/watch?v=ePjzA4QM-7s
# People TV advert: https://www.youtube.com/watch?v=s29wLutTGQg
# Somewhere to call your own with Emma & Emily: https://www.youtube.com/watch?v=_ZcSIsZiAQU

# Scrape one video page into `data`:
#   data[0] - the text of every element matching "#content"
#   data[1] - three placeholder zeros followed by the text of every "#toolbar" element
#             (the first three rows are dropped again in a later cell)
data = [[], [0, 0, 0]]
url = "https://www.youtube.com/watch?v=_ZcSIsZiAQU"
filename = "nationwide-conversation.csv"
with Chrome() as driver:
    wait = WebDriverWait(driver, 15)
    driver.get(url)

    # Send END to the page body six times with a 2 s pause after each press
    # (presumably to make the page load more comments - confirm against the live page).
    body_locator = (By.TAG_NAME, "body")
    for _ in range(6):
        page_body = wait.until(EC.visibility_of_element_located(body_locator))
        page_body.send_keys(Keys.END)
        time.sleep(2)

    content_elements = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#content"))
    )
    data[0].extend(element.text for element in content_elements)

    toolbar_elements = wait.until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#toolbar"))
    )
    for toolbar in toolbar_elements:
        # A toolbar whose whole text is "Reply" carries no like count: prefix a "0"
        # so that every entry starts with a number and can be incremented by 1 later on.
        zero_prefix = "0" if toolbar.text == "Reply" else ""
        data[1].append(zero_prefix + toolbar.text)

In [29]:
import pandas as pd

# Pair every scraped text with the like string at the same position.
# zip() stops at the shorter of the two lists, so surplus entries are ignored.
comment, likes = data[0], data[1]
data_tuples = [pair for pair in zip(comment, likes)]
df = pd.DataFrame(data=data_tuples, columns=["comment", "likes"])
df.head(n=10)

Unnamed: 0,comment,likes
0,TW\nSkip navigation\nSign in\nYeah. And just p...,0
1,,0
2,,0
3,I actually like this ad. It reminds me of simp...,4\nReply
4,"What a brilliant advert with Emma and Emily , ...",2\nReply
5,Brilliant. Just. Brilliant Emma and Emily are...,2\nReply
6,l really like this advert so much. real people...,0Reply
7,"I just want to know where the shop is?, anyone...",0Reply
8,"Nationwide produces the most annoying, irritat...",5\nReply
9,Dreadful advert.....,7\nReply


In [30]:
# Discard the first three scraped rows: they pair the placeholder zeros with page
# chrome / empty "#content" elements rather than with real comments.
# A positional slice is used instead of drop([0, 1, 2]) so that a page yielding
# fewer than three rows produces an empty frame instead of a KeyError.
df_d = df.iloc[3:]
# drop=True renumbers the rows from 0 without keeping the old index as a column.
df_dr = df_d.reset_index(drop=True)
df_dr.head()

Unnamed: 0,comment,likes
0,I actually like this ad. It reminds me of simp...,4\nReply
1,"What a brilliant advert with Emma and Emily , ...",2\nReply
2,Brilliant. Just. Brilliant Emma and Emily are...,2\nReply
3,l really like this advert so much. real people...,0Reply
4,"I just want to know where the shop is?, anyone...",0Reply


In [31]:
import re
import numpy as np

# Strip line breaks and the "Reply" button label from every like string,
# e.g. "4\nReply" -> "4" and "0Reply" -> "0".
# The pattern is an alternation, not the character class "[\nReply]", which would
# delete the individual characters R, e, p, l, y anywhere in the text.
reply_noise = re.compile(r"\n|Reply")

# Assign the whole column at once: element-wise `df_dr.likes[i] = x` is chained
# assignment, which warns and does not update df_dr under pandas Copy-on-Write.
df_dr["likes"] = df_dr["likes"].map(lambda like_text: reply_noise.sub("", like_text))

df_dr.head()

Unnamed: 0,comment,likes
0,I actually like this ad. It reminds me of simp...,4
1,"What a brilliant advert with Emma and Emily , ...",2
2,Brilliant. Just. Brilliant Emma and Emily are...,2
3,l really like this advert so much. real people...,0
4,"I just want to know where the shop is?, anyone...",0


In [32]:
# Drop rows that have nothing in the comment column (empty string or missing value;
# empty comments are probably emoji-only comments whose text was not captured).
# A boolean mask replaces `df_dr['comment'].replace(..., inplace=True)`, which is a
# chained in-place edit that does not update df_dr under pandas Copy-on-Write.
has_text = df_dr["comment"].notna() & (df_dr["comment"] != "")
df_dr = df_dr[has_text].reset_index(drop=True)  # renumber rows, discarding the old index

df_dr.head()

Unnamed: 0,comment,likes
0,I actually like this ad. It reminds me of simp...,4
1,"What a brilliant advert with Emma and Emily , ...",2
2,Brilliant. Just. Brilliant Emma and Emily are...,2
3,l really like this advert so much. real people...,0
4,"I just want to know where the shop is?, anyone...",0


In [33]:
# Save the cleaned comments to CSV. The row index is written as the first, unnamed
# column (index=True is the pandas default); the sentiment cell removes it after reading.
df_dr.to_csv(path_or_buf=filename, index=True)

# Sentiment Analysis

In [34]:
# make sure pytorch and transformers are installed; if not, install them
try:
    import torch
except:
    %pip install torch
    import torch

try: 
    from transformers import pipeline
except:
    %pip install transformers
    from transformers import pipeline
    
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [35]:
import pandas as pd

# Open the scraped comments. The file was saved with its row index, so the first
# (unnamed) column is read back as the index instead of as an "Unnamed: 0" column.
dataset = pd.read_csv(filename, index_col=0)

# This cell writes its result back to the file it reads. Without this guard,
# re-running the cell would re-classify the comments and add 1 to `likes` again.
if "label" not in dataset.columns:
    labels = []  # most likely sentiment of each comment
    scores = []  # confidence of that sentiment
    for comment in dataset["comment"]:  # iterate through each comment
        # Truncate to 512 tokens: longer comments exceed the RoBERTa input limit
        # and would otherwise make the pipeline raise.
        sent_analysis = sentiment_task(comment, truncation=True, max_length=512)
        labels.append(sent_analysis[0]['label'])
        scores.append(sent_analysis[0]['score'])

    # add new columns of label and score for the dataset
    dataset['label'] = labels
    dataset['score'] = scores

    dataset['likes'] += 1  # so that comments with no likes can count as 1 sentiment
    dataset.to_csv(filename)  # save Pandas Dataframe to CSV file

dataset.head()  # show the first few rows

Unnamed: 0,comment,likes,label,score
0,I actually like this ad. It reminds me of simp...,5,positive,0.81155
1,"What a brilliant advert with Emma and Emily , ...",3,positive,0.969869
2,Brilliant. Just. Brilliant Emma and Emily are...,3,positive,0.970229
3,l really like this advert so much. real people...,1,positive,0.985569
4,"I just want to know where the shop is?, anyone...",1,neutral,0.738279


In [36]:
# Like-weighted confidence total per sentiment label: every comment contributes
# score * likes to the total of its own label.
totals = {"positive": 0, "negative": 0, "neutral": 0}

for _, row in dataset.iterrows():
    sentiment = row['label']
    if sentiment in totals:  # rows carrying any other label are ignored
        totals[sentiment] += row['score'] * int(row['likes'])

positive = totals["positive"]
negative = totals["negative"]
neutral = totals["neutral"]
print(positive, negative, neutral)
print(dataset.shape)

10.863612055778503 46.48208266496658 0.7382786273956299
(11, 4)
