## Imports

In [None]:
import os
import googleapiclient.discovery
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import torch
from textblob import TextBlob

## Comments

In [17]:
#-----------------------------------------------------------------------

# Return a df which includes the filtered comment and reply data from 
# a selected yt video via yt api

def get_yt_data(vid):
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    # Setup
    api_service_name = "youtube"
    api_version = "v3"
    DEVELOPER_KEY = "AIzaSyDtuq5QvhIxQFpZTSqYIQ7Hs2bSWhNyW0Y"

    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, developerKey = DEVELOPER_KEY)

    # Query to retrieve top level comments
    def make_comments_request(vid, pToken):
        request = youtube.commentThreads().list(
            part="snippet, replies",
            videoId=vid,
            maxResults=100, # max 100
            textFormat="plainText",
            order="relevance",
            pageToken=pToken
        )
        return request.execute()

    # Retrieve first page
    comments = []
    pageToken = None # At first API call, pageToken is None
    page = make_comments_request(vid, pageToken)
    comments.append(page)
    pageToken = page.get("nextPageToken")

    # Retrieve successive page(s) if new pageToken
    while pageToken is not None:
        page = make_comments_request(vid, pageToken)
        comments.append(page)
        pageToken = page.get("nextPageToken")
    
    return comments

comments = get_yt_data("SMyD-Ax2Gkg")
comments

import json
with open('data/comments.json', 'w') as f:
    json.dump(comments, f)

## Replies

In [16]:
def get_yt_replies(comments):
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    # Setup
    api_service_name = "youtube"
    api_version = "v3"
    DEVELOPER_KEY = "AIzaSyDtuq5QvhIxQFpZTSqYIQ7Hs2bSWhNyW0Y"

    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, developerKey = DEVELOPER_KEY)
    
    # Get ids of top level comments which have replies
    ids = []
    for page in comments:
        for i in range(int(len(page["items"]))):
            if page["items"][i].get("replies") != None: 
                ids.append(page["items"][i]["id"])

    # Query to retrieve replies on top level comments
    def make_replies_request(id, pToken):
        request = youtube.comments().list(
            part="snippet",
            parentId=id,
            maxResults=20, # max 100
            pageToken=pToken,
            textFormat="plainText"
        )
        return request.execute()

    replies = []
    for i in range(len(ids)):
        # Retrieve first page
        pageToken = None # At first API call, pageToken is None
        page = make_replies_request(ids[i], pageToken)
        replies.append(page)
        pageToken = page.get("nextPageToken")

        # Retrieve successive page(s) if new pageToken
        while pageToken is not None:
            page = make_replies_request(ids[i], pageToken)
            replies.append(page)
            pageToken = page.get("nextPageToken")
    
    return replies   

replies = get_yt_replies(comments)
replies

import json
with open('data/replies.json', 'w') as f:
    json.dump(replies, f)

In [18]:
def get_yt_df(comments, replies):
    # Filter and stitch comments and replies

    data = []
    # Filter and stitch comments (append comment id and comment content)
    for page_c in comments:
        for i in range(int(len(page_c["items"]))):
            comment_id = page_c["items"][i]["id"]
            comment = page_c["items"][i]["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            data.append([comment_id, comment])

            # Fetch replies (append comment id and reply content)
            if page_c["items"][i].get("replies") != None: 
                for page_r in replies:
                    for j in range(int(len(page_r["items"]))):
                        reply_parent_id = page_r["items"][j]["snippet"]["parentId"]
                        reply = page_r["items"][j]["snippet"]["textDisplay"]
                        if reply_parent_id == comment_id:
                            data.append([comment_id, reply])

    # Filter and stitch a df with named cols
    df = pd.DataFrame(np.array(data), columns=["id", "comment_reply"])

    return df
    
df = get_yt_df(comments, replies)

In [19]:
df.head()

Unnamed: 0,id,comment_reply
0,UgxOkhl9kay5L5gsEIR4AaABAg,The spec is just on point! So much better than...
1,UgxOkhl9kay5L5gsEIR4AaABAg,@Surf alcatraz how dumb you compare a Bugatti ...
2,UgxOkhl9kay5L5gsEIR4AaABAg,0:29 look at the hood PANEL GAP difference. Le...
3,UgxOkhl9kay5L5gsEIR4AaABAg,"@Surf alcatraz i saw a video with about this, ..."
4,UgxOkhl9kay5L5gsEIR4AaABAg,@Michał Basiński you do understand that Mate R...


In [1]:
len(df)

NameError: name 'df' is not defined

In [33]:
import pandas as pd
import numpy as np

def get_comments_and_replies_for(vid, offline=False):
  # Get comments and replies data for video id (via API or offline)
  import json

  comments_path = "/Users/philippjohn/Developer/youtube-analytics/data/comments.json"
  replies_path = "/Users/philippjohn/Developer/youtube-analytics/data/replies.json"

  with open(comments_path) as fp:
      comments = json.load(fp)

  with open(replies_path) as fp:
      replies = json.load(fp)

  # Filter and stitch comments and replies
  data = []
  # Filter and stitch comments (append comment id and comment content)
  for page_c in comments:
      for i in range(int(len(page_c["items"]))):
          comment_id = page_c["items"][i]["id"]
          comment = page_c["items"][i]["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
          data.append([comment_id, comment])

          # Fetch replies (append comment id and reply content)
          if page_c["items"][i].get("replies") != None: 
              for page_r in replies:
                  for j in range(int(len(page_r["items"]))):
                      reply_parent_id = page_r["items"][j]["snippet"]["parentId"]
                      reply_id = page_r["items"][j]["id"]
                      reply = page_r["items"][j]["snippet"]["textDisplay"]
                      if reply_parent_id == comment_id:
                          data.append([reply_id, reply])

  # Filter and stitch a df with named cols
  df = pd.DataFrame(np.array(data), columns=["id", "content"])

  return df

bugatti_video_id = "SMyD-Ax2Gkg"
df = get_comments_and_replies_for(bugatti_video_id, offline=True)
df.head()

Unnamed: 0,id,content
0,UgxOkhl9kay5L5gsEIR4AaABAg,The spec is just on point! So much better than...
1,UgxOkhl9kay5L5gsEIR4AaABAg.9cFlIEOUUok9cWxWJhndQX,@Surf alcatraz how dumb you compare a Bugatti ...
2,UgxOkhl9kay5L5gsEIR4AaABAg.9cFlIEOUUok9cW8nfP9eZw,0:29 look at the hood PANEL GAP difference. Le...
3,UgxOkhl9kay5L5gsEIR4AaABAg.9cFlIEOUUok9cVhDW93Ycb,"@Surf alcatraz i saw a video with about this, ..."
4,UgxOkhl9kay5L5gsEIR4AaABAg.9cFlIEOUUok9cVYtys13G3,@Michał Basiński you do understand that Mate R...


In [34]:
from string import punctuation
import re

def clean_content(df):
    # Creating function to clean comments
    def clean(content):
        content = re.sub(r"@[A-Za-z0-9]+", "", content)
        content = re.sub(r"https?:\/\/\S+", "", content)
        comment = content.translate(str.maketrans("", "", punctuation))
        return content

    # Cleaning content
    df["content_clean"] = df["content"].apply(clean)

    return df

# Show first 5 cleaned entires
df_clean = clean_content(df)
df_clean.head()

Unnamed: 0,id,content,content_clean
0,UgxOkhl9kay5L5gsEIR4AaABAg,The spec is just on point! So much better than...,The spec is just on point! So much better than...
1,UgxOkhl9kay5L5gsEIR4AaABAg.9cFlIEOUUok9cWxWJhndQX,@Surf alcatraz how dumb you compare a Bugatti ...,alcatraz how dumb you compare a Bugatti with ...
2,UgxOkhl9kay5L5gsEIR4AaABAg.9cFlIEOUUok9cW8nfP9eZw,0:29 look at the hood PANEL GAP difference. Le...,0:29 look at the hood PANEL GAP difference. Le...
3,UgxOkhl9kay5L5gsEIR4AaABAg.9cFlIEOUUok9cVhDW93Ycb,"@Surf alcatraz i saw a video with about this, ...","alcatraz i saw a video with about this, and a..."
4,UgxOkhl9kay5L5gsEIR4AaABAg.9cFlIEOUUok9cVYtys13G3,@Michał Basiński you do understand that Mate R...,ł Basiński you do understand that Mate Rimac i...
