# Import Necessary Packages and Define Helper Functions
- import the necessary packages, most notably HuggingFace, Torch, and LangChain
- we will also install and import Google's API client in order to fetch data on YouTube comments

In [None]:
! pip install google-api-python-client
! pip install langchain --user
! pip install openai --user

In [12]:
import os, numpy as np, pandas as pd

from googleapiclient.discovery import build

from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.llms import OpenAI
import torch

from matplotlib import pyplot as plt
import seaborn as sns

In [13]:
# Device index for model inference: 0 when CUDA is available, otherwise -1
# (presumably interpreted as "CPU" by the transformers pipeline API).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Create a Dataset
- import comments from YouTube videos
- look at some examples

In [126]:
def _comment_record(comment_id, snippet, conversation_id):
    """Flatten one YouTube comment ``snippet`` dict into this notebook's row layout."""
    return {
        'id': comment_id,
        'comment': snippet['textOriginal'],
        'author': snippet['authorDisplayName'],
        # Guard against a missing authorChannelId so a single odd comment
        # yields None here instead of a KeyError that aborts the whole scrape.
        'author_channel': snippet.get('authorChannelId', {}).get('value'),
        'like_count': snippet['likeCount'],
        'published_at': snippet['publishedAt'],
        # Key spelling is kept as-is so previously saved CSVs still line up.
        'coversation_id': conversation_id,
    }


def _fetch_thread_replies(youtube, thread_id):
    """Return one row per reply under ``thread_id``, following every result page."""
    rows = []
    page_token = None
    while True:
        params = {'part': 'snippet', 'maxResults': 100, 'parentId': thread_id}
        if page_token:
            params['pageToken'] = page_token
        response = youtube.comments().list(**params).execute()
        for item in response.get('items', []):
            rows.append(_comment_record(item['id'], item['snippet'], thread_id))
        page_token = response.get('nextPageToken')
        if not page_token:
            return rows


def get_video_comments(video_id, api_key):
    """Download every top-level comment and reply for a YouTube video.

    Args:
        video_id: ID of the video whose comment threads are listed.
        api_key: developer key passed to ``build('youtube', 'v3', ...)``.

    Returns:
        A list of dicts, one per comment, with the keys ``id``, ``comment``,
        ``author``, ``author_channel``, ``like_count``, ``published_at`` and
        ``coversation_id``. A top-level comment is immediately followed by its
        replies; replies carry the thread ID in ``coversation_id``. The list is
        empty when the API returns no items.
    """
    # The client is created here and handed to the reply helper explicitly, so
    # the function no longer relies on a global ``youtube`` existing.
    youtube = build('youtube', 'v3', developerKey=api_key)

    results = []
    page_token = None
    while True:
        # Ask for 100 threads on every page; previously only the first request
        # set maxResults, so later pages fell back to the API's default size.
        params = {'part': 'snippet', 'maxResults': 100, 'videoId': video_id}
        if page_token:
            params['pageToken'] = page_token
        video_response = youtube.commentThreads().list(**params).execute()

        for item in video_response.get('items', []):
            thread_id = item['id']
            top_snippet = item['snippet']['topLevelComment']['snippet']
            results.append(_comment_record(thread_id, top_snippet, thread_id))
            if item['snippet']['totalReplyCount'] > 0:
                results.extend(_fetch_thread_replies(youtube, thread_id))

        page_token = video_response.get('nextPageToken')
        if not page_token:
            return results

In [127]:
def get_all_top_level_comment_replies(top_comment_id, replies, token, youtube=None):
    """Collect every reply to a top-level comment, following pagination.

    Args:
        top_comment_id: ID of the parent (top-level) comment.
        replies: list that reply rows are appended to; it is mutated in place
            and also returned.
        token: page token to start from, or ``None`` for the first page.
        youtube: API client exposing ``comments().list(...)``. When omitted, a
            module-level ``youtube`` object is used if one exists, which keeps
            existing three-argument calls working.

    Returns:
        The same ``replies`` list, extended with one dict per reply. Each dict
        has the keys ``id``, ``comment``, ``author``, ``author_channel``,
        ``like_count``, ``published_at`` and ``coversation_id``.

    Raises:
        NameError: if no client is passed and no global ``youtube`` is defined.
    """
    if youtube is None:
        # Earlier versions silently depended on a global client; look it up
        # explicitly so the failure mode is a clear message.
        youtube = globals().get('youtube')
        if youtube is None:
            raise NameError(
                "name 'youtube' is not defined: pass the API client via the "
                "'youtube' argument"
            )

    # Iterate over pages instead of recursing once per page.
    while True:
        replies_response = youtube.comments().list(part='snippet',
                                                   maxResults=100,
                                                   parentId=top_comment_id,
                                                   pageToken=token).execute()

        for item in replies_response.get('items', []):
            snippet = item['snippet']
            replies.append({
                'id': item['id'],
                'comment': snippet['textOriginal'],
                'author': snippet['authorDisplayName'],
                # Guard against a missing authorChannelId rather than raising
                # KeyError part-way through a long scrape.
                'author_channel': snippet.get('authorChannelId', {}).get('value'),
                'like_count': snippet['likeCount'],
                'published_at': snippet['publishedAt'],
                # Replies are grouped under their parent thread's ID.
                'coversation_id': top_comment_id,
            })

        token = replies_response.get('nextPageToken')
        if not token:
            return replies

In [128]:
# Read the YouTube Data API key from a local file so it is never hard-coded
# in the notebook. Strip surrounding whitespace: a trailing newline left by a
# text editor would otherwise become part of the key.
with open("youtube_credentials.txt") as f:
    api_key = f.read().strip()

In [135]:
# Video whose comments are scraped below. The commented-out lines are
# alternative video IDs; the trailing comment on each line is the video title.
#video_id = "Lwx-2R9swDg" # Be All You Can Be - U.S. Army's new brand trailer - U.S. Army
#video_id = "-ZMzfihqOkQ" # What Army Snipers Go Through At Sniper School - insider business
video_id = "nV8UZJNBY6Y" # Adele - The Final Carpool Karaoke

In [136]:
# Fetch the selected video's comments (top-level comments plus their replies)
# as a list of dicts.
comments = get_video_comments(video_id, api_key)

In [137]:
# Turn the list of comment dicts into a DataFrame.
comments_df = pd.DataFrame(comments)

In [138]:
# save data
comments_df.to_csv("video_comments_"+video_id+".csv")

# load data
#comments_df = pd.read_csv("video_comments_"+video_id+".csv", index_col=0)

In [139]:
# (rows, columns) of the comments table.
comments_df.shape

(12342, 7)

In [140]:
# Preview the first few rows of the comments table.
comments_df.head()

Unnamed: 0,id,comment,author,author_channel,like_count,published_at,coversation_id
0,UgzWUD0w2WFLoC57dlh4AaABAg,"Adele is a national treasure, even though she ...",Frances Clare,UCGh4RfUqDY91cOXvaIoKBrg,0,2023-04-25T19:59:13Z,UgzWUD0w2WFLoC57dlh4AaABAg
1,UgwtMlFlwMW8e6IxBOF4AaABAg,James looks like he was gonna cry the whole ti...,Sarah Dunn,UCsyHMzUXrpqsZta-Px0xiVA,0,2023-04-25T19:59:12Z,UgwtMlFlwMW8e6IxBOF4AaABAg
2,UgxQYaTdTzlPSWZSGDh4AaABAg,Te amo,rocio torres,UCGlUXRVkQ-tXTteo5fPs-8A,0,2023-04-25T19:59:12Z,UgxQYaTdTzlPSWZSGDh4AaABAg
3,Ugy6AlRHwM0-ArfJGXl4AaABAg,Adele.!!👑😘,cristal G,UCj2ncc-6zaKI5EcdcMFtD_g,1,2023-04-25T19:59:10Z,Ugy6AlRHwM0-ArfJGXl4AaABAg
4,UgxI06aXMKee6U4aDmV4AaABAg,Makes you want to throw up 😅,Jesse Stone,UC968J5YEA6tz5EFda7pw6DQ,0,2023-04-25T19:59:03Z,UgxI06aXMKee6U4aDmV4AaABAg


# Zero-shot Text Labeling and Summarization
- produce some candidate labels for the different comments
- summarize the comments to get at some general themes

In [144]:
# Zero-shot classifier backed by facebook/bart-large-mnli. Pass the `device`
# value computed earlier in the notebook instead of hard-coding GPU 0, so the
# cell also runs on machines without CUDA.
classifier = pipeline(model="facebook/bart-large-mnli", task="zero-shot-classification", device=device)

In [145]:
# Candidate labels passed to the zero-shot classifier.
labels = ["offensive_content", "supportive_content"]

In [150]:
# Classify every comment against the candidate labels. The predictions are
# kept in a variable so they can be reused, and the comments are fed to the
# pipeline in batches. Only the first few results are displayed; echoing the
# whole result list would flood the cell output.
comment_labels = classifier(
    comments_df['comment'].tolist(),
    candidate_labels=labels,
    batch_size=16,
)
comment_labels[:5]


KeyboardInterrupt



# Prompt Engineering for Various Tasks
- comment our code
- prompt engineer for specific tasks
    - determining whether a comment is against or 'trolling' a video
    - produce our own 'trolling' comments to another video
    - save out these prompts for future use
    - extract a link analysis diagram from text
- do these prompt engineered tasks both locally and MaaS/SaaS

# Fine Tune an LLM