# YouTube API Comment Scraping

In [None]:
import googleapiclient.discovery
import googleapiclient.errors
import isodate
import pandas as pd
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from deep_translator import GoogleTranslator

# Display settings: left-align column headers and never truncate cell contents
# when a DataFrame is shown.
pd.set_option('display.colheader_justify', 'left')
pd.set_option('display.max_colwidth', None)

# NOTE: keep the real key out of version control.
API_KEY = '' #<-- INSERT API KEY HERE
CHANNEL_ID = 'UCNhxq7He5p-_FdBh0OaxcQg' #<-- CHANNEL ID OF NIKE https://www.youtube.com/@nike
# Threshold for the comment filter: a video is kept only when the number of
# fetched comments is strictly greater than this value.
MIN_COMMENT_COUNT = 1

In [40]:
def youtube_client():
    """Build a YouTube Data API v3 client authenticated with ``API_KEY``.

    Returns:
        The service object created by ``googleapiclient.discovery.build``.
    """
    client = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)
    return client


# Module-level client instance.
youtube = youtube_client()


## Get Video IDs

In [None]:
def get_video_ids_from_channel(channel_id):
    """Return the IDs of all videos in a channel's "uploads" playlist.

    Args:
        channel_id: The YouTube channel ID to look up.

    Returns:
        A list of video ID strings, in the order the API returns them.

    Raises:
        ValueError: If the API returns no channel for ``channel_id``.
        googleapiclient.errors.HttpError: If an API request fails.
    """
    youtube = youtube_client()

    response = youtube.channels().list(part="contentDetails", id=channel_id).execute()
    channels = response.get('items')
    if not channels:
        # Fail with a clear message instead of an opaque KeyError/IndexError.
        raise ValueError(f"No channel found for ID {channel_id!r}")
    uploads_playlist_id = channels[0]['contentDetails']['relatedPlaylists']['uploads']

    video_ids = []
    next_page_token = None
    while True:
        response = youtube.playlistItems().list(
            part="snippet", playlistId=uploads_playlist_id, maxResults=50, pageToken=next_page_token
        ).execute()

        video_ids.extend(
            item['snippet']['resourceId']['videoId'] for item in response['items']
        )

        # The API omits nextPageToken on the last page.
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    return video_ids

In [None]:
# Collect the IDs of all uploads of the configured channel.
video_ids = get_video_ids_from_channel(CHANNEL_ID)


## Get Video Details

In [43]:
def get_video_details(video_id):
    """Fetch title, duration and upload date for a single video.

    Args:
        video_id: The YouTube video ID to look up.

    Returns:
        A dict with the keys ``'Video ID'``, ``'Title'``, ``'Duration'``
        (length in seconds) and ``'Upload Date'`` (formatted ``dd.mm.YYYY``).
        If the API returns no item for the ID, the last three values are
        ``None`` so the caller still receives a dict of the usual shape.

    Raises:
        googleapiclient.errors.HttpError: If the API request fails.
    """
    youtube = youtube_client()
    response = youtube.videos().list(part="snippet,contentDetails", id=video_id).execute()

    items = response.get('items')
    if not items:
        # Without this guard the detail variables would never be bound and the
        # function would die with an UnboundLocalError.
        print(f"Keine Details für Video {video_id} gefunden.")
        return {
            'Video ID': video_id,
            'Title': None,
            'Duration': None,
            'Upload Date': None,
        }

    # The request asks for exactly one ID, so the first item is the video.
    item = items[0]
    snippet = item['snippet']
    # contentDetails.duration is an ISO 8601 duration string; convert to seconds.
    duration = isodate.parse_duration(item['contentDetails']['duration']).total_seconds()
    upload_date = datetime.strptime(
        snippet['publishedAt'], '%Y-%m-%dT%H:%M:%SZ'
    ).strftime('%d.%m.%Y')

    return {
        'Video ID': item['id'],
        'Title': snippet['title'],
        'Duration': duration,
        'Upload Date': upload_date,
    }

In [None]:
# Fetch the metadata of every collected video; each ID is printed before its
# request so progress is visible while the loop runs.
videos_details = []
for video_id in video_ids:
    print(video_id)
    videos_details.append(get_video_details(video_id))

## Get Comments per Video ID

In [45]:
def get_comments(video_id):
    """Fetch all top-level comments of a video.

    API errors are reported via ``print`` instead of being raised; comments
    collected before the error are still returned.

    Args:
        video_id: The YouTube video ID whose comment threads are requested.

    Returns:
        A list of dicts with the keys ``'Video ID'``, ``'authorDisplayName'``,
        ``'date'`` (formatted ``dd.mm.YYYY``), ``'likeCount'`` and
        ``'comment'``. The list is empty when no comments could be fetched.
    """
    youtube = youtube_client()
    comments = []
    try:
        # commentThreads.list documents maxResults as 1..100, so request the
        # largest valid page size and rely on pagination for the rest.
        request = youtube.commentThreads().list(part="snippet", videoId=video_id, maxResults=100)
        while request:
            response = request.execute()

            for item in response['items']:
                comment = item['snippet']['topLevelComment']['snippet']
                raw_date = comment['publishedAt']
                date = datetime.strptime(raw_date, '%Y-%m-%dT%H:%M:%SZ').strftime('%d.%m.%Y')
                comments.append({
                    'Video ID': video_id,
                    'authorDisplayName': comment['authorDisplayName'],
                    'date': date,
                    'likeCount': comment['likeCount'],
                    'comment': comment['textDisplay']
                })

            # list_next returns None once there are no further pages.
            request = youtube.commentThreads().list_next(request, response)

    except googleapiclient.errors.HttpError as e:
        # NOTE(review): every 403 is reported as "comments disabled"; other
        # 403 causes (e.g. exhausted quota) would get the same message.
        if e.resp.status == 403:
            print(f"Kommentare für Video {video_id} sind deaktiviert.")
        else:
            print(f"Fehler beim Abrufen der Kommentare für Video {video_id}: {e}")

    # Always return a list: the former ``None, 0`` tuple had length 2, so
    # callers using len()/extend() treated it as two comments.
    return comments
    

In [46]:
# Gather the comments of every video and keep only the videos that have more
# than MIN_COMMENT_COUNT comments.
comments = []
kept_videos = []
for video in videos_details:
    video_comments = get_comments(video['Video ID'])
    # Treat any non-list result as "no comments" so a sentinel return value
    # is never mistaken for real comment rows.
    if not isinstance(video_comments, list):
        video_comments = []
    if len(video_comments) > MIN_COMMENT_COUNT:
        comments.extend(video_comments)
        kept_videos.append(video)

# Replace the contents only after the loop: removing entries from
# videos_details while iterating over it skipped the element that followed
# each removed video.
videos_details[:] = kept_videos

In [47]:
def createDataFrame(details, comments) -> pd.DataFrame:
    """Combine video details and comments into one table.

    Args:
        details: Records describing the videos; each needs a ``'Video ID'``.
        comments: Records describing the comments; each needs a ``'Video ID'``.

    Returns:
        A DataFrame with one row per (video, comment) pair. Because the join
        is a left join on ``'Video ID'``, videos without a matching comment
        appear once with missing values in the comment columns.
    """
    video_frame = pd.DataFrame(details)
    comment_frame = pd.DataFrame(comments)
    return video_frame.merge(comment_frame, how='left', on='Video ID')

In [48]:
# Join the remaining videos with their comments into a single table.
df = createDataFrame(videos_details, comments)

## Preprocess Text

In [None]:
def _replace_markup(word: str) -> str:
    """Map a single word to its cleaned form (links -> 'http', '<a' -> '')."""
    if word.startswith(('http', 'href')):
        return 'http'
    if word.startswith('<a'):
        return ''
    return word


def preprocess_text(text: str) -> str:
    """Normalise links and strip simple HTML markup from a comment.

    The text is split on single spaces and each token is cleaned:

    * tokens starting with ``http`` or ``href`` become the placeholder ``http``
    * tokens starting with ``<a`` are emptied
    * ``<br>`` tags are treated as word boundaries and removed

    Args:
        text: The raw comment text.

    Returns:
        The cleaned text, with tokens re-joined by single spaces.
    """
    cleaned = []
    for token in text.split(" "):
        # A <br> may be glued to neighbouring words ("video<br>Love"). Split
        # on it so those words are kept and cleaned individually, rather than
        # dropping the whole token or leaving the tag in place.
        words = [_replace_markup(word) for word in token.split('<br>') if word]
        cleaned.append(" ".join(words))
    return " ".join(cleaned)

In [50]:
# Clean every comment. Values are passed through str() first, so a missing
# comment (NaN) is processed as the literal text 'nan'.
df['comment'] = df['comment'].apply(lambda x: preprocess_text(str(x)))

## Translate Text

In [51]:
def translate_text(text):
    """Translate a comment to English, auto-detecting the source language.

    Args:
        text: The text to translate.

    Returns:
        The English translation as a string. ``text`` is returned unchanged
        when it is not a string, is empty or whitespace-only, or when the
        translator yields no result.
    """
    # Nothing to translate: skip the round trip to the translation service.
    if not isinstance(text, str) or not text.strip():
        return text

    translated_text = GoogleTranslator(source='auto', target='en').translate(text)
    # Check the raw result before converting it: str(None) is the non-empty
    # string 'None', which would otherwise defeat the fallback below.
    if translated_text:
        return str(translated_text)
    return text

In [52]:
# Add an English version of every cleaned comment.
df["comment_en"] = df["comment"].apply(translate_text)

In [53]:
# Persist the result as UTF-8 encoded CSV in the working directory. The
# DataFrame index is written as the first column because it is not disabled.
df.to_csv('output.csv',encoding='utf-8')