## Importing libraries

In [3]:
import json
import os
import re
import time

import numpy as np
import pandas as pd
import requests
from googleapiclient.discovery import build
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

## Calling YouTube API and inputting search term

In [34]:
# Call the YouTube API
# The key is read from the environment so that it is never stored in the notebook.
# Export YOUTUBE_API_KEY (a YouTube Data API v3 key) before starting the kernel.
api_key = os.environ.get('YOUTUBE_API_KEY')
if not api_key:
    raise RuntimeError("Set the YOUTUBE_API_KEY environment variable to your YouTube Data API v3 key.")

youtube_api = build('youtube', 'v3', developerKey=api_key)

# Free-text query sent to the search endpoint.
search_terms = 'huberman'

# Request up to 5 videos matching the query, ordered by view count; only the
# 'snippet' part of each hit is fetched.
results = youtube_api.search().list(q=search_terms, part='snippet', type='video',
                                    order='viewCount', maxResults=5).execute()

In [35]:
# Pull the fields of interest out of every search hit, one list per field.
search_hits = results['items']
video_ids = [hit['id']['videoId'] for hit in search_hits]
thumbnails = [hit['snippet']['thumbnails']['default']['url'] for hit in search_hits]
titles = [hit['snippet']['title'] for hit in search_hits]
descriptions = [hit['snippet']['description'] for hit in search_hits]
channel_titles = [hit['snippet']['channelTitle'] for hit in search_hits]
date = []  # declared but never filled in by this cell

# One row per video. The raw id is stored as a full watch URL in the 'url' column.
watch_urls = ['https://www.youtube.com/watch?v=' + str(vid) for vid in video_ids]
dataframe = pd.DataFrame({
    'thumbnail': thumbnails,
    'url': watch_urls,
    'title': titles,
    'description': descriptions,
    'channel_title': channel_titles,
})

## Selecting the first video ID from the search results

In [5]:
# Use the first search hit as the single video analysed in the cells below;
# the bare expression on the last line displays the chosen id.
video_id = video_ids[0]
video_id

'I7n8sPQShW8'

## Storing all the comments in a list


In [6]:
def _iter_pages(resource_factory, first_token=None, **params):
    """Yield every response page of a paginated YouTube ``list`` call.

    ``resource_factory`` is a zero-argument callable (e.g. ``youtube.comments``)
    returning an object with ``.list(**kwargs).execute()``. ``params`` are sent
    with every request; ``pageToken`` is added for every page but the first,
    unless ``first_token`` is given, in which case it is sent on the first
    request too.
    """
    page_token = first_token
    while True:
        request_args = dict(params)
        if page_token:
            request_args['pageToken'] = page_token
        page = resource_factory().list(**request_args).execute()
        yield page
        if 'nextPageToken' not in page:
            return
        page_token = page['nextPageToken']


def get_comments(youtube, video_id, token=''):
    """Collect the top-level comments and all replies of a video.

    Pages are walked with a loop rather than one recursive call per page, so the
    number of pages is not limited by Python's recursion depth.

    Args:
        youtube: API client exposing ``commentThreads()`` and ``comments()``.
        video_id: id of the video whose comments are fetched.
        token: page token to resume from. An empty, blank or ``None`` token
            starts from the first page and resets the module-level
            ``all_comments`` list; any other value appends to the existing list.

    Returns:
        The module-level ``all_comments`` list (also left in the global for
        callers that read it directly). Each entry is a comment text prefixed
        with a marker: ``"COMMENT WITH <n> replies: "`` for top-level comments,
        ``"  =>FIRST CALLBACK REPLY: "`` for replies on the first reply page and
        ``"    ==>WHILE GETTING REPLIES: "`` for replies on later pages.
    """
    global all_comments

    token = (token or '').strip()
    if not token:
        all_comments = []

    thread_pages = _iter_pages(
        youtube.commentThreads, token,
        part='snippet', maxResults=100, videoId=video_id, order='relevance',
    )
    for thread_page in thread_pages:
        for thread in thread_page['items']:
            reply_count = thread['snippet']['totalReplyCount']
            top_level_text = thread['snippet']['topLevelComment']['snippet']['textDisplay']
            all_comments.append("COMMENT WITH " + str(reply_count) + " replies: " + top_level_text)

            # Replies are fetched separately, and only when the thread reports any.
            if reply_count > 0:
                reply_pages = _iter_pages(
                    youtube.comments,
                    part='snippet', maxResults=100, parentId=thread['id'],
                )
                for page_number, reply_page in enumerate(reply_pages):
                    # The marker records whether the reply came from the first page or a later one.
                    if page_number == 0:
                        marker = "  =>FIRST CALLBACK REPLY: "
                    else:
                        marker = "    ==>WHILE GETTING REPLIES: "
                    for reply in reply_page['items']:
                        all_comments.append(marker + reply['snippet']['textDisplay'])

    print("Fin")
    return all_comments

# Module-level state: get_comments() stores everything it collects in `all_comments`.
# The two counters are initialised here but not updated by any code in this cell.
all_comments = []
qtyReplies = qtyMainComments = 0

# Scrape every comment and reply of the selected video, starting from the first page.
comments = get_comments(youtube_api, video_id, '')

# Show results:
print(f"All total comments obtained: {len(all_comments)}")

Fin
All total comments obtained: 3987


## Converting the list to a DataFrame and caching it as CSV


In [None]:
# Cache the scraped comments on disk so later cells can reload them without
# calling the API again.
df = pd.DataFrame(all_comments, columns=['comment'])
# index=False keeps the row index out of the file; otherwise read_csv returns it
# as a spurious "Unnamed: 0" column.
df.to_csv("dopamine_comments.csv", index=False)

In [8]:
# Reload the cached comments. The rename maps a 'comments' column, if the file
# has one, onto the 'comment' name used by the rest of the notebook.
df = pd.read_csv("dopamine_comments.csv").rename(columns={'comments': 'comment'})

## Data cleaning


In [9]:
def process_content(content):
    """Return the runs of ASCII letters in `content`, joined by single spaces.

    Everything that is not an ASCII letter (digits, punctuation, markup
    characters, non-ASCII text) acts as a separator and is dropped.
    """
    return " ".join(re.findall("[A-Za-z]+", content))


# Markers that get_comments() prepends to every scraped comment, in the form they
# take after process_content() and lower-casing (reply count and punctuation gone).
# Anchored at the start so the same words inside a genuine comment are left alone.
_SCRAPE_MARKER_RE = re.compile(
    r'^(?:comment with replies|first callback reply|while getting replies)\b\s*'
)


def clean_comments(frame):
    """Return a cleaned copy of `frame` holding only rows with a non-empty 'comment'.

    Steps applied to the 'comment' column, in order:
      1. missing values become '' and everything is cast to str;
      2. non-ASCII characters (emoji included) are removed;
      3. the text is cut at the first '<a href="https://' link;
      4. only runs of ASCII letters are kept (see process_content), lower-cased;
      5. the leading marker added by get_comments() is removed;
      6. surrounding whitespace is stripped and rows left empty are dropped.

    Other columns are carried over untouched and the original index labels are
    kept. The input frame is not modified.
    """
    text = frame['comment'].fillna('').astype(str)
    text = text.str.encode('ascii', 'ignore').str.decode('ascii')
    text = text.map(lambda raw: raw.partition('<a href="https://')[0])
    text = text.map(process_content).str.lower()
    text = text.str.replace(_SCRAPE_MARKER_RE, '', regex=True).str.strip()

    cleaned = frame.assign(comment=text)
    # .copy() makes the result an independent frame, so adding columns to it
    # later does not trigger SettingWithCopyWarning.
    return cleaned[cleaned['comment'] != ''].copy()


df = clean_comments(df)

## Creating and Training CNN+LSTM Model

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, LSTM, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.layers import Reshape
from keras.models import load_model

In [4]:
# Labelled training set: 'label' is the target, 'comment' the text fed to the
# tokenizer; 'video_id' is dropped from the feature frame.
data = pd.read_csv("01cleaner_40k.csv")
y = data["label"]
X = data.drop(columns=["video_id", "label"])
X_train_list = X["comment"].tolist()

In [5]:
MAX_SEQ_LENGTH = 128  # fixed sequence length; also used as the model's input_length

# Fit the Keras Tokenizer on the training comments.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_list)

# Encode each comment as a sequence of integers, then bring every sequence to
# MAX_SEQ_LENGTH with padding added at the end ('post').
X_train_seq = tokenizer.texts_to_sequences(X_train_list)
X_train_padded = pad_sequences(
    X_train_seq,
    maxlen=MAX_SEQ_LENGTH,
    padding='post',
)

# Build the CNN-LSTM model
embedding_dim = 128
filters = 64
kernel_size = 3
lstm_units = 64
dropout_rate = 0.5

model = Sequential()
# Vocabulary size is the number of fitted words + 1.
# NOTE(review): the extra slot is presumably for index 0 used as padding - confirm.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=MAX_SEQ_LENGTH))
model.add(Conv1D(filters, kernel_size, activation='relu'))
# return_sequences=True keeps one output per time step so the pooling layer
# below has a time axis to reduce over.
model.add(LSTM(lstm_units, return_sequences=True))
# Max over the time axis: the result is already 2-D (batch, lstm_units), so no
# Reshape is needed before the dense head. The previous Reshape((filters,)) only
# worked because filters == lstm_units.
model.add(GlobalMaxPooling1D())
model.add(Dropout(dropout_rate))
# Single sigmoid unit: binary classification.
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

2024-01-13 13:47:12.565763: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-01-13 13:47:12.566565: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-01-13 13:47:12.567019: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [6]:
# Fit on the whole labelled set (no validation split is passed): 3 epochs,
# mini-batches of 32 samples.
model.fit(x=X_train_padded, y=y, batch_size=32, epochs=3)

Epoch 1/3


2024-01-13 13:47:16.091512: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2024-01-13 13:47:16.195584: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-01-13 13:47:16.196383: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-01-13 13:47:16.196967: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG IN

Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x17a240850>

In [7]:
# saving model
model.save('cnn-lstm_model.h5')

# The network only understands the integer ids produced by the fitted tokenizer,
# so the vocabulary is saved next to the model. Without it the .h5 file cannot be
# used to score new text outside this kernel.
with open('cnn-lstm_tokenizer.json', 'w', encoding='utf-8') as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())

## Testing model and predicting labels

In [10]:
# Encode the scraped comments with the tokenizer fitted on the training data,
# using the same padding settings as for training.
X_new_list = df['comment'].tolist()
X_new_seq = tokenizer.texts_to_sequences(X_new_list)
X_new_padded = pad_sequences(X_new_seq, maxlen=MAX_SEQ_LENGTH, padding='post')

# Predict labels for the new dataset
predictions = model.predict(X_new_padded)

# Scores above the threshold are labelled 1, the rest 0.
threshold = 0.5
# ravel() flattens the (n, 1) model output to one value per comment so it can be
# stored as an ordinary column.
predicted_labels = (predictions.ravel() > threshold).astype(int)

# assign() builds a new frame instead of writing into `df`, which may be a
# filtered slice of an earlier frame (avoids SettingWithCopyWarning).
df = df.assign(predicted_labels=predicted_labels)

 5/76 [>.............................] - ETA: 1s 

2024-01-13 14:29:47.834366: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-01-13 14:29:47.834831: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-01-13 14:29:47.836061: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



In [12]:
# Show up to 100 random rows. The size is capped at len(df) so the cell also works
# for videos with few comments, and the seed makes the sample reproducible.
df.sample(n=min(100, len(df)), random_state=42)

Unnamed: 0,comment,predicted_labels
1785,how do you learn to spike dopamine from effort if hard work isn t providing it to start with,0
919,lifechanging video this knowledge is priceless thank u for explaining all of these complex ideas so simply and well,1
1278,incredible information,0
2135,the metaverse is the dopaminergicverse,0
719,great job mate keep it up,0
2358,top shelf,0
722,for the effort buddy,0
1162,may you be successful in your endeavors amen,0
527,bcz math is more important,0
1836,maybe this explains why after having some really good highly productive days i sometimes get single really bad depressive days where i don t wanna do anything,0


## Calculating gratitude score

In [11]:
# Gratitude score = share of comments whose predicted label is 1
# (the mean of a 0/1 column equals sum / count).
gratitude_score = df["predicted_labels"].mean()
gratitude_score

0.2687681459975114

## Creating Loop for Streamlit app

### Extracting comments from videos

In [36]:
# Markers that get_comments() prepends to every scraped comment, in the form they
# take once the text has been reduced to lower-case letters. Anchored at the start
# so the same words inside a genuine comment are left alone.
_MARKER_PREFIX_RE = re.compile(
    r'^(?:comment with replies|first callback reply|while getting replies)\b\s*'
)


def _clean_scraped_comments(raw_comments):
    """Return the cleaned, non-empty comment texts of one video as a list of str.

    Each text has non-ASCII characters removed, is cut at the first
    '<a href="https://' link, reduced to lower-case runs of ASCII letters joined
    by single spaces, and stripped of the leading get_comments() marker.
    """
    text = pd.Series(raw_comments, dtype=object).astype(str)
    text = text.str.encode('ascii', 'ignore').str.decode('ascii')
    text = text.map(lambda raw: raw.partition('<a href="https://')[0])
    text = text.map(lambda raw: " ".join(re.findall("[A-Za-z]+", raw))).str.lower()
    text = text.str.replace(_MARKER_PREFIX_RE, '', regex=True).str.strip()
    return text[text != ''].tolist()


def score_video(video_id, threshold=0.5):
    """Return the gratitude score of one video.

    The score is the fraction of its cleaned comments whose predicted
    probability is above `threshold`. Returns NaN when the video has no usable
    comments, instead of calling the model on an empty batch.

    Relies on the notebook-level `youtube_api`, `tokenizer`, `model` and
    `MAX_SEQ_LENGTH`.
    """
    # With an empty token get_comments() resets and refills the module-level `all_comments`.
    get_comments(youtube_api, video_id, '')
    texts = _clean_scraped_comments(all_comments)
    if not texts:
        return np.nan

    # Same tokenizer and padding settings as used for training.
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')
    probabilities = model.predict(padded).ravel()
    return (probabilities > threshold).mean()


# One score per searched video, in search order. Unlike an inline loop, this does
# not overwrite `df` or `gratitude_score` from the single-video cells above.
video_id_df = list(video_ids)
gratitude_score_df = [score_video(video_id) for video_id in video_id_df]

Fin
Fin
Fin
Fin
Fin


### Creating dataframe and sorting searched videos by gratitude score

In [37]:
# Scores keyed by the same watch URL that `dataframe` uses, so the two can be joined.
gs_vid_df = pd.DataFrame({
    'url': ['https://www.youtube.com/watch?v=' + str(vid) for vid in video_id_df],
    'gratitude_score': gratitude_score_df,
})

# Dropping a score column left by a previous run keeps this cell re-runnable:
# without it a second merge yields gratitude_score_x / gratitude_score_y and the
# sort below fails.
dataframe = (
    dataframe
    .drop(columns=['gratitude_score'], errors='ignore')
    .merge(gs_vid_df, on='url')
    .sort_values(by=['gratitude_score'], ascending=False)
)
dataframe

Unnamed: 0,thumbnail,url,title,description,channel_title,gratitude_score
0,https://i.ytimg.com/vi/-_g4CAdlx-o/default.jpg,https://www.youtube.com/watch?v=-_g4CAdlx-o,Neuroscientist: Truth About Caffeine | Andrew Huberman #joerogan #shorts,Neuroscientist: Truth About Caffeine | Andrew Huberman #joerogan #hubermanlab #shorts #neuroscience #lifestyle #science ...,Neuro Lifestyle,0.053225
1,https://i.ytimg.com/vi/SwQhKFMxmDY/default.jpg,https://www.youtube.com/watch?v=SwQhKFMxmDY,Change Your Brain: Neuroscientist Dr. Andrew Huberman | Rich Roll Podcast,Thanks for watching! Read all about Dr. Andrew Huberman here https://bit.ly/richroll533 Dr. Andrew Huberman is a ...,Rich Roll,0.180433
2,https://i.ytimg.com/vi/1SOjH67A1B8/default.jpg,https://www.youtube.com/watch?v=1SOjH67A1B8,Neuroscientist: How Mouth Breathing Affects Your Face | Andrew Huberman #flagrant #shorts,Neuroscientist: How Mouth Breathing Affects Face | Andrew Huberman #neuroscience #shorts #hubermanlab #lifestyle #science ...,Neuro Lifestyle,0.067955
3,https://i.ytimg.com/vi/ywjIzd0YW-I/default.jpg,https://www.youtube.com/watch?v=ywjIzd0YW-I,Neuroscientist: You Will Never Lack Focus Again! | Andrew Huberman #neuroscience #shorts,Neuroscientist: You Will Never Lack Focus Again! | Andrew Huberman #neuroscience #shorts #lifestyle #dopamine #focus #study ...,Neuro Lifestyle,0.094801
4,https://i.ytimg.com/vi/QmOF0crdyRU/default.jpg,https://www.youtube.com/watch?v=QmOF0crdyRU,"Controlling Your Dopamine For Motivation, Focus &amp; Satisfaction | Huberman Lab Podcast #39",This episode serves as a sort of “Dopamine Masterclass”. I discuss the immensely powerful chemical that we all make in our brain ...,Andrew Huberman,0.26201


In [40]:
# Display the searched videos ranked from highest to lowest gratitude score.
dataframe.sort_values('gratitude_score', ascending=False)

Unnamed: 0,thumbnail,url,title,description,channel_title,gratitude_score
4,https://i.ytimg.com/vi/QmOF0crdyRU/default.jpg,https://www.youtube.com/watch?v=QmOF0crdyRU,"Controlling Your Dopamine For Motivation, Focus &amp; Satisfaction | Huberman Lab Podcast #39",This episode serves as a sort of “Dopamine Masterclass”. I discuss the immensely powerful chemical that we all make in our brain ...,Andrew Huberman,0.26201
1,https://i.ytimg.com/vi/SwQhKFMxmDY/default.jpg,https://www.youtube.com/watch?v=SwQhKFMxmDY,Change Your Brain: Neuroscientist Dr. Andrew Huberman | Rich Roll Podcast,Thanks for watching! Read all about Dr. Andrew Huberman here https://bit.ly/richroll533 Dr. Andrew Huberman is a ...,Rich Roll,0.180433
3,https://i.ytimg.com/vi/ywjIzd0YW-I/default.jpg,https://www.youtube.com/watch?v=ywjIzd0YW-I,Neuroscientist: You Will Never Lack Focus Again! | Andrew Huberman #neuroscience #shorts,Neuroscientist: You Will Never Lack Focus Again! | Andrew Huberman #neuroscience #shorts #lifestyle #dopamine #focus #study ...,Neuro Lifestyle,0.094801
2,https://i.ytimg.com/vi/1SOjH67A1B8/default.jpg,https://www.youtube.com/watch?v=1SOjH67A1B8,Neuroscientist: How Mouth Breathing Affects Your Face | Andrew Huberman #flagrant #shorts,Neuroscientist: How Mouth Breathing Affects Face | Andrew Huberman #neuroscience #shorts #hubermanlab #lifestyle #science ...,Neuro Lifestyle,0.067955
0,https://i.ytimg.com/vi/-_g4CAdlx-o/default.jpg,https://www.youtube.com/watch?v=-_g4CAdlx-o,Neuroscientist: Truth About Caffeine | Andrew Huberman #joerogan #shorts,Neuroscientist: Truth About Caffeine | Andrew Huberman #joerogan #hubermanlab #shorts #neuroscience #lifestyle #science ...,Neuro Lifestyle,0.053225
