## YouTube Data API Pipeline
- Currently searches YouTube and collects the metadata associated with each returned video
- Misc. Resources: https://developers.google.com/youtube/v3/docs/search/list#usage


In [3]:
import json
import requests
import pandas as pd
import os
import pickle
from collections import Counter, defaultdict
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter

In [7]:
# --- API credentials -------------------------------------------------------
key_index = 0
api_key_list = []   # insert YouTube API key here

# --- Audit configuration ---------------------------------------------------
desired_topics = [
    "Fentanyl",
    "Percocet (Oxycodone)",
    "Heroin",
    "Codeine",
    "Kratom",
    "Narcan",
    "Suboxone",
    "Methadone",
]

# sort orders passed as the `order` parameter of each search request
sort_filters = [
    "relevance",
    "viewCount",
    "date",
    "rating",
]
desired_media_type = "video"
num_search_results = 10    # number of desired search results

### Load search queries from JSON

In [11]:
# Read the topic -> list-of-queries mapping from the JSON file next to the notebook
with open("topic_to_queries.json") as queries_file:
    topic_to_queries = json.loads(queries_file.read())

In [13]:
# Flatten the per-topic query lists into one list and report how many queries there are
queries_total = [query for topic_queries in topic_to_queries.values() for query in topic_queries]
total = len(queries_total)
print(total)

73


### Python Code for collecting metadata associated with a particular video

In [4]:
"""
Returns the metadata of the provided videos using the YouTube Data API. 
The returned metadata is in dictionary format: video id (key) to its corresponding metadata (value) 

param:
- video_id_list: a list of video ids you want to collect metadata
- api_key: youtube api_key
"""
def gather_metadata_API(video_id_list, api_key):
    # videos
    metadata = {}
    count = 0 
    for video_id in video_id_list:  
        count += 1
        print(count)
        url = "https://www.googleapis.com/youtube/v3/videos?part=statistics,snippet,contentDetails&id=" + video_id + "&key=" + api_key
        source = requests.get(url).json()
        metadata[video_id] = source

    return metadata

"""
Returns the metadata of the provided videos using the YouTube Data API. 
The returned metadata is in dictionary format: video id (key) to its corresponding metadata (value) 

param:
- video_id_list: a list of video ids you want to collect metadata
- api_key: youtube api_key
"""
def gather_metadata_API_single_id(video_id, api_key):
    # videos
    url = "https://www.googleapis.com/youtube/v3/videos?part=statistics,snippet,contentDetails&id=" + video_id + "&key=" + api_key
    source = requests.get(url).json()
    return source

### Python Code for searching using YouTube Data API and parsing results 

In [5]:
"""
Returns search results based on the provided metadata using the YouTube API

param:
- num_results: number of results you want returned
- api_key: api key to make the search request
- query: the term you want to query the YouTube search with
- sort_filter: the sorting order of the search results (i.e. relevance, view count, date, rating)
- media_type: what type of search results you want returned (i.e. video, channels, playlists)
""" 
def search_on_youtube(num_results, api_key, query, sort_filter, media_type):
    url = "https://www.googleapis.com/youtube/v3/search?part=snippet&maxResults=" + str(num_results) + "&type=" + media_type + "&q=" + query + "&order=" + sort_filter + "&key=" + api_key
    search_results_json = requests.get(url).json()
    
    try:
        results_json = search_results_json["items"]
    except:
        print("FAILED")
        print(query)
        print(api_key)
        print(search_results_json)
        return None
    return search_results_json

"""
Returns a dictionary containing video-based search results across the sorting filters using the YouTube API. 
Dictionary maps from the sorting filter (i.e. key) to the video-based search results

param:
- num_results: number of results you want returned
- api_key: api key to make the search request
- query: the term you want to query the YouTube search with
- sort_filter: the sorting order of the search results (i.e. relevance, view count, date, rating)
""" 
def search_query_video_across_filters(num_results, api_key, query):
    filter_to_search_results = {}
    for filter_ in sort_filters:
        search_json = search_on_youtube(num_results, api_key, query, filter_, desired_media_type)
        if search_json == None:
            return None
        filter_to_search_results[filter_] = search_json
    return filter_to_search_results

In [6]:
"""
Parses the provided json metadata from search results into list 

param:
- json_metadata: json mapping containing the metadata
- query: query associated with the provided json metadata
- filter_: sort_by filter associated with the provided json metadata
""" 
def parse_json_search_results(json_metadata, query, filter_):
    list_metadata = []
    try:
        results_json = json_metadata["items"]
    except:
        print(query)
        print(json_metadata)
        return query
    
    rank = 0
    # parse each item in the list
    for result_json in results_json:
        snippet = result_json["snippet"]

        video_id = result_json["id"]["videoId"]
        video_url = "https://www.youtube.com/watch?v=" + video_id   # crafting url link to the video using video_id
        video_title = snippet["title"]
        video_description = snippet["description"]
        channel_name = snippet["channelTitle"]
        channel_id = snippet["channelId"]
        published_time = snippet["publishedAt"]
        rank += 1
        list_metadata.append([rank, query, filter_, video_id, video_url, video_title, video_description, channel_name, channel_id, published_time])
        
    return list_metadata

In [7]:
"""
# transforming the given metadata list into pandas dataframe

param:
- json_metadata: json mapping containing the metadata
""" 
def transform_pandas(list_metadata):
    df = pd.DataFrame(list_metadata)
    df.columns=["ranking", "query", "sort_by_filter", "video_id", "video_url", "video_title", "video_description", "channel_name", "channel_id", "publish_time"] 
    print(df.shape)
    return df

In [163]:
# collect data
# Run every query under every sort filter, moving on to the next API key
# whenever a search fails or the query counter reaches a multiple of 25.
index = 0
num_iterated = 0
api_key = api_key_list[index]

query_results = {}   # query -> {sort filter -> raw search response}
for topic, queries in topic_to_queries.items():
    for query in queries:
        print(query)
        query_json = search_query_video_across_filters(num_search_results, api_key, query)
        # None means a search failed with the current key: retry the same
        # query with the next key. Running past the last key raises IndexError.
        while query_json is None:
            index += 1
            num_iterated = 1
            api_key = api_key_list[index]
            query_json = search_query_video_across_filters(num_search_results, api_key, query)
        query_results[query] = query_json
        num_iterated += 1

        # proactively rotate to the next key every 25 counted queries
        if num_iterated % 25 == 0:
            index += 1
            api_key = api_key_list[index]

fentanyl
overdose fentanyl
fentanyl drug
what is fentanyl
fentanyl documentary
fentanyl crisis
fentanyl addict
fentailo
fentynal
percocet
oxycodone
oxycontin
oxy
oxycotton
oxycotin
heroin addict
on heroin
heroin
heroin drug
heroin addiction
herion
codeine
codeina
codine
codiene
codein
codien
codeine pills
kratom withdrawal
kratom
what is kratom
kratom review
red kratom
kratom extract
best kratom
kratom tea
kratom high
kratom effects
kratom benefits
kratom psychosis
narcan
narcan training
narcan video
narcan use
narcan overdose
how to use narcan
nasal narcan
naloxone
narcan rescue
narcan saves life
narcan reaction
suboxone
suboxone withdrawal
how to take suboxone
taking suboxone
suboxone clinic
how does suboxone work
suboxone detox
suboxone high
suboxone taper
suboxone strips
what is suboxone
suboxone film
methadone
methadone clinic
methadone withdrawal
methadone detox
what is methadone
house methadone
methadone addiction
methadone high
methadone clinic experience
methadone nursing


In [164]:
# aggregating the previously collected data and transforming to pandas dataframe
list_metadata_aggregate = []   # parsed rows across all queries and sort filters
list_failed_queries = []       # queries whose response could not be parsed
for query, filter_json in query_results.items():
    # The loop variable is named search_json rather than json so that the
    # imported json module is not overwritten for the rest of the notebook.
    for filter_, search_json in filter_json.items():
        parsed = parse_json_search_results(search_json, query, filter_)
        # parse_json_search_results returns a list of rows on success and
        # the query string on failure
        if isinstance(parsed, list):
            list_metadata_aggregate += parsed
        else:
            list_failed_queries.append(parsed)

df_search_results = transform_pandas(list_metadata_aggregate)

(2893, 10)


In [166]:
# Number of distinct videos across all queries and sort filters
unique_video_ids = set(df_search_results["video_id"])
len(unique_video_ids)

1776

### Collecting remaining metadata for each video

In [8]:
def _duration_to_seconds(iso_duration):
    """Convert an ISO 8601 duration such as "P1DT2H3M4S" or "PT5M" to seconds."""
    if "D" in iso_duration:
        # Day form "P<d>DT...": drop the leading "P", read the day count,
        # then drop the "T" that follows the "D".
        day_text, _, clock = iso_duration[1:].partition("D")
        total = int(day_text) * 24 * 60 * 60
        clock = clock[1:]
    else:
        # Time-only form "PT...": drop the "PT" prefix.
        total = 0
        clock = iso_duration[2:]

    # Consume the hour, minute and second components in order; each is optional.
    for unit, unit_seconds in (("H", 60 * 60), ("M", 60), ("S", 1)):
        if unit in clock:
            amount, _, clock = clock.partition(unit)
            total += int(amount) * unit_seconds
    return total


def _stat_count(statistics, key):
    """Return statistics[key] as an int, or 0 when that counter is absent."""
    return int(statistics[key]) if key in statistics else 0


def collect_metadata(data):
    """
    Extract the fields of interest from a `videos` API response.

    param:
    - data: decoded JSON response for a single video id

    returns:
    - tuple (title, description, publish_time, channel_id, channel_name,
      tags, duration, view, like, dislike, fav, comment)
      * tags is the "tags" value from the snippet, or "" when absent
      * duration is the video length in seconds
      * each count is 0 when the response does not report it
      When the response has no "items" key (the response is printed) or the
      items list is empty, the text fields are "" and the numeric fields 0.
    """
    if 'items' not in data:
        print(data)
        return "", "", "", "", "", "", 0, 0, 0, 0, 0, 0

    if not data["items"]:
        return "", "", "", "", "", "", 0, 0, 0, 0, 0, 0

    item = data["items"][0]
    snippet = item["snippet"]

    tags = snippet["tags"] if "tags" in snippet else ""
    title = snippet['title']
    description = snippet['description']
    publish_time = snippet['publishedAt']
    channel_id = snippet['channelId']
    channel_name = snippet['channelTitle']

    duration = _duration_to_seconds(item["contentDetails"]["duration"])

    statistics = item["statistics"]
    view = _stat_count(statistics, "viewCount")
    like = _stat_count(statistics, "likeCount")
    dislike = _stat_count(statistics, "dislikeCount")
    fav = _stat_count(statistics, "favoriteCount")
    comment = _stat_count(statistics, "commentCount")

    return title, description, publish_time, channel_id, channel_name, tags, duration, view, like, dislike, fav, comment

In [9]:
# shared formatter used to turn a fetched transcript into text
formatter = TextFormatter()


def get_transcript(video_id):
    """
    Fetch the English ('en' or 'en-US') transcript of a video as formatted text.

    param:
    - video_id: id of the video whose transcript is wanted

    returns:
    - the formatted transcript, or the caught exception object itself when
      fetching or formatting the transcript fails
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id, languages=['en', 'en-US'])
        return formatter.format_transcript(segments)
    except Exception as exception:
        return exception

In [189]:
# using youtube data api to gather tags and a library to get transcripts
# for search audit metadata gathering
api_key = api_key_list[6]
list_tags = []
list_duration = []
list_views = []
list_like = []
list_dislike = []
list_fav = []
list_comment = []
list_transcript = []

# The same video shows up under several queries / sort filters, so keep what
# was already fetched and reuse it instead of spending API quota again.
fetched_by_video_id = {}   # video_id -> (metadata tuple, transcript)
for i, row in df_search_results.iterrows():
    video_id = row["video_id"]
    print(video_id)

    if video_id not in fetched_by_video_id:
        source = gather_metadata_API_single_id(video_id, api_key)
        fetched_by_video_id[video_id] = (collect_metadata(source), get_transcript(video_id))
    metadata, transcript = fetched_by_video_id[video_id]
    title, description, publish_time, channel_id, channel_name, tags, duration, view, like, dislike, fav, comment = metadata

    # appending (one entry per row of df_search_results, in row order)
    list_tags.append(tags)
    list_duration.append(duration)
    list_views.append(view)
    list_like.append(like)
    list_dislike.append(dislike)
    list_fav.append(fav)
    list_comment.append(comment)
    list_transcript.append(transcript)

JqqfI-bIvnI
H-Il9w-hIgg
lmxF2owm3Gg
BjsSuWKz2s0
wpYcOznq0d8
BG70kTLfS7w
2awPPVKmpRQ
C0tW8FWBm1g
_Gv-7yHScco
lwbjAKQKmoM
Jd76HxqCPf0
F8dVH85cCZ0
28rJqj-7pEY
BG70kTLfS7w
Z2N2jqTkUlo
82QhIOgJy1c
4gA61UowVHk
WCqNLCzN-Fk
5nA0PurW0Ks
LxyyvW_fcqw
hw9BG5jsE-g
iFqcdVgYMf4
ufROgG1N6ZI
eK3E0LbbnHk
ww5QdWNw8m8
3OGQIgAgtnM
67U1TXgQmIA
VGHAk9aOz0E
pix13S7FDxo
wpbO3J9sVdo
TK_hNn3eJWE
0KVHg3oSyeM
Tj9CxN8YLEU
3OGQIgAgtnM
U6nRlgWh76w
bEWY7Ps6uy4
OxUiFwoqy_o
eGEa08s8ZdM
4RlNoRtl3m0
OsfX14y20oI
2awPPVKmpRQ
Jd76HxqCPf0
lmxF2owm3Gg
xXplHkdgKj0
hSsKgM8-SfE
_6kEcb2Hs9c
S0Vzz_P9JBk
sLKmwufKhcM
JqqfI-bIvnI
MEd_mydGqwA
Jd76HxqCPf0
BG70kTLfS7w
4gA61UowVHk
WCqNLCzN-Fk
LxyyvW_fcqw
GWBzxr3c29s
C0tW8FWBm1g
wKT5ziEwt9A
MTUAOMjrcAw
dcb9G775KaY
3OGQIgAgtnM
sR09EvKa6Ss
OaCUD3IliR8
8bCyXhnrRdU
Zp2tqrlv6r8
zActh3RA1j8
rdy8VmHC0UY
QKGJHiqZl88
oIxGJOhxVso
EYJJ3wLCvgo
wVSHrMt1IfY
AYUn6AFlH00
TK_hNn3eJWE
Tj9CxN8YLEU
sd3UjDVbdCI
3OGQIgAgtnM
PcFKkh_FwqM
tJJ4P--_ZaA
U6nRlgWh76w
bEWY7Ps6uy4
JqqfI-bIvnI
H-Il9w-hIgg
lmxF2owm3Gg
wpYc

BCSdwI5aUCg
YPuCSxz3Sis
hSFzikSncGA
E9xjvF3jefU
S7DQINgNFTo
VqMTF4kwCoQ
TyBNuy6cD4A
1lqrnJ1CDAY
Xqyfxr4LTrY
fUT_OsclQfY
cskq_zGVSZs
5mqGw1X-C0M
82QhIOgJy1c
km_5DwLMAgc
bczHLJ6ClbQ
3Q0agc9Re2M
IhfZ7ZPPRQA
P2EIkUaD1Zo
6o4zOMNlSco
bSiZPR6zUrA
fYN14UfO-Uc
5IZrYeUX3MI
JsUH8llvTZo
4B9bY4AIu8M
FdwsbedTO2U
28rJqj-7pEY
PY9DcIMGxMs
i--yLz_i44c
PHVGL1BDXXU
82QhIOgJy1c
ARh25YI99DM
3qaH2x1_B5Q
YPuCSxz3Sis
sR09EvKa6Ss
qEd96qzhDnE
5Uxe86NaHX4
gyfaVBdktgA
GzJJ04iOJ5w
jP07Lb_xzNg
IVvuaetdZAQ
YPuCSxz3Sis
GzJJ04iOJ5w
fP6w_LagX40
vyyZmVZwf1I
7YBSaTaUZhY
6e8AYeSMSW0
fUT_OsclQfY
6dbDvfBnuUQ
GTnj09HBVTU
CD8rDufLSh4
cskq_zGVSZs
_Cnhq4pCpxM
3Q0agc9Re2M
zeU2tDRTlMM
24Fut-RNHtE
bSiZPR6zUrA
55N7syoDRaU
-SCTChAc7ew
YZUg86Lc_Gs
bczHLJ6ClbQ
bnCT0GXgLJg
_qlJlqlfvuw
BG70kTLfS7w
_FyHhC3Y8Fc
i--yLz_i44c
H6ZFzEW7_Q4
NaMgdlUcsko
pR_4t09yqCM
IBQMNg517Ug
l9Z1uTX7z58
ARh25YI99DM
3qaH2x1_B5Q
YPuCSxz3Sis
Te2BVuTkyrc
IVvuaetdZAQ
Wd5KVpRgRyg
NBG_QiFQUYY
6e8AYeSMSW0
GTnj09HBVTU
ltu56gqdH6o
gXY8G0rWylA
ExJSnMRU0AU
YPuCSxz3Sis
vyyZ

jv_vTAovgkg
hyHZbCjo51Q
AnUN2Zs4Mnk
dw0x0P13wzU
zaELqM99E9o
2x4YXk4fxNE
_YNttmPssHo
4b35ZLVEZeM
Fz-h8HEbe1Q
R4WsU7UBdTo
7hPiVuhyie0
0-gDyawB6AM
jPztjdHpTuk
AqNetM7OWj0
qJCoCqX8hmY
QKCPoSw-BOo
EYWy77vZsYY
P9SIjuHaiXA
l9-TrCUTg_0
3byDHCAupJ0
ugqQrBsrL-U
AqNetM7OWj0
ZFjpBvbGbGo
QKCPoSw-BOo
nmMCQ1y8l14
m8ZCEj1sYME
9K4BmLWi2NE
yPnpPQr8488
i3dOgkJlCrg
MS96YMLdSjU
Vrek3oQ9XAc
wqORzSt4C-s
LmVhrhtBUBg
9iDpMi_iWq8
BlP6OhUKWxs
7iCME2mggn8
dBI4O4o0pSM
mKKYus-MiQo
GI3blNNe56w
W-1dlgtMX8A
eTSrh9tNvWY
LIV_OcaQThc
sTnEOuhK3vc
_xcSROItnz4
AOBix9l_eVA
IpzUOPQ31ag
s9e91o_s5_g
EYWy77vZsYY
R-SPhLxJREQ
m8ZCEj1sYME
-q0Lff9L3uA
QKCPoSw-BOo
ZFjpBvbGbGo
R4WsU7UBdTo
Tg39tCjXkCA
tkQ454XrWFQ
XZN9NQUVUfk
P9SIjuHaiXA
EPopfBLWcHM
EkXZDBNbzm0
EzLrv3DX6es
l9-TrCUTg_0
HedEeSDzNSY
oizEd4i8pwA
3byDHCAupJ0
raYJCAU5T4g
CyYW9JSxaQM
53WI6EfZIR0
kSbiKVY1VsE
vxG3gwS-Djs
6VLahTrwE3c
0Pu8oC84XKA
7PT0gv6a97o
xhUjoA0IxKI
EzLrv3DX6es
Sz2-yYuXzv4
vvnE0Lpl85g
JeHljEPFkPc
vxG3gwS-Djs
wdsG2yueQek
CoxIUO1WuLM
VRzc9d4MbIA
nbYA4CVPF2A
9pBA

tMPbazJJ_vw
d4RoGdsOG1w
tzHKfZyevXo
mpxvJ0d0J9E
X5PPExoAems
yBUu-m9zDK8
ZtpTKAi7LIc
Kswanxs3aJA
h1ymBlRPnzM
HMchXc5lemU
bHXbe85JzM0
yhRWh2UGww0
Ip-rFzr426I
tKhNB0nYrUE
HMchXc5lemU
X5PPExoAems
eRCmnWD9pms
KBJl0IDK3Jg
pXYZgeDaMqE
2LRKvlpUCHM
ARY2MYDCpys
XMvjeDLNxno
bjQQ9SBN808
yBUu-m9zDK8
aVmPt6w7WJU
48h5bpQRaFA
pK4WSQmAAxQ
4C_p23DUsL8
Ys3JxzJ1-yo
DFbE8kwzBvs
baDFjWSkZqo
QXd6xT10zmc
EahMiNTSQ6U
wj07rzxHt6g
uvYVmdgNQHE
3Wy_eH3RzgI
pLrcMoljmkw
XmF_mjWfk7Y
IoNXZqj3SmU
N8Esdv2IbWw
o8blbs71Kw8
4qEqk_W_sd8
b86kn81EAYc
MSuhNh2pZBY
AGU-BM6dRqM
Fhi5wZWDMPc
ctXutXKmfTE
8gDCnmt29ls
4EcClL5yETc
kwT_33k0zSE
X5PPExoAems
J4jCb-kflio
oBvUfoaHt-s
jr7kdcVJdUE
X5PPExoAems
4F9QSJAWFeg
yP2LRe-EiR8
IRv65okZCnc
pXYZgeDaMqE
VijV2XM4i6s
TcKe9MlAwKY
yl1mmnclAHs
yBUu-m9zDK8
nB7AmsGvCbw
Jc-buPCKisM
hj8h791StCk
EM_t8k5cr2o
T7sNS7S5Zvk
nRKdE2rX1sg
bZZrk3T2ZEE
MSuhNh2pZBY
wqgI9gUsRqQ
gtGw8fUhzcc
FmGalSsq63k
o8blbs71Kw8
MSuhNh2pZBY
W-7_alg4I28
0iirH86rej4
OVf8ZIv8Stc
0hmJB24vwEI
5Y8w_RHWNPk
Ha9LX6gfZYM
ijTbnFuo81Y
uAaK

zPsdG5Qom0s
4F9QSJAWFeg
mPm9BLFHgwQ
6KctTNUNbMw
bSiZPR6zUrA
dw6laQ4-Zgs
kkMtfNV23Fs
fZ7GMj2TFCk
y_-TwEqq4O0
h-FwZ6PEKp4
ClCFwxKuIBM
SY44e1OOQGQ
SNE8j5d43Ag
7A-eW3Nb4IQ
id-VjL_keXc
X5PPExoAems
Y7_KGm98lGs
Jz869peR54s
ClCFwxKuIBM
4F9QSJAWFeg
fgSLEakjlAs
Buydv-PGFqs
ZxD5DjRFKos
Od2Ma8ROdl8
aOUcfKXgt-M
pK4WSQmAAxQ
4C_p23DUsL8
Ys3JxzJ1-yo
DFbE8kwzBvs
a7CG3Q25ZPM
a3uMHhnBGz0
uM2JJsAntBY
uvYVmdgNQHE
cph07IX_zik
Bm8cE37qqc0
OFGFeA6Ap7E
2MUiu02rzUI
89PhuyR8q84
PEDOp7UQEbc
4smL18U8v-Y
CAaTFDD6isg
mPm9BLFHgwQ
4F9QSJAWFeg
HTpYWe0DQmM
rzMFyOltqhc
qN_qDF0ynMw
kA4y2iyV6lI
tQxRsis3hzs
1PxdRekV_N8
fZ7GMj2TFCk
X5PPExoAems
7A-eW3Nb4IQ
tQxRsis3hzs
-lTCT7M-sG4
X5PPExoAems
Jz869peR54s
4F9QSJAWFeg
bDU8VCAiOw8
bSiZPR6zUrA
ZxD5DjRFKos
mPm9BLFHgwQ
tcLptZrg5qg
ZxD5DjRFKos
kya7Y62nMQU
XXZISdcLTBQ
uhZb1-KHneE
Ih2HHhRIdgQ
RIpxjldBbdc
PBrckWBh5jw
CRQriXQMhMA
SHv6ogdu7gg
zaotyFKdI3U
jU6oDFGHsk4
lP9btLujW2M
9yMLmqC5gr4
o4UmO8wgGo0
tcLptZrg5qg
il0XI6B8yvk
PBrckWBh5jw
yG31MFU_MfA
Y1lqDSnLxAQ
fZ7GMj2TFCk
BAY7zAS2MVo
Xr9W

In [191]:
# Attach the per-video metadata lists to the search results as new columns
search_metadata_columns = {
    'tags': list_tags,
    'duration': list_duration,
    'views': list_views,
    'likes': list_like,
    'dislikes': list_dislike,
    'fav': list_fav,
    'comments': list_comment,
    'transcript': list_transcript,
}
for column_name, column_values in search_metadata_columns.items():
    df_search_results[column_name] = column_values

In [192]:
# Persist the enriched search-audit table to disk (written into the current working directory)
df_search_results.to_csv("search_results_metadata.csv")

### Recommendation Audit Metadata Scraping

In [11]:
# Load the video ids gathered by the recommendation audit from the working directory
recommendation_ids_path = os.getcwd() + '/recommendation-data/recommendations-all-level-unique-video-id.json'
with open(recommendation_ids_path, 'rb') as f:
    recommendation_video_id = json.load(f)

In [87]:
# One independent, empty accumulator list per output column of the recommendation audit
(
    list_video_id,
    list_video_url,
    list_titles,
    list_descriptions,
    list_publish_time,
    list_channel_id,
    list_channel_name,
    list_tags,
    list_duration,
    list_views,
    list_like,
    list_dislike,
    list_fav,
    list_comment,
    list_transcript,
) = ([] for _ in range(15))

In [None]:
# Collect metadata and transcripts for the videos in retry_video_id.
# NOTE: retry_video_id is built in a separate cell of this notebook, which
# has to be run before this one.
j = 3
api_key = api_key_list[j]
not_found = 0
# Set mirror of list_video_id: the already-collected check below runs once
# per video, and scanning the (very long) list each time is quadratic.
collected_video_ids = set(list_video_id)
for i, video_id in enumerate(retry_video_id):
    if video_id in collected_video_ids:
        continue

    # move on to the next API key every 9500 positions
    if (i+1) % 9500 == 0:
        j += 1
        api_key = api_key_list[j]
    if i % 100 == 0:
        print(i)

    source = gather_metadata_API_single_id(video_id, api_key)
    title, description, publish_time, channel_id, channel_name, tags, duration, view, like, dislike, fav, comment = collect_metadata(source)
    transcript = get_transcript(video_id)

    # an empty title means collect_metadata found no item for this id
    if title == "":
        not_found += 1
        if not_found % 10 == 1:
            print(video_id)
            print("Not found: " + str(not_found))

    # appending
    collected_video_ids.add(video_id)
    list_video_id.append(video_id)
    list_video_url.append('https://www.youtube.com/watch?v=' + video_id)
    list_titles.append(title)
    list_descriptions.append(description)
    list_publish_time.append(publish_time)
    list_channel_id.append(channel_id)
    list_channel_name.append(channel_name)
    list_tags.append(tags)
    list_duration.append(duration)
    list_views.append(view)
    list_like.append(like)
    list_dislike.append(dislike)
    list_fav.append(fav)
    list_comment.append(comment)
    list_transcript.append(transcript)

In [84]:
# Video ids of the df1 rows whose title is missing
missing_title_mask = df1['video_title'].isnull()
retry_video_id = df1.loc[missing_title_mask, 'video_id'].tolist()

In [22]:
# Number of video ids collected into list_video_id so far
len(list_video_id)

164085

In [91]:
# Number of videos whose metadata came back with an empty title
not_found

274

In [80]:
# Assemble the collected recommendation-audit lists into a DataFrame, one column per list
df_copy = pd.DataFrame()
recommendation_columns = [
    ('video_id', list_video_id),
    ('video_url', list_video_url),
    ('video_title', list_titles),
    ('video_description', list_descriptions),
    ('channel_name', list_channel_name),
    ('channel_id', list_channel_id),
    ('publish_time', list_publish_time),
    ('tags', list_tags),
    ('duration', list_duration),
    ('views', list_views),
    ('likes', list_like),
    ('dislikes', list_dislike),
    ('fav', list_fav),
    ('comments', list_comment),
    ('transcript', list_transcript),
]
for column_name, column_values in recommendation_columns:
    df_copy[column_name] = column_values