In [1]:
import csv

results = []
with open('video-ids2.csv') as inputfile:
    for row in csv.reader(inputfile):
        results.append(row)

In [2]:
import requests, sys, time, os, time

# List of simple to collect features
snippet_features = ["title",
                    "publishedAt",
                    "channelId",
                    "channelTitle",
                    "categoryId"]

# Any characters to exclude, generally these are things that become problematic in CSV files
unsafe_characters = ['\n', '"']

# Used to identify columns, currently hardcoded order
header = ["video_id"] + snippet_features + ["trending_date", "tags", "view_count", "likes", "dislikes",
                                            "comment_count", "thumbnail_link", "comments_disabled",
                                            "ratings_disabled", "description"]


def setup(api_path):
    with open(api_path, 'r') as file:
        api_key = file.readline()

    # with open(code_path) as file:
    #     country_codes = [x.rstrip() for x in file]

    return api_key


def prepare_feature(feature):
    # Removes any character from the unsafe characters list and surrounds the whole item in quotes
    for ch in unsafe_characters:
        feature = str(feature).replace(ch, "")
    return f'"{feature}"'


def api_request(page_token, country_code):
    # Builds the URL and requests the JSON from it
    request_url = f"https://www.googleapis.com/youtube/v3/videos?id={ids}&key={api_key}&part=id,statistics,snippet"
    request = requests.get(request_url)
    print(request_url)
    print(request)
    if request.status_code == 429:
        print("Temp-Banned due to excess requests, please wait and continue later")
        sys.exit()
    return request.json()


def get_tags(tags_list):
    # Takes a list of tags, prepares each tag and joins them into a string by the pipe character
    return prepare_feature("|".join(tags_list))


def get_videos(items):
    lines = []
    for video in items:
        comments_disabled = False
        ratings_disabled = False

        # We can assume something is wrong with the video if it has no statistics, often this means it has been deleted
        # so we can just skip it
        if "statistics" not in video:
            continue

        # A full explanation of all of these features can be found on the GitHub page for this project
        video_id = prepare_feature(video['id'])

        # Snippet and statistics are sub-dicts of video, containing the most useful info
        snippet = video['snippet']
        statistics = video['statistics']

        # This list contains all of the features in snippet that are 1 deep and require no special processing
        features = [prepare_feature(snippet.get(feature, "")) for feature in snippet_features]

        # The following are special case features which require unique processing, or are not within the snippet dict
        description = snippet.get("description", "")
        thumbnail_link = snippet.get("thumbnails", dict()).get("default", dict()).get("url", "")
        trending_date = time.strftime("%y.%d.%m")
        tags = get_tags(snippet.get("tags", ["[none]"]))
        view_count = statistics.get("viewCount", 0)

        # This may be unclear, essentially the way the API works is that if a video has comments or ratings disabled
        # then it has no feature for it, thus if they don't exist in the statistics dict we know they are disabled
        if 'likeCount' in statistics and 'dislikeCount' in statistics:
            likes = statistics['likeCount']
            dislikes = statistics['dislikeCount']
        else:
            ratings_disabled = True
            likes = 0
            dislikes = 0

        if 'commentCount' in statistics:
            comment_count = statistics['commentCount']
        else:
            comments_disabled = True
            comment_count = 0

        # Compiles all of the various bits of info into one consistently formatted line
        line = [video_id] + features + [prepare_feature(x) for x in [trending_date, tags, view_count, likes, dislikes,
                                                                       comment_count, thumbnail_link, comments_disabled,
                                                                       ratings_disabled, description]]
        lines.append(",".join(line))
    return lines


def get_pages(country_code, next_page_token="&"):
    country_data = []

    # Because the API uses page tokens (which are literally just the same function of numbers everywhere) it is much
    # more inconvenient to iterate over pages, but that is what is done here.
    while next_page_token is not None:
        # A page of data i.e. a list of videos and all needed data
        video_data_page = api_request(next_page_token, country_code)

        # Get the next page token and build a string which can be injected into the request with it, unless it's None,
        # then let the whole thing be None so that the loop ends after this cycle
        next_page_token = video_data_page.get("nextPageToken", None)
        next_page_token = f"&pageToken={next_page_token}&" if next_page_token is not None else next_page_token

        # Get all of the items as a list and let get_videos return the needed features
        items = video_data_page.get('items', [])
        country_data += get_videos(items)
        time.sleep(6)

    return country_data


def write_to_file(country_code, country_data):

    print(f"Writing {country_code} data to file...")

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with open(f"{output_dir}/{time.strftime('%y.%d.%m')}_{country_code}_videos.csv", "a", encoding='utf-8') as file:
        for row in country_data:
            file.write(f"{row}\n")


def get_data():
    for country_code in country_codes:
        country_data = [",".join(header)] + get_pages(country_code)
        write_to_file(country_code, country_data)


if __name__ == "__main__":
    output_dir = "data/all_videos.csv"
    key_path = "key.txt"
    COUNTRY_CODE = "US"
    api_key = setup(key_path)
    country_codes = ["US"]
    print(api_key)
    print(int(len(results) / 50))
    for i in range(int(len(results[0]) / 50)):
        ids = ""              
        for id in results[0][i * 50: 50 + (i * 50)]:
            ids += id + ","
        ids = ids.rstrip(',')
        get_data()

AIzaSyA9Z3-3ixz8YjsuF_ldNnUEw22nVFxy0P0
0
https://www.googleapis.com/youtube/v3/videos?id=mESMnQlkuWM,wCCQIVZ3rd0,DIkUx1QvjmA,a7nWH9hQkNw,mMpujiJV5Mg,xQ7IsuRaG8A,1p0Y9YBvm4Q,snXWM3y-gm0,jmJlqLLe5Ik,Ku67K_oa6xA,E8O-0f6CaGY,zlY_CmJoVxY,Zqfi1pcfGQQ,rfMi1UsQIhM,3lpvP1uEC4M,C_m0HTmFe0E,9zU-OKkxZ58,n3OuM736Qv4,7Agl3QY-9b4,65MewCDHZ7U,lRRHjSn-T0o,13s5bnH1aSA,K_GEkLGXkVM,qNHWnOlQTWU,XR1kvpXDXKA,mhgT7LXvJIE,IQLRGkVQ6UE,QOsFhcLAdZY,61PDF0SuaGI,by5Mv01-bi8,jtJ-n3HOckQ,Adu30n9N-K8,URpGSpWI20E,XWd4hY-M3EI,C1lkg3JRff8,SfBVgPGWrX4,ApPZrXzRaHM,06ZHa9EiHt4,jfV8tDE8hO8,ogdQ6GmRwko,XyWss39xvpE,L-dslA6kN50,4IHTLecnuaw,PcPMaJUIjbk,L6jRtms1t48,SbqeImOtkmw,9ZU-Hs2sg70,OzTrj8fRp4U,76oDk7K3n80,SmMG6gN3oaM&key=AIzaSyA9Z3-3ixz8YjsuF_ldNnUEw22nVFxy0P0&part=id,statistics,snippet
<Response [200]>
Writing US data to file...
https://www.googleapis.com/youtube/v3/videos?id=ngFIt5CHcZI,9MVtF2jCklA,D-zA6-tncn4,bQhi7HA_xI8,zedJfOhiDUM,3Ar0FT-hVoI,fZGNy3_t6sY,ponpz5rQSYU,Dt2xYvj2Chc,cGc40V1krTc,LWVpe-bthpA,O5A7oI4htfE,1os

In [32]:
import pandas as pd
df = pd.read_csv('scraped_videos.csv')
df = df[df['video_id'] != 'video_id']
df

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description
0,9I9-uVUFLEw,"Trastornos oncológicos, neurológico y fase ter...",2020-10-24T18:18:12Z,UCXp1aqIieRQ7HnMdb1hh4WA,Clases de Apoyo - FM Enfermería,27,21.27.04,[none],94,3,0,0,https://i.ytimg.com/vi/9I9-uVUFLEw/default.jpg,False,False,"Clase dada por Emilio Gonzalez, estudiante de ..."
1,HDxXsS9QCiI,Riko Basic tani wózek dziecięcy 3w1 2w1,2019-01-29T12:23:12Z,UCZ3L1p0a7orG7lYvBVb1AvA,EURO-CART wózki dziecięce,22,21.27.04,tani wózek|riko basic|wózek basic|wózek basik|...,79867,111,9,32,https://i.ytimg.com/vi/HDxXsS9QCiI/default.jpg,False,False,"Szukasz taniego wózka dziecięcego 3w1 lub 2w1,..."
2,1ittmkW5u1I,48 Extreme Flight MXS-EXP “Beater”,2015-02-13T11:30:01Z,UCvrwZrKFfn3fxbkpiSIW4UQ,TripleThreatRC,28,21.27.04,Extreme Flight MXS EXP|TripleThreatRC|rgthd007...,8690,96,3,60,https://i.ytimg.com/vi/1ittmkW5u1I/default.jpg,False,False,This is forgotten footage from New Years Day a...
3,B0XAFo9dCsY,WC4: Top 10 Generals!,2018-07-28T18:18:44Z,UC9BAR73kn29UV4rrfrZGJ-A,juljas,20,21.27.04,juljas|wc4 generals|wc4 mod|world conqueror 4|...,16405,301,15,115,https://i.ytimg.com/vi/B0XAFo9dCsY/default.jpg,False,False,https://www.strawpoll.me/16166818 would you jo...
4,Vk-Y6LhK23o,#t9p,2019-01-03T12:07:38Z,UCr2xXMcS9cWwQPtkdYeXZig,Top10 Tik Tok LIKE Funny,22,21.27.04,[none],1,0,0,0,https://i.ytimg.com/vi/Vk-Y6LhK23o/default.jpg,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14682,Y3SjuKI6bZQ,"Arden Gate, 10 Communication Row, Birmingham, ...",2020-11-05T16:06:32Z,UC57ZbB-UTzqOrwo0uOI4jPw,Martin & Co Birmingham City & Harborne,1,21.27.04,[none],51,0,0,0,https://i.ytimg.com/vi/Y3SjuKI6bZQ/default.jpg,False,False,
14683,2GWc_lGbSDg,วิธีการปลูกอ้อยด้วยเทคโนโลยีสมัยใหม่จาก NKS ใน...,2020-10-08T05:04:38Z,UChNxbaTnYnEF5VCIYwRGU-A,NKS ProSugarcane,28,21.27.04,ปลูกอ้อย|วิธีปลูกอ้อย|การปลูกอ้อย|แปลงอ้อย|รถไ...,11860,62,3,4,https://i.ytimg.com/vi/2GWc_lGbSDg/default.jpg,False,False,เป็นวิธีการปลูกอ้อยที่มีการระเบิดดินดานไว้ตอนป...
14684,KaFyzaHNcro,t6y,2020-11-24T15:35:20Z,UCk4hEundjeI73e4VUwyoV6w,have g ffg Vp,22,21.27.04,[none],8,0,0,0,https://i.ytimg.com/vi/KaFyzaHNcro/default.jpg,False,False,
14685,zQ-Xuetq87A,Brøyt x2b at rc-huset Jessheim.,2016-08-18T06:31:25Z,UC87KRLnDUJKCifg81xyY2_A,Hein Høljeneset,22,21.27.04,[none],430,2,0,0,https://i.ytimg.com/vi/zQ-Xuetq87A/default.jpg,False,False,Brøyt x2b at rc huset Jessheim.
