In [1]:
import nltk
import json
import pandas as pd
import csv
import os

# Videos and transcripts

In [19]:
def _value_or_default(value, default):
    """Return ``value``, or ``default`` when ``value`` is missing.

    A value counts as missing when it is falsy (``None``, ``""``, ``0``) or a
    float NaN.
    """
    # NaN is truthy, so a plain truthiness test would pass it through and
    # json.dumps would then emit a bare NaN token. NaN is the only float that
    # compares unequal to itself.
    if isinstance(value, float) and value != value:
        return default
    return value if value else default


def transform_video(video):
    """Convert one video record into a nested dict shaped like a ``youtube#video`` item.

    Args:
        video: Mapping-like record (e.g. a DataFrame row) providing the keys
            ``annotation``, ``uuid``, ``category_id``, ``channel_id``,
            ``default_audio_language``, ``description``, ``title``,
            ``published_at`` (must expose ``.isoformat()``) and the five
            ``*_count`` statistics.

    Returns:
        dict: JSON-serialisable structure with ``annotation``, ``snippet`` and
        ``statistics`` sections. Missing optional snippet fields become ``""``
        and missing statistics become ``0``. ``tags``, ``thumbnails``,
        ``relatedVideos`` and ``annotations`` are always emitted empty.
    """
    return {
        "annotation": {
            "annotations": [
            ],
            "label": video['annotation'],
            "manual_review_label": video['annotation']
        },
        "etag": "",
        "id": video['uuid'],
        "isSeed": True,
        "kind": "youtube#video",
        "relatedVideos": [
        ],
        "search_term": "",
        "snippet": {
            "categoryId": _value_or_default(video['category_id'], ""),
            "channelId": _value_or_default(video['channel_id'], ""),
            "channelTitle": "",
            "defaultAudioLanguage": _value_or_default(video['default_audio_language'], ""),
            "description": video['description'],
            "liveBroadcastContent": "none",
            "localized": {
                "description": video['description'],
                "title": video['title']
            },
            "publishedAt": video['published_at'].isoformat(),
            "tags": [
            ],
            "thumbnails": {
            },
            "title": video['title']
        },
        "statistics": {
            "commentCount": _value_or_default(video['comment_count'], 0),
            "dislikeCount": _value_or_default(video['dislike_count'], 0),
            "favoriteCount": _value_or_default(video['favourite_count'], 0),
            "likeCount": _value_or_default(video['like_count'], 0),
            "viewCount": _value_or_default(video['view_count'], 0)
        }
    }


def transform_video_to_transcript(video):
    """Build the transcript record for one video.

    Every entry of ``video['clean_transcript']`` is split into sentences with
    ``nltk.sent_tokenize``; each sentence is lower-cased and collected, in
    order, under ``captions``. The video's ``uuid`` is stored under ``id``.
    """
    lowered_sentences = []
    for line in video['clean_transcript']:
        for sent in nltk.sent_tokenize(line):
            lowered_sentences.append(sent.lower())

    record = {"captions": lowered_sentences}
    record["id"] = video['uuid']
    return record

In [2]:
# Load the videos DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load 'videos.p' from a trusted source.
videos = pd.read_pickle('videos.p')

In [3]:
# Summarise the videos frame: columns, non-null counts and dtypes.
videos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2800 entries, 0 to 2799
Data columns (total 40 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   is_deleted                 2800 non-null   bool   
 1   deleted_at                 0 non-null      object 
 2   uuid                       2800 non-null   object 
 3   published_at               2800 non-null   object 
 4   title                      2800 non-null   object 
 5   description                2800 non-null   object 
 6   default_language           845 non-null    object 
 7   default_audio_language     1905 non-null   object 
 8   duration                   2800 non-null   object 
 9   dimension                  2800 non-null   object 
 10  definition                 2800 non-null   object 
 11  caption                    2800 non-null   object 
 12  licensed_content           2800 non-null   bool   
 13  projection                 2800 non-null   objec

In [4]:
# Count rows whose (uuid, annotation) pair repeats an earlier row.
int(videos.duplicated(subset=['uuid', 'annotation']).sum())

175

In [5]:
# Label distribution among rows whose uuid repeats an earlier row.
videos.loc[videos['uuid'].duplicated(), 'annotation'].value_counts()

neutral    175
unknown      2
Name: annotation, dtype: int64

In [6]:
# Keep only rows that carry an annotation other than 'unknown'.
is_labelled = videos['annotation'].notna() & videos['annotation'].ne('unknown')
videos = videos.loc[is_labelled]

In [7]:
# Keep the first row for each video uuid. Reassigning the result (rather than
# using inplace=True on a frame that was itself produced by boolean filtering)
# avoids SettingWithCopyWarning and makes the lineage of `videos` explicit.
videos = videos.drop_duplicates(subset='uuid', keep='first')
videos.shape

(2622, 40)

In [8]:
# Label distribution of the videos that remain after filtering and de-duplication.
videos['annotation'].value_counts()

neutral      1459
debunking     758
promoting     405
Name: annotation, dtype: int64

In [9]:
# Export the current videos frame to CSV, quoting every field and omitting the index column.
videos.to_csv("annotated-data.csv", index=False, quoting=csv.QUOTE_ALL)

In [24]:
# Replace missing engagement counts with 0 in each statistics column.
for count_column in (
    'comment_count',
    'dislike_count',
    'like_count',
    'favourite_count',
    'view_count',
):
    videos[count_column] = videos[count_column].fillna(0)

In [29]:
# Write all videos as a single JSON array with one record per line.
# The separator is written *between* records instead of appending ",\n" after
# each one and trimming the tail with seek()/truncate(): arithmetic on tell()
# is undefined for text-mode files, breaks when '\n' is translated to '\r\n',
# and raised on an empty frame. The bytes produced for non-empty input are the
# same as before; an empty frame now yields "[]".
with open('groundtruth_dataset.json', 'w') as f:
    f.write("[")
    for position, (_, video) in enumerate(videos.iterrows()):
        if position:
            f.write(',\n')
        f.write(json.dumps(transform_video(video)))
    f.write("]")

In [31]:
# Write every video's transcript record as a single JSON array, one record per line.
# Separators go *between* records rather than being trimmed afterwards with
# seek()/truncate(): offset arithmetic on tell() is undefined for text-mode
# files, breaks under '\n' -> '\r\n' translation, and raised on an empty frame.
# Output for non-empty input is unchanged; an empty frame now yields "[]".
with open('groundtruth_videos_transcripts.json', 'w') as f:
    f.write("[")
    for position, (_, video) in enumerate(videos.iterrows()):
        if position:
            f.write(',\n')
        f.write(json.dumps(transform_video_to_transcript(video)))
    f.write("]")

# Comments

In [32]:
def transform_comment(video_uuid, comment_ids):
    """Pair a video uuid with its comment ids.

    Returns a dict with the ids under ``comments`` and the video uuid under ``id``.
    """
    return dict(comments=comment_ids, id=video_uuid)


def comment_to_youtube_response(comment):
    """Wrap one comment record in a nested ``snippet``/``topLevelComment`` dict.

    Args:
        comment: Mapping-like record (e.g. a DataFrame row). ``published_at``
            and ``youtube_update_timestamp`` must expose ``.isoformat()``.

    Returns:
        dict: ``{'snippet': {'topLevelComment': {'snippet': {...}}, 'canReply',
        'totalReplyCount', 'isPublic'}}``. No ``authorChannelId`` entry is
        emitted.
    """
    # (output key, source column) pairs copied verbatim into the inner snippet.
    copied_fields = (
        ('authorDisplayName', 'author_display_name'),
        ('authorProfileImageUrl', 'author_profile_image_url'),
        ('authorChannelUrl', 'author_channel_url'),
        ('textDisplay', 'text_display'),
        ('textOriginal', 'text_original'),
        ('parentId', 'parent_uuid'),
        ('canRate', 'can_rate'),
        ('likeCount', 'like_count'),
    )
    comment_snippet = {}
    for output_key, source_column in copied_fields:
        comment_snippet[output_key] = comment[source_column]

    # Timestamps are serialised as ISO-8601 strings.
    comment_snippet['publishedAt'] = comment['published_at'].isoformat()
    comment_snippet['updatedAt'] = comment['youtube_update_timestamp'].isoformat()

    thread_snippet = {'topLevelComment': {'snippet': comment_snippet}}
    thread_snippet['canReply'] = comment['can_reply']
    thread_snippet['totalReplyCount'] = comment['total_reply_count']
    thread_snippet['isPublic'] = comment['is_public']
    return {'snippet': thread_snippet}

In [33]:
# Load the comments table from CSV.
comments = pd.read_csv('youtube-comments.csv')

In [34]:
# Summarise the comments frame: columns, non-null counts and dtypes.
comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 487500 entries, 0 to 487499
Data columns (total 25 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   is_deleted                487500 non-null  object 
 1   deleted_at                0 non-null       float64
 2   uuid                      487500 non-null  object 
 3   author_display_name       487478 non-null  object 
 4   author_profile_image_url  487500 non-null  object 
 5   author_channel_url        487500 non-null  object 
 6   text_display              487500 non-null  object 
 7   text_original             487492 non-null  object 
 8   can_rate                  487500 non-null  object 
 9   like_count                487500 non-null  int64  
 10  published_at              487500 non-null  object 
 11  youtube_update_timestamp  487500 non-null  object 
 12  can_reply                 242112 non-null  object 
 13  total_reply_count         242112 non-null  f

In [35]:
# Preview the first rows of the comments frame.
comments.head()

Unnamed: 0,is_deleted,deleted_at,uuid,author_display_name,author_profile_image_url,author_channel_url,text_display,text_original,can_rate,like_count,...,created_at,updated_at,parent_uuid,id,author_channel_id,video_id,deleted_by_youtube,days_unseen,deletion_reason,rank
0,f,,96YKgre5-ty9O6TLLDYOMz,Loaded L.V.X,https://yt3.ggpht.com/ytc/AAUvwnh1VasSPajajfnS...,http://www.youtube.com/channel/UCeSSjOw0i_qSvN...,Yup. Turn off the tv,Yup. Turn off the tv,t,0,...,3/6/2021 10:49:22.558691+02:02,3/6/2021 10:49:22.558691+02:02,Ugx1uOdPhGcYDp4vdYx4AaABAg,1688076,839180,929,f,0,,1
1,f,,9BU8wWpCyWT9O6TEUF9Xfu,Loaded L.V.X,https://yt3.ggpht.com/ytc/AAUvwnh1VasSPajajfnS...,http://www.youtube.com/channel/UCeSSjOw0i_qSvN...,Covid is S C A M,Covid is S C A M,t,0,...,3/6/2021 10:49:17.667701+02:02,3/6/2021 10:49:17.667701+02:02,UgxwMFl7ARvTS8GPSVl4AaABAg,1687965,839180,929,f,0,,2
2,f,,97tRiCemPmn9O6SdzBYiA_,Loaded L.V.X,https://yt3.ggpht.com/ytc/AAUvwnh1VasSPajajfnS...,http://www.youtube.com/channel/UCeSSjOw0i_qSvN...,Lol covid is a scam fam,Lol covid is a scam fam,t,0,...,3/6/2021 10:48:43.017541+02:02,3/6/2021 10:48:43.017541+02:02,UgypGibX3le34T6siTR4AaABAg,1687274,839180,929,f,0,,3
3,f,,90S4UpDa9ZJ9O5zgjEEqsW,CZ TS !!!,https://yt3.ggpht.com/ytc/AAUvwngEUS6Z0xr9Ckq3...,http://www.youtube.com/channel/UCSxE1mUNJT5zat...,@Doug Spindler so this research has reach a de...,@Doug Spindler so this research has reach a de...,t,0,...,3/6/2021 10:47:39.47139+02:02,3/6/2021 10:47:39.47139+02:02,UgyhRSWFtcAwMOvVNDh4AaABAg,1686260,838548,929,f,0,,4
4,f,,UgxMgWqxG8f8WTXFGW54AaABAg,Cheli A E,https://yt3.ggpht.com/QDPXtAeF9iyFHQs6YfXlYr_T...,http://www.youtube.com/channel/UCKV8LCG9T4WzIL...,Brilliant!! Thank you.,Brilliant!! Thank you.,t,0,...,3/6/2021 10:48:17.590593+02:02,3/6/2021 10:48:17.590593+02:02,,1686909,838907,929,f,0,,5


In [36]:
# Translate each comment's numeric video_id into the video's uuid and keep only
# comments whose video is present in `videos`.
video_uuids = videos.set_index('id')['uuid'].to_dict()
# Series.map with a dict yields NaN for ids that have no entry.
comments['video_uuid'] = comments['video_id'].map(video_uuids)
# Take an explicit copy: later cells assign new columns on this filtered frame,
# which would otherwise risk SettingWithCopyWarning / writes to a temporary view.
comments = comments.loc[comments['video_uuid'].notna()].copy()

In [37]:
# Size of the comments frame after keeping only comments that map to a known video.
comments.shape

(444605, 26)

In [38]:
# Group comment uuids per video and write them as a single JSON array, one
# record per line.
comments_by_video = comments.groupby('video_uuid')['uuid'].apply(list).to_dict()
# Separators are written *between* records instead of trimming a trailing ",\n"
# with seek()/truncate(): offset arithmetic on tell() is undefined for
# text-mode files, breaks under '\n' -> '\r\n' translation, and raised when
# there were no records. Output for non-empty input is unchanged.
with open('groundtruth_videos_comments_ids.json', 'w') as f:
    f.write("[")
    for position, (video_uuid, comment_ids) in enumerate(comments_by_video.items()):
        if position:
            f.write(',\n')
        f.write(json.dumps(transform_comment(video_uuid, comment_ids)))
    f.write("]")

In [33]:
# Parse the two timestamp columns into pandas datetimes.
for timestamp_column in ('published_at', 'youtube_update_timestamp'):
    comments[timestamp_column] = pd.to_datetime(comments[timestamp_column])

In [35]:
# Persist the comments frame; a later cell reloads it with pd.read_pickle('comments.p').
comments.to_pickle('comments.p')

In [36]:
# Write one file per video at comments/<uuid>/<uuid>.json containing one JSON
# document per line, one line per comment.
comments_by_video = comments.groupby('video_uuid')
for video_uuid, video_comments in comments_by_video:
    serialized_comments = []
    for _, comment in video_comments.iterrows():
        serialized_comments.append(json.dumps(comment_to_youtube_response(comment)))
    body = '\n'.join(serialized_comments)

    # Create the per-video directory only after serialisation succeeded.
    os.makedirs(f'comments/{video_uuid}', exist_ok=True)

    with open(f'comments/{video_uuid}/{video_uuid}.json', 'w') as f:
        f.write(body)

# Create FastText train data

In [37]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re

In [38]:
# Instantiate an NLTK WordNet lemmatizer and a Porter stemmer.
# NOTE(review): neither object is referenced by any other cell visible in this
# notebook — confirm they are still needed.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

In [39]:
# Patterns are compiled once instead of on every call.
_HTML_TAG_RE = re.compile('<.*?>')
_URL_RE = re.compile(r'http\S+')
_DIGIT_RUN_RE = re.compile('[0-9]+')

# Lazily-filled cache for the tokenizer and the English stop-word set.
_PREPROCESS_CACHE = {}


def _tokenizer_and_stopwords():
    """Return the word tokenizer and English stop-word set, building them once."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['tokenizer'] = RegexpTokenizer(r'\w+')
        # A frozenset gives constant-time membership tests; the original looked
        # the list up again (and scanned it linearly) for every single token.
        _PREPROCESS_CACHE['stopwords'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_CACHE['tokenizer'], _PREPROCESS_CACHE['stopwords']


def preprocess(sentence):
    """Normalise a piece of text into a space-separated string of kept tokens.

    Steps, in order: stringify with ``str()`` and lower-case; remove the
    literal ``{html}``; strip ``<...>`` tags; strip ``http...`` URLs; strip
    digit runs; tokenize on ``\\w+``; drop tokens of two characters or fewer
    and English stop words.

    Args:
        sentence: Text to clean. Non-string input is converted with ``str()``.

    Returns:
        str: The surviving tokens joined by single spaces (possibly empty).
    """
    text = str(sentence).lower().replace('{html}', "")
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _DIGIT_RUN_RE.sub('', text)

    tokenizer, english_stopwords = _tokenizer_and_stopwords()
    filtered_words = [
        w for w in tokenizer.tokenize(text)
        if len(w) > 2 and w not in english_stopwords
    ]
    return " ".join(filtered_words)

In [40]:
print('Video snippets')
# groundtruth_dataset.json is written earlier in this notebook as ONE JSON
# array whose lines end in "," (and whose first/last lines carry "[" / "]"),
# so parsing it line by line with json.loads fails. Parse the whole document.
with open('groundtruth_dataset.json', 'r') as f:
    video_records = json.load(f)

# One preprocessed video description per output line.
with open('video_snippet_train_data.txt', 'w') as output:
    for record in video_records:
        sentence = preprocess(record['snippet']['description'])
        output.write(sentence + '\n')

Video snippets


In [41]:
print('Comments')
# NOTE(review): unpickling can execute arbitrary code; 'comments.p' is the file
# written earlier in this notebook.
comments = pd.read_pickle('comments.p')

COMMENT_SAMPLE_SIZE = 100000
COMMENT_SAMPLE_SEED = 42  # fixed so the sampled training set is reproducible

# Drop missing texts first: preprocess() stringifies its input, so a NaN would
# otherwise be written out as the literal token "nan".
comment_texts = comments['text_original'].dropna()
# Cap the sample size so the cell does not raise when fewer rows are available.
sampled_texts = comment_texts.sample(
    min(COMMENT_SAMPLE_SIZE, len(comment_texts)),
    random_state=COMMENT_SAMPLE_SEED,
)

# One preprocessed comment per output line.
with open('video_comments_train_data.txt', 'w') as output:
    for comment in sampled_texts:
        sentence = preprocess(comment)
        output.write(sentence + '\n')

Comments


In [42]:
print('Video transcripts')
# groundtruth_videos_transcripts.json is written earlier in this notebook as
# ONE JSON array (lines end in ",", first/last lines carry "[" / "]"), so
# per-line json.loads fails. Parse the whole document instead.
with open('groundtruth_videos_transcripts.json', 'r') as f:
    transcript_records = json.load(f)

# One preprocessed caption sentence per output line.
with open('video_transcript_train_data.txt', 'w') as output:
    for record in transcript_records:
        for sentence in record['captions']:
            sentence = preprocess(sentence)
            output.write(sentence + '\n')

Video transcripts


In [43]:
print('Video tags')
# groundtruth_dataset.json is written earlier in this notebook as ONE JSON
# array (lines end in ",", first/last lines carry "[" / "]"), so per-line
# json.loads fails. Parse the whole document instead.
with open('groundtruth_dataset.json', 'r') as f:
    video_records = json.load(f)

# One line per video that has tags: its tags joined by single spaces.
# NOTE(review): transform_video in this notebook always emits "tags": [], so
# with a dataset produced here this file ends up empty — confirm the intended
# tag source.
with open('video_tags_train_data.txt', 'w') as output:
    for record in video_records:
        tags = record['snippet']['tags']
        if len(tags) > 0:
            output.write(' '.join(tags) + '\n')

Video tags
