# Normalization of labels for our annotated videos

This notebook normalizes our video annotations and saves them in a single unified file, so that they are easier to work with later (mapping, analysing, etc.).

In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Input and output folders sit side by side under ../Data
# (resolved relative to the current working directory).
DATA_ROOT = os.path.join('..', 'Data')
RAW_DATA_PATH = os.path.join(DATA_ROOT, 'raw_data')
NORMALIZED_DATA_PATH = os.path.join(DATA_ROOT, 'normalized_data')

## Load data

In [None]:
# Both raw annotation files are read directly from RAW_DATA_PATH.
# RAW_DATA_PATH already ends in 'raw_data', so the segment must not be repeated:
# joining it a second time resolves to ../Data/raw_data/raw_data/recommendations.csv,
# which is inconsistent with how search_results.csv is located.
recommendations = pd.read_csv(os.path.join(RAW_DATA_PATH, 'recommendations.csv'))
search_results = pd.read_csv(os.path.join(RAW_DATA_PATH, 'search_results.csv'))

Drop videos for which no annotation was given (encoded as `-2` in the `annotation` column).

In [None]:
# Rows whose annotation equals -2.0 are the ones without an annotation;
# keep every other row in both frames.
recommendations = recommendations[recommendations['annotation'].ne(-2.0)]
search_results = search_results[search_results['annotation'].ne(-2.0)]

In [None]:
# Stack both sources into one frame; the original row indices are kept as-is.
annotated_frames = [recommendations, search_results]
combined = pd.concat(annotated_frames)

## Aggregate all annotations for individual videos

In [None]:
# For every video id, collect the set of distinct annotation values it received
# across all rows of the combined frame.
video_to_label_mapping = {}

for youtube_id, annotation in zip(combined['youtube_id'], combined['annotation']):
    video_to_label_mapping.setdefault(youtube_id, set()).add(annotation)

## Map aggregated annotations to final normalized label

In [None]:
# Maps each raw annotation code to its normalized label (-1, 0, 1) or to NaN
# when the code has no normalized label.
# NOTE(review): judging by the aggregation logic in this notebook, 1 is the
# "promoting" label and -1 the "debunking" one; confirm the meaning of the
# individual raw codes against the annotation guidelines.
# np.nan is used instead of np.NaN: the np.NaN alias was removed in NumPy 2.0.
normalization_mapping = {
    -1: -1,
    0: 0,
    1: 1,
    2: -1,
    3: 0,
    4: 1,
    5: 0,
    6: np.nan,
    7: np.nan,
    8: np.nan
}

In [None]:
# Resolve the set of raw annotations of every video to one normalized label.
#
# Raw codes are grouped by the normalized label they map to in
# normalization_mapping.
PROMOTING_CODES = {1, 4}    # normalize to 1
DEBUNKING_CODES = {-1, 2}   # normalize to -1
NEUTRAL_CODES = {0, 3, 5}   # normalize to 0 (presumably "neutral" -- confirm with annotation guidelines)
# Rating used when conflicting annotations cannot be resolved; it normalizes to NaN.
UNRESOLVED_RATING = 7

normalized_unique_videos = []

for video_id, annotations in video_to_label_mapping.items():
    has_promoting = bool(PROMOTING_CODES & annotations)
    has_debunking = bool(DEBUNKING_CODES & annotations)
    has_neutral = bool(NEUTRAL_CODES & annotations)

    if len(annotations) == 1:
        # Video has only one unique annotation.
        # Read it without pop(): pop() would empty the set stored in
        # video_to_label_mapping, so re-running this cell would send the
        # video to the unresolved fallback and label it NaN.
        rating = next(iter(annotations))
    # Video has more than one unique annotation
    elif has_promoting and not has_debunking:
        rating = 1
    elif has_debunking and not has_promoting:
        rating = -1
    elif has_neutral and not (has_promoting or has_debunking):
        rating = 0
    # Video id "az6c7negl6o" is problematic in our raw data, containing both
    # promoting and debunking views. Our consensus is the video is promoting.
    elif video_id == 'az6c7negl6o':
        rating = 1
    else:
        rating = UNRESOLVED_RATING

    # Strict lookup on purpose: an unknown raw code raises KeyError instead of
    # being silently turned into a label.
    normalized_rating = normalization_mapping[rating]
    normalized_unique_videos.append((video_id, normalized_rating))

## Save data with normalized labels

In [None]:
# Turn the (video id, normalized label) pairs into a two-column table and display it.
label_columns = ['youtube_id', 'normalized_label']
data_with_normalized_labels = pd.DataFrame(data=normalized_unique_videos, columns=label_columns)
data_with_normalized_labels

Unnamed: 0,youtube_id,normalized_label
0,W0lWsqAwYwY,0.0
1,6mMK6iSZsAs,-1.0
2,WYPNjSoDrqw,0.0
3,JHQ8UAjoVVc,0.0
4,9vJRopau0g0,0.0
...,...,...
2909,BIDlAPNwbS8,0.0
2910,8dpkmUjJ8xU,-1.0
2911,WCwXJMVVdck,0.0
2912,Ms65JBrevYU,0.0


In [None]:
# Make sure the output folder exists; to_csv does not create missing directories.
os.makedirs(NORMALIZED_DATA_PATH, exist_ok=True)
# The row index is written as the first (unnamed) CSV column (pandas default),
# so the file layout stays the same as before.
data_with_normalized_labels.to_csv(os.path.join(NORMALIZED_DATA_PATH, 'encountered_videos.csv'))