# Comparison with data from study by Hussein et al.

This notebook compares the metrics from our study with those from the study by Hussein et al.

It assumes that the data from Hussein et al. are available in the folder "../Hussein". You can download them here: https://github.com/social-comp/YouTubeAudit-data

In [1]:
import pandas as pd

# Up-next + top-5 recommendations

In [2]:
# load Hussein data from CSV
hussein_all_results = pd.read_csv('../Hussein/all_results.csv')

# replace nan values so that rows with missing attributes are not dropped
# when these columns are later used as groupby keys
hussein_all_results['gender'] = hussein_all_results['gender'].fillna('unknown')
hussein_all_results['age_group'] = hussein_all_results['age_group'].fillna('unknown')
hussein_all_results['geo_temperature'] = hussein_all_results['geo_temperature'].fillna('unknown')
hussein_all_results['geolocation'] = hussein_all_results['geolocation'].fillna('unknown')
hussein_all_results['order'] = hussein_all_results['order'].fillna('unknown')
hussein_all_results['vid_order'] = hussein_all_results['vid_order'].fillna('unknown')
hussein_all_results['stance'] = hussein_all_results['stance'].fillna('unknown')

# filter promoting watch experiment (recommendation components only);
# the mask is built from `hussein_all_results` itself so that the cell also
# works on a fresh kernel, where `_hussein` does not exist yet
_hussein = hussein_all_results.loc[
    (hussein_all_results['activity'] == 'Watch') &
    (hussein_all_results['stance'] == 'promoting') &
    hussein_all_results['component_name'].isin(['Top5', 'UpNext'])
]
# keep only rows with order >= 10
# NOTE(review): missing 'order' values were replaced by the string 'unknown'
# above, so this comparison presumably relies on the filtered rows having a
# numeric order - confirm against the data
_hussein = _hussein.loc[
    (_hussein['order'] >= 10)
]
# the "top-5" set deliberately contains both the Top5 and the UpNext component
hussein_top5 = _hussein.loc[
    _hussein['component_name'].isin(['Top5', 'UpNext'])
]
hussein_up_next = _hussein.loc[
    (_hussein['component_name'] == 'UpNext')
]

In [616]:
# load our recommendations
recommendations = pd.read_csv('../Data/recommendations.csv', index_col=0)

# up-next: only the first recommendation position, for the selected seed
# sequences, excluding every experiment whose name contains "likes"
our_up_next = recommendations[
    recommendations['seed_sequence'].isin([1, 2, 39, 40])
    & recommendations['position'].eq(1)
    & ~recommendations['experiment'].str.contains('likes')
]
# top-5: same selection, but recommendation positions 1 through 6 (inclusive)
our_top5 = recommendations[
    recommendations['seed_sequence'].isin([1, 2, 39, 40])
    & recommendations['position'].between(1, 6)
    & ~recommendations['experiment'].str.contains('likes')
]

In [619]:
# helper functions

def get_grouped(our, hussein):
    """Average `normalized_annotation` per observation unit in both datasets.

    Hussein's rows are averaged per combination of activity type, gender,
    age group, geolocation, geo temperature, order and topic; our rows are
    averaged per bot, seed sequence and topic.

    Returns:
        Tuple `(our_grouped, hussein_grouped)` of flat data frames holding the
        grouping columns plus the mean `normalized_annotation`.
    """
    def mean_annotation(frame, keys):
        # mean annotation per key combination, keys turned back into columns
        means = frame.groupby(keys)['normalized_annotation'].mean()
        return means.reset_index()

    hussein_keys = [
        'activity_type', 'gender', 'age_group', 'geolocation',
        'geo_temperature', 'order', 'topic',
    ]
    our_keys = ['bot_id', 'seed_sequence', 'topic']

    return mean_annotation(our, our_keys), mean_annotation(hussein, hussein_keys)

# compare function for statistical significance testing

def compare(data1_df, data2_df, groups, data1_label, data2_label):
    """Run a two-sided Mann-Whitney U test per group and tabulate the results.

    Args:
        data1_df: Data frame with a `group` and a `value` column (first sample).
        data2_df: Data frame with a `group` and a `value` column (second sample).
        groups: Group labels to test; the significance level 0.05 is
            Bonferroni-corrected by the number of labels.
        data1_label: Label used for the first sample in the printed header and
            in the result column names.
        data2_label: Label used for the second sample.

    Returns:
        Data frame indexed by group with mean, std and count of both samples,
        the test statistic, the p-value and a textual conclusion. Groups that
        have no data in one of the frames, or whose values are all identical
        in one of the samples, are reported on stdout and left out.
    """
    # imported locally because the notebook's import cell only provides pandas
    from scipy.stats import mannwhitneyu

    print(f'Difference of {data1_label} and {data2_label}')
    print()

    alpha = 0.05
    # apply Bonferroni correction to alpha; an empty group list must not
    # trigger a division by zero
    alpha /= max(len(groups), 1)
    print('alpha', alpha)

    results = {}
    for group in groups:
        data1 = data1_df.loc[data1_df['group'] == group, 'value']
        data2 = data2_df.loc[data2_df['group'] == group, 'value']

        # a group may be missing from one frame (e.g. after filtering both
        # frames to common queries); the test is undefined for empty samples
        if data1.empty or data2.empty:
            print(f'{group} No data')
            continue

        if len(set(data1)) == 1 or len(set(data2)) == 1:
            print(f'{group} All data identical')
            continue

        stat, p = mannwhitneyu(data1, data2, alternative='two-sided')

        results[group] = {
            f'{data1_label} mean': round(data1.mean() * 100) / 100,
            f'{data2_label} mean': round(data2.mean() * 100) / 100,
            f'{data1_label} std': round(data1.std() * 100) / 100,
            f'{data2_label} std': round(data2.std() * 100) / 100,
            f'{data1_label} count': len(data1),
            f'{data2_label} count': len(data2),
            'Statistics': stat,
            'p-value': p,
            'conclusion': 'Same distribution (fail to reject H0)' if p > alpha else 'Different distribution (reject H0)'
        }

    # one row per tested group
    return pd.DataFrame(results).T

def compare_topics(our, hussein):
    """Test Hussein's against our mean annotations separately for each topic.

    Both inputs are aggregated with `get_grouped`; every topic present in our
    aggregated data becomes one group passed to `compare`.
    """
    our_grouped, hussein_grouped = get_grouped(our, hussein)

    # `compare` expects the columns `group` and `value`
    column_map = {'topic': 'group', 'normalized_annotation': 'value'}
    topics = our_grouped['topic'].unique()

    hussein_values = hussein_grouped.rename(columns=column_map)
    our_values = our_grouped.rename(columns=column_map)

    return compare(hussein_values, our_values, topics, 'Hussein', 'Our')

def compare_all(our, hussein):
    """Test Hussein's against our mean annotations with all topics pooled.

    Both inputs are aggregated with `get_grouped` and then placed into a
    single group labelled 'All' before being passed to `compare`.
    """
    our_grouped, hussein_grouped = get_grouped(our, hussein)

    value_column = {'normalized_annotation': 'value'}
    # every aggregated row belongs to the same pooled group
    hussein_values = hussein_grouped.rename(columns=value_column).assign(group='All')
    our_values = our_grouped.rename(columns=value_column).assign(group='All')

    return compare(hussein_values, our_values, ['All'], 'Hussein', 'Our')

## Top-5

In [645]:
# per-topic comparison of the top-5 recommendation sets (Hussein vs. ours)
print('Top-5')
compare_topics(our_top5, hussein_top5)

Top-5
Difference of Hussein and Our

alpha 0.01


Unnamed: 0,Hussein mean,Our mean,Hussein std,Our std,Hussein count,Our count,Statistics,p-value,conclusion
911,0.14,0.26,0.15,0.28,90,40,1363.0,0.0226175,Same distribution (fail to reject H0)
chemtrails,0.05,0.03,0.14,0.17,90,40,2134.5,0.0624297,Same distribution (fail to reject H0)
flatearth,-0.16,-0.15,0.29,0.22,94,40,1922.5,0.832136,Same distribution (fail to reject H0)
moonlanding,-0.08,-0.32,0.27,0.34,100,40,2954.5,8.08844e-06,Different distribution (reject H0)
vaccines,-0.28,-0.0,0.22,0.18,100,38,664.0,1.62892e-09,Different distribution (reject H0)


In [646]:
# comparison of the top-5 recommendation sets with all topics pooled together
print('Top-5')
compare_all(our_top5, hussein_top5)

Top-5
Difference of Hussein and Our

alpha 0.05


Unnamed: 0,Hussein count,Hussein mean,Hussein std,Our count,Our mean,Our std,Statistics,conclusion,p-value
All,474,-0.07,0.27,198,-0.04,0.31,45781.5,Same distribution (fail to reject H0),0.608923


## Up-next

In [648]:
# per-topic comparison of the up-next recommendations (Hussein vs. ours)
print('Up-next')
compare_topics(our_up_next, hussein_up_next)

Up-next
Difference of Hussein and Our

alpha 0.01


Unnamed: 0,Hussein mean,Our mean,Hussein std,Our std,Hussein count,Our count,Statistics,p-value,conclusion
911,0.27,0.05,0.49,0.23,90,40,2021.0,0.142291,Same distribution (fail to reject H0)
chemtrails,-0.13,0.0,0.48,0.32,90,40,1572.0,0.0992991,Same distribution (fail to reject H0)
flatearth,-0.14,-0.29,0.58,0.52,94,40,1844.0,0.840633,Same distribution (fail to reject H0)
moonlanding,-0.34,-0.52,0.55,0.6,100,40,2362.0,0.058285,Same distribution (fail to reject H0)
vaccines,-0.76,-0.08,0.45,0.55,100,38,744.5,1.25217e-10,Different distribution (reject H0)


In [649]:
# comparison of the up-next recommendations with all topics pooled together
print('Up-next')
compare_all(our_up_next, hussein_up_next)

Up-next
Difference of Hussein and Our

alpha 0.05


Unnamed: 0,Hussein count,Hussein mean,Hussein std,Our count,Our mean,Our std,Statistics,conclusion,p-value
All,474,-0.23,0.61,198,-0.17,0.51,41762,Different distribution (reject H0),0.00947299


# Search results

In [4]:
# prepare Hussein search data

# search results that were collected during the "Watch" activity
hussein_watch_search_results = hussein_all_results.loc[
    (hussein_all_results['activity'] == 'Watch') &
    (hussein_all_results['component_name'] == 'SearchResults')
].copy()

# all rows that were collected during the "Search" activity
hussein_search_results = hussein_all_results.loc[
    (hussein_all_results['activity'] == 'Search')
].copy()

# SERP scores published by Hussein et al.
hussein_serp = pd.read_csv('../Hussein/all_Top10_SERP-MM.csv')

# attach the query text to the search results via their query id
hussein_query_by_qid = hussein_serp.set_index('qid')['query'].to_dict()


def _query_for_qid(qid):
    """Return the query text for `qid`; an unknown id raises KeyError."""
    return hussein_query_by_qid[qid]


hussein_search_results['query'] = hussein_search_results['qid'].apply(_query_for_qid)
hussein_watch_search_results['query'] = hussein_watch_search_results['qid'].apply(_query_for_qid)

# keep only the SERP scores of the promoting watch experiment
hussein_serp_filtered = hussein_serp.loc[
    (hussein_serp['activity'] == 'Watch') &
    (hussein_serp['stance'] == 'promoting')
]

In [655]:
# prepare our search data

search_results = pd.read_csv('../Data/search_results.csv', index_col=0)
# filter usable and annotated search results
search_results = search_results.loc[
    ~search_results['experiment'].str.contains('likes')
]

# number of results per result page that enter the SERP-MS score
num_results = 10
# keep the first `num_results` rows of every result page
# NOTE(review): assumes the rows of a page are stored in ranking order and
# that `position` is 1-based - confirm against the data
search_results = search_results.groupby([
    'topic', 'query', 'seed_sequence', 'bot_id'
]).head(num_results)
# keep search results from end of phase 1
search_results = search_results.loc[
    (
        (search_results['experiment'] != 'vaccines') &
        search_results['seed_sequence'].isin([40])
    ) | (
        (search_results['experiment'] == 'vaccines') &
        search_results['seed_sequence'].isin([38])
    )
]

# calculate SERP-MS: every annotation is weighted by its rank
# (num_results - position + 1), the weights are summed per result page and
# normalised by the sum of all rank weights, num_results * (num_results + 1) / 2
our_serp = (
    search_results
    .assign(
        serp=lambda df: df['normalized_annotation'] * (num_results - df['position'] + 1)
    )
    .groupby(['topic', 'query', 'seed_sequence', 'sequence_name', 'bot_id'])['serp']
    .sum()
    .div((num_results * (num_results + 1)) / 2)
    .reset_index()
)

## Statistical tests by topics

In [665]:
# consider common search queries only: each side is restricted to the queries
# that also occur in the other study, and the columns are renamed to the
# `group` / `value` layout expected by `compare`
data1_df = (
    hussein_serp_filtered
    .loc[lambda df: df['query'].isin(our_serp['query'].unique())]
    .rename(columns={'topic': 'group', 'normalized_smm': 'value'})
)
data2_df = (
    our_serp
    .loc[lambda df: df['query'].isin(hussein_serp_filtered['query'].unique())]
    .rename(columns={'topic': 'group', 'serp': 'value'})
)

# one test per topic found in our SERP data
compare(data1_df, data2_df, our_serp['topic'].unique(), 'Hussein SERP', 'Our SERP')

Difference of Hussein SERP and Our SERP

alpha 0.01


Unnamed: 0,Hussein SERP mean,Our SERP mean,Hussein SERP std,Our SERP std,Hussein SERP count,Our SERP count,Statistics,p-value,conclusion
911,-0.16,-0.06,0.2,0.05,50,50,1200,0.714675,Same distribution (fail to reject H0)
chemtrails,-0.2,-0.47,0.57,0.18,50,50,1322,0.619593,Same distribution (fail to reject H0)
flatearth,-0.58,-0.41,0.09,0.42,50,50,1500,0.0837245,Same distribution (fail to reject H0)
moonlanding,-0.6,-0.59,0.1,0.16,40,40,650,0.144636,Same distribution (fail to reject H0)
vaccines,-0.8,-0.63,0.4,0.1,50,45,324,1.30125e-09,Different distribution (reject H0)


## Statistical tests disregarding topics

In [667]:
# same restriction to common search queries as in the per-topic test, but all
# rows are placed into a single group labelled 'All'
data1_df = (
    hussein_serp_filtered
    .loc[lambda df: df['query'].isin(our_serp['query'].unique())]
    .assign(group='All')
    .rename(columns={'normalized_smm': 'value'})
)

data2_df = (
    our_serp
    .loc[lambda df: df['query'].isin(hussein_serp_filtered['query'].unique())]
    .assign(group='All')
    .rename(columns={'serp': 'value'})
)

compare(data1_df, data2_df, ['All'], 'Hussein SERP', 'Our SERP')

Difference of Hussein SERP and Our SERP

alpha 0.05


Unnamed: 0,Hussein SERP count,Hussein SERP mean,Hussein SERP std,Our SERP count,Our SERP mean,Our SERP std,Statistics,conclusion,p-value
All,240,-0.46,0.42,235,-0.42,0.3,26460.5,Same distribution (fail to reject H0),0.244486
