In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

from pathlib import Path
import math
import pandas as pd
import altair as alt
# alt.renderers.enable('notebook')
from collections import Counter
import numpy as np
import csv
import random

random.seed(1337)

from news_utils import plot

In [2]:
# Locations of the input files. All YNACC files sit directly under BASE_PATH;
# the articles CSV lives in a separate directory.
BASE_PATH = Path('/mnt/data/datasets/ydata-ynacc-v1_0')

# Annotation table that is loaded into `df`.
PATH = BASE_PATH.joinpath('ydata-ynacc-v1_0_expert_annotations.tsv')

# Id lists for the train / dev / test splits.
PATH_TRAIN = BASE_PATH.joinpath('ydata-ynacc-v1_0_train-ids.txt')
PATH_VAL = BASE_PATH.joinpath('ydata-ynacc-v1_0_dev-ids.txt')
PATH_TEST = BASE_PATH.joinpath('ydata-ynacc-v1_0_test-ids.txt')

# Articles CSV (stored outside BASE_PATH).
PATH_ARTICLES = Path('/mnt/data/group07/johannes/ynacc_proc/articles/articles_fixed_4.csv')

In [3]:
# Load the tab-separated annotation table; the separator is spelled out
# explicitly (it is also read_table's default).
df = pd.read_table(PATH, sep='\t')

In [4]:
def maj(arr):
    """Return the most frequent value in ``arr``, or -1 on a tie for first place.

    Parameters
    ----------
    arr : iterable of hashable values (the votes to aggregate).

    Returns
    -------
    The value with the highest count, or ``-1`` when the two most frequent
    values occur equally often.

    Raises
    ------
    IndexError
        If ``arr`` is empty.

    NOTE(review): NaN entries generally do not compare equal to each other, so
    Counter may tally each NaN separately -- confirm this is acceptable for
    columns that can contain NaN.
    """
    ranked = Counter(arr).most_common(2)
    top_value, top_count = ranked[0]
    # most_common(2) yields at most two entries, so a runner-up exists iff len == 2.
    tied = len(ranked) == 2 and ranked[1][1] == top_count
    return -1 if tied else top_value

def clear_winner(arr):
    """Return the value of ``arr`` only if all its elements agree, else -1.

    Parameters
    ----------
    arr : iterable of hashable values (the votes to aggregate).

    Returns
    -------
    The single distinct value when every element is the same, or ``-1`` as
    soon as at least two distinct values are present.

    Raises
    ------
    IndexError
        If ``arr`` is empty.
    """
    distinct = Counter(arr).most_common(2)
    unanimous = len(distinct) < 2
    return distinct[0][0] if unanimous else -1

# NB: the outcome depends on the state of the global `random` generator,
# which this notebook seeds with 1337 in its first cell.
def maj_and_random(arr):
    """Return the most frequent value in ``arr``, breaking ties at random.

    Parameters
    ----------
    arr : iterable of hashable values (the votes to aggregate).

    Returns
    -------
    The sole value when ``arr`` holds only one distinct value; otherwise one
    value drawn with ``random.sample`` from all values sharing the highest
    count. The draw happens whenever at least two distinct values are present,
    even if only one of them has the highest count (it is then the only
    candidate).

    Raises
    ------
    IndexError
        If ``arr`` is empty.
    """
    counts = Counter(arr)
    top_two = counts.most_common(2)
    if len(top_two) < 2:
        return top_two[0][0]
    highest = top_two[0][1]
    candidates = [value for value, n in counts.items() if n == highest]
    return random.sample(candidates, 1)[0]

In [5]:
# Class names per label. A class is encoded as its position in the list
# (see get_index), so the order of the entries in each list matters.
col_dict = dict(
    topic=['off-topic', 'on-topic'],
    audience=['broadcast', 'reply'],
    persuasive=['not persuasive', 'persuasive'],
    sentiment=['negative', 'neutral', 'mixed', 'positive'],
    agreement=['no agreement', 'agreement'],
    disagreement=['no disagreement', 'disagreement'],
    informative=['not informative', 'informative'],
    mean=['not mean', 'mean'],
    controversial=['not controversial', 'controversial'],
)

# Function used to collapse the per-annotation codes of a comment into one value.
decision_fun = maj

# df_a is a second name for df, not a copy: columns assigned on df_a are
# also visible on df.
df_a = df

def get_index(key, attr):
    """Return the integer code of class ``attr`` for label ``key``.

    The code is the position of ``attr`` in ``col_dict[key]``.

    Raises
    ------
    KeyError
        If ``key`` is not a label in ``col_dict``.
    ValueError
        If ``attr`` is not one of the classes listed for ``key``.
    """
    classes = col_dict[key]
    return classes.index(attr)

def set_audience(x):
    """Encode a row's ``intendedaudience`` annotation as an audience class code.

    Parameters
    ----------
    x : row-like object supporting ``x['intendedaudience']`` (e.g. a DataFrame
        row as passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The code of ``"broadcast"`` or ``"reply"`` (as returned by ``get_index``)
    when the annotation is one of the two known strings; ``np.nan`` when the
    annotation is missing or is any other value.
    """
    val = x['intendedaudience']
    if not pd.isnull(val):
        if val == "Broadcast message / general audience":
            return get_index("audience", "broadcast")
        if val == "Reply to a specific commenter":
            return get_index("audience", "reply")
    # Use the lowercase spelling: the np.NaN alias was removed in NumPy 2.0,
    # while np.nan is available in every NumPy release.
    return np.nan

# Derive one integer-coded 'cl<label>' column per label from the raw annotation
# columns. Every code is the position of the class in col_dict[<label>]
# (see get_index).
# - claudience / clsentiment are NaN when the raw annotation is missing (for
#   audience also when it is not one of the two known strings).
# - All other columns always receive a class: the one matching the annotation
#   text, otherwise the label's fallback class.

df_a['claudience'] = df_a.apply(set_audience, axis=1)

df_a['clpersuasive'] = df_a.apply(
    lambda x: get_index('persuasive', 'persuasive')
    if isinstance(x['persuasiveness'], str) and x['persuasiveness'] == 'Persuasive'
    else get_index('persuasive', 'not persuasive'),
    axis=1,
)

# np.nan instead of np.NaN: the uppercase alias was removed in NumPy 2.0.
df_a['clsentiment'] = df_a.apply(
    lambda x: get_index('sentiment', x['sentiment'])
    if isinstance(x['sentiment'], str)
    else np.nan,
    axis=1,
)

# Agreement and disagreement are both read from the 'commentagreement' column.
df_a['clagreement'] = df_a.apply(
    lambda x: get_index('agreement', 'agreement')
    if x['commentagreement'] == "Agreement with commenter"
    else get_index('agreement', 'no agreement'),
    axis=1,
)
df_a['cldisagreement'] = df_a.apply(
    lambda x: get_index('disagreement', 'disagreement')
    if x['commentagreement'] == "Disagreement with commenter"
    else get_index('disagreement', 'no disagreement'),
    axis=1,
)

# The three tone flags are substring checks on the 'tone' column.
df_a['clinformative'] = df_a.apply(
    lambda x: get_index('informative', 'informative')
    if not pd.isnull(x['tone']) and "Informative" in x['tone']
    else get_index('informative', 'not informative'),
    axis=1,
)
df_a['clmean'] = df_a.apply(
    lambda x: get_index('mean', 'mean')
    if not pd.isnull(x['tone']) and "Mean" in x['tone']
    else get_index('mean', 'not mean'),
    axis=1,
)
df_a['clcontroversial'] = df_a.apply(
    lambda x: get_index('controversial', 'controversial')
    if not pd.isnull(x['tone']) and "Controversial" in x['tone']
    else get_index('controversial', 'not controversial'),
    axis=1,
)

# A comment counts as on-topic unless its 'topic' annotation contains
# 'Off-topic with article'.
df_a['cltopic'] = df_a.apply(
    lambda x: get_index('topic', 'off-topic')
    if isinstance(x['topic'], str) and 'Off-topic with article' in x['topic']
    else get_index('topic', 'on-topic'),
    axis=1,
)

print(df_a.shape)

# Collapse the per-annotation codes into one decision per comment: for every
# label, aggregate its 'cl<label>' column per commentid with decision_fun and
# merge the result back, so all rows of a comment carry the aggregated value.
for label in col_dict.keys():
    col_name = 'cl' + label
    res_group = df_a.groupby('commentid').agg({col_name: decision_fun}).reset_index()
    # The aggregated column keeps its name; the per-annotation column coming
    # from df_a is suffixed with '_old_<label>' and removed after the loop.
    df_a = res_group.merge(df_a, on='commentid', how='left', suffixes=('', '_old_' + label)).reset_index()
    # reset_index() without drop=True stores the previous index as an extra
    # column ('index', or 'level_0' once an 'index' column already exists).
    # Only 'level_0' is dropped here, so one 'index' column remains in df_a.
    # NOTE(review): that leftover 'index' column looks unintended; switching to
    # reset_index(drop=True) would avoid it -- confirm nothing reads it first.
    if 'level_0' in df_a.columns.tolist():
        df_a = df_a.drop(['level_0'], axis=1)

# remove columns added through merges
df_a = df_a.drop([c for c in df_a.columns.tolist() if '_old_' in c], axis=1)

# After the merges every row of a comment has identical aggregated codes, so
# de-duplicating on commentid + codes keeps one row per comment.
subset = ['commentid'] + ['cl' + label for label in col_dict.keys()]
df_a = df_a.drop_duplicates(subset=subset)  # make sure it's really the same
df_a = df_a.reset_index(drop=True)

# Treat the 'noURL' placeholder as a missing value (np.nan rather than np.NaN,
# which was removed in NumPy 2.0).
df_a['url'] = df_a['url'].mask(df_a['url'] == 'noURL', np.nan)

# ensure we didn't destroy anything
assert df_a.shape[0] == len(df.commentid.unique()), \
    'expected exactly one aggregated row per unique commentid'

(23383, 29)


In [6]:
# Accumulator for the class-distribution records.
rows = []

# Label columns included in the class distribution; 'clsentiment' is not in
# this list.
all_cols = [
    'clpersuasive',
    'claudience',
    'clagreement',
    'clinformative',
    'clmean',
    'clcontroversial',
    'cldisagreement',
    'cltopic',
]

# NOTE(review): looks like a leftover single-column default; a `for col in
# all_cols` loop rebinds `col` anyway -- confirm before removing.
col = 'cltopic'

In [7]:
# Build one record per (label column, class code) with its number of comments.
# The list is re-created here so that re-running this cell does not append a
# second copy of every record.
rows = []
for col in all_cols:
    # Iterate (value, count) pairs directly. This avoids depending on the
    # column names produced by value_counts().reset_index(), which changed in
    # pandas 2.0 ('index'/<col> became <col>/'count').
    for value, count in df_a[col].value_counts().items():
        rows.append({
            # Strip the 'cl' prefix and capitalise; the topic column is
            # reported under the name 'Off-topic'.
            # NOTE(review): in cltopic, code 0 is 'off-topic' and 1 is
            # 'on-topic' (order in col_dict['topic']), so value 1 under the
            # name 'Off-topic' means on-topic -- confirm this is intended.
            "col": col[2:].title() if 'topic' not in col else 'Off-topic',
            "count": int(count),
            "value": int(value),
        })

In [8]:
# Display the collected class-distribution records (one dict per column/value pair).
rows

[{'col': 'Persuasive', 'count': 7231, 'value': 0},
 {'col': 'Persuasive', 'count': 1315, 'value': 1},
 {'col': 'Persuasive', 'count': 614, 'value': -1},
 {'col': 'Audience', 'count': 6186, 'value': 1},
 {'col': 'Audience', 'count': 2434, 'value': 0},
 {'col': 'Audience', 'count': 537, 'value': -1},
 {'col': 'Agreement', 'count': 8111, 'value': 0},
 {'col': 'Agreement', 'count': 795, 'value': 1},
 {'col': 'Agreement', 'count': 254, 'value': -1},
 {'col': 'Informative', 'count': 7373, 'value': 0},
 {'col': 'Informative', 'count': 1242, 'value': 1},
 {'col': 'Informative', 'count': 545, 'value': -1},
 {'col': 'Mean', 'count': 6861, 'value': 0},
 {'col': 'Mean', 'count': 1647, 'value': 1},
 {'col': 'Mean', 'count': 652, 'value': -1},
 {'col': 'Controversial', 'count': 5137, 'value': 0},
 {'col': 'Controversial', 'count': 2971, 'value': 1},
 {'col': 'Controversial', 'count': 1052, 'value': -1},
 {'col': 'Disagreement', 'count': 4695, 'value': 0},
 {'col': 'Disagreement', 'count': 3730, 'val

In [9]:
# Write the class-distribution records to CSV without the row index.
# index=False is the documented way to omit it; index=None only had that
# effect because None is falsy.
pd.DataFrame(rows).to_csv('class_dist_no_maj.csv', index=False)

In [11]:
# Distribution of the aggregated sentiment codes (positions in col_dict['sentiment']).
# Sentiment is not part of all_cols, so it is inspected separately here.
df_a['clsentiment'].value_counts().reset_index()

Unnamed: 0,index,clsentiment
0,0.0,4366
1,-1.0,1969
2,1.0,1764
3,2.0,665
4,3.0,395
