In [2]:
import pandas as pd
import numpy as np
import csv

Load the dataset after preprocessing

In [3]:
# Read the preprocessed opinions. reset_index() turns the row position into an
# explicit 'index' column, which is used as a join key further down.
ds = pd.read_csv('../dataset/wiki/opinions_preprocessed.csv').reset_index()
ds.shape

(12104, 20)

We generate a random sample of 500 opinions for each language (English and Spanish).

In [4]:
SAMPLE_SIZE = 500
# Fixed seed so that re-running the notebook draws the same opinions.
SAMPLE_SEED = 42


def draw_annotation_sample(opinions, title_marker, n=SAMPLE_SIZE, seed=SAMPLE_SEED):
    """Draw a reproducible random sample of opinions to annotate.

    Keeps the rows whose ``page_title`` contains ``title_marker``, whose
    ``type`` does not start with 'auto', and whose ``topic`` is present and
    does not contain 'links'; then samples ``n`` of them.

    Parameters
    ----------
    opinions : pandas.DataFrame
        Frame with at least ``page_title``, ``type`` and ``topic`` columns.
    title_marker : str
        Pattern looked up in ``page_title`` (passed to ``str.contains``).
    n : int, optional
        Number of rows to draw.
    seed : int, optional
        Random state handed to ``DataFrame.sample``.

    Returns
    -------
    pandas.DataFrame
        The sampled rows.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when fewer than ``n`` rows are eligible.
    """
    eligible = opinions[
        opinions.page_title.str.contains(title_marker)
        & ~opinions.type.str.startswith('auto')
        & ~opinions.topic.isnull()
    ]
    # Null topics were removed above, so this mask contains no NaN.
    eligible = eligible[~eligible.topic.str.contains('links')]
    return eligible.sample(n, random_state=seed)


# English sample: page titles containing 'Talk'.
sample = draw_annotation_sample(ds, 'Talk')
sample.to_csv('../dataset/wiki/samples_en.csv', index=False, header=True)

# Spanish sample: page titles containing 'Discu'.
sample = draw_annotation_sample(ds, 'Discu')
sample.to_csv('../dataset/wiki/samples_es.csv', index=False, header=True)

# merge annotated opinions

After annotating, merge the sample files of both languages and keep only the rows that were actually annotated.

In [5]:
# Load the manually annotated samples of each language.
ds_annotated_en = pd.read_csv('../dataset/wiki/samples_en_annotated.csv')
print('en sample size:{}'.format(ds_annotated_en.shape))
ds_annotated_es = pd.read_csv('../dataset/wiki/samples_es_annotated.csv')
print('es sample size:{}'.format(ds_annotated_es.shape))

# Stack both languages. pd.concat replaces DataFrame.append, which was
# deprecated and later removed from pandas.
ds_annotated = pd.concat([ds_annotated_en, ds_annotated_es], ignore_index=True)

# Rows with no value in 'stance1' are dropped.
ds_annotated = ds_annotated[~ds_annotated.stance1.isnull()]
print('merged & filtered annotated size:{}'.format(ds_annotated.shape))
ds_annotated.to_csv('../dataset/wiki/samples_annotated.csv', index=False)

en sample size:(500, 17)
es sample size:(500, 17)
merged & filtered annotated size:(393, 17)


Merge the sentiment-annotated opinions into the dataset.

In [6]:
# Only the join key and the two sentiment measures are kept from this file.
sentiment_columns = ['index', 'sent_score', 'sent_magnitude']
ds_annotated_sent = (
    pd.read_csv('../dataset/wiki/opinions_annotated_sent.csv')
    .reset_index()[sentiment_columns]
)
ds_annotated_sent.shape

(12104, 3)

In [7]:
# Attach the sentiment columns to every opinion, matching rows on the
# positional 'index' column; a left join keeps every row of ds.
ds_merged = pd.merge(
    ds,
    ds_annotated_sent,
    how='left',
    on=['index'],
    suffixes=('', '_y'),
    copy=False,
)
print(ds_merged.shape)
ds_merged.columns

(12104, 22)


Index(['index', 'page_id', 'page_title', 'revision_id', 'turn_id',
       'contributor', 'timestamp', 'topic', 'raw', 'text', 'type', 'lang',
       'creation_dt', 'revision_uid', 'turn_uid', 'page_url', 'country', 'cc2',
       'cc3', 'area', 'sent_score', 'sent_magnitude'],
      dtype='object')

In [8]:
def get_norm_sent(s, threshold=0.20):
    """Map a continuous sentiment score to a discrete polarity label.

    Parameters
    ----------
    s : float or None
        Sentiment score. Missing values (NaN or None) are passed through.
    threshold : float, optional
        Half-width of the neutral band around zero. Defaults to 0.20.

    Returns
    -------
    int or float
        1 when ``s > threshold``, -1 when ``s < -threshold``, 0 otherwise
        (the band boundaries count as neutral), and ``np.nan`` when ``s`` is
        missing.
    """
    # pd.isnull also accepts None, for which np.isnan raises TypeError.
    if pd.isnull(s):
        return np.nan
    if s > threshold:
        return 1
    if s < -threshold:
        return -1
    return 0
    

# Discretise the raw sentiment score into -1 / 0 / 1; rows with no score stay
# NaN. No placeholder column is needed: apply() creates the column directly.
ds_merged['sentiment'] = ds_merged.sent_score.apply(get_norm_sent)

# Number of opinions per polarity, ignoring opinions with no sentiment score.
ds_merged[ds_merged.sentiment.notnull()].groupby('sentiment').size()


sentiment
-1.0    1699
 0.0    2569
 1.0     756
dtype: int64

# merge unannotated and annotated datasets 

This is tricky: the original sample files did not contain the key fields (revision_id, turn_id, opinion_id), so we had to merge using other fields (page_id, timestamp, contributor, topic and raw).

In [9]:
# Composite key used to match annotated rows back to the full dataset.
annotation_keys = ['page_id', 'timestamp', 'contributor', 'topic', 'raw']

# Left join: every opinion is kept; annotation columns whose name clashes with
# an existing one receive the '_y' suffix.
ds_merged = pd.merge(
    ds_merged,
    ds_annotated,
    how='left',
    on=annotation_keys,
    suffixes=('', '_y'),
    copy=False,
)
print(ds_merged.shape)
ds_merged.columns

(12104, 35)


Index(['index', 'page_id', 'page_title', 'revision_id', 'turn_id',
       'contributor', 'timestamp', 'topic', 'raw', 'text', 'type', 'lang',
       'creation_dt', 'revision_uid', 'turn_uid', 'page_url', 'country', 'cc2',
       'cc3', 'area', 'sent_score', 'sent_magnitude', 'sentiment',
       'page_title_y', 'link', 'text_y', 'type1', 'stance1', 'sentiment1',
       'type2', 'stance2', 'sentiment2', 'type_y', 'stance', 'sentiment_y'],
      dtype='object')

Manually merge the type column: we keep the labels generated automatically during dataset generation and only update the opinions that were annotated.

In [10]:
# Prefer the annotated label ('type_y') and fall back to the automatically
# generated one where no annotation exists. Vectorised equivalent of the
# row-by-row apply.
ds_merged['type'] = ds_merged['type_y'].fillna(ds_merged['type'])

Delete the duplicated columns from the merged dataset.

In [11]:
# Columns brought in by the annotation merge that are no longer needed.
redundant_columns = ['page_title_y', 'text_y', 'type_y', 'link']
ds_merged = ds_merged.drop(redundant_columns, axis=1)

In [12]:
# Persist the merged dataset; every field is quoted in the output file.
annotated_path = '../dataset/wiki/opinions_annotated.csv'
ds_merged.to_csv(annotated_path, quoting=csv.QUOTE_ALL, index=False)

# descriptive stats

In [20]:
# Restrict to opinions that have a stance value.
with_stance = ds_merged[ds_merged.stance.notnull()]

# Per language: distinct pages, editors, revisions and turns, plus the number
# of opinions with a non-null text.
per_language = with_stance.groupby('lang').agg({
    'page_id': 'nunique',
    'contributor': 'nunique',
    'revision_uid': 'nunique',
    'turn_uid': 'nunique',
    'text': 'count',
})

group = per_language.reset_index().rename(columns={
    'page_id': 'pages',
    'contributor': 'editors',
    'text': 'opinions',
    'revision_uid': 'revisions',
    'turn_uid': 'turns',
})
group.to_csv('../results/ds_annotated_stats.csv', index=False)
group.head()

Unnamed: 0,lang,pages,editors,revisions,turns,opinions
0,en,13,57,88,124,168
1,es,12,84,190,195,223


# agreement stats

## type agreement

In [14]:
# Drop opinions whose first type label is 'unknown'. The explicit copy makes
# the result an independent frame, so adding a column below does not trigger
# SettingWithCopyWarning.
ds_annotated = ds_annotated[ds_annotated.type1 != 'unknown'].copy()

# 1 when the 'type1' and 'type2' labels are equal, 0 otherwise (a missing
# label never counts as agreement).
ds_annotated['type_agreement'] = (ds_annotated.type1 == ds_annotated.type2).astype(int)
ds_annotated.shape

(391, 18)

In [15]:
from sklearn.metrics import cohen_kappa_score

# Cohen's kappa between the 'stance1' and 'stance2' labels, computed per
# opinion type. Grouping by the scalar key 'type' (not the list ['type'])
# keeps `name` a plain string on every pandas version.
kappa_by_type = {}
for name, group in ds_annotated.groupby('type'):
    # NOTE: sklearn yields nan (with a RuntimeWarning) for some groups, as
    # the recorded output shows.
    k = cohen_kappa_score(group.stance1, group.stance2)
    print(name, k)
    kappa_by_type[name] = k

# Store each group's kappa on the rows of that same group. The key is 'type',
# matching the grouping above (the previous mask used 'type1').
ds_annotated['k'] = ds_annotated['type'].map(kappa_by_type)

# Build the GroupBy after 'k' exists so that later aggregations can see it.
types = ds_annotated.groupby('type')



acknowledge nan
agreement 0.0
authority nan
bot_sign nan
content 0.229110512129
coordinating 0.0
criticism 0.480519480519
disagreement 0.0
doubing 0.177215189873
doubting -0.0526315789474
grammatical 0.0
insulting 0.0
praise 0.6
references 0.736082474227
reverts nan
sarcasm nan
spam 0.0
statement 0.462277091907
vandalism 0.75


  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


In [16]:

# Per type: mean type agreement (renamed to 'A') and mean kappa.
stats = (
    types.agg({'type_agreement': 'mean', 'k': 'mean'})
    .reset_index()
    .rename(columns={'type_agreement': 'A'})
)
# Show type names with spaces instead of underscores.
stats['type'] = stats['type'].str.replace('_', ' ')
stats.to_csv('../results/types_agreeement.csv', index=False)
stats.head()

Unnamed: 0,type,A,k
0,acknowledge,0.0,
1,agreement,0.808511,0.0
2,authority,1.0,
3,bot sign,1.0,
4,content,0.0,0.229111


In [17]:
# Column names for the numeric stance codes.
stance_labels = {-1.0: 'against', 0.0: 'neutral', 1.0: 'favor'}

# Count opinions per (type, stance) pair, then spread the stances into columns.
types2 = ds_annotated.groupby(['type', 'stance'])
stats2 = (
    types2.size()
    .reset_index(name='count')
    .pivot(index='type', columns='stance', values='count')
    .reset_index()
    .rename(columns=stance_labels)
)
stats2.head()

stance,type,against,neutral,favor
0,acknowledge,,,1.0
1,agreement,,,47.0
2,authority,,1.0,
3,bot_sign,,2.0,
4,content,10.0,1.0,11.0


In [18]:
# `stats` holds type names with underscores replaced by spaces, whereas
# `stats2` still holds the raw names. Normalise the join key first, otherwise
# the inner join drops every type containing an underscore (e.g. 'bot_sign').
stats2_display = stats2.assign(type=stats2['type'].str.replace('_', ' '))
stats_f = pd.merge(stats, stats2_display, on='type')
stats_f.to_csv('../results/stance_agreeement.csv', index=False)

In [19]:
# Completion marker: printed once every cell above has run.
print('done')

done
