##### Imports

In [None]:
from scripts.validation import *
from scripts.plotting import * 

Progress tracking

In [2]:
# track progress
tqdm.pandas()

## Prescriptive moderation validation
- now that we have found toxicity-based clusters of comments, we want to validate the impact of moderations:
    - focusing on before moderation data against all moderation types
    - in terms of how well moderation strategies are able to reduce toxic behaviors
    - to do so, since the moderated comments are versions of the unmoderated comments
        - we look at how patterns change for each toxic profile
        - using the clusters found exlusively on unmoderated data

- we could do both global and local validations:
    - global, comparing full datasets against each other
    - local, comparing matching toxic profiles for each dataset

### Datasets

In [3]:
# simulated before mod
df_bef = pd.read_csv("../data_before_after/SIMULATOR_exante_bef_profiles.csv", encoding = "utf-8")
# moderated
df_pres = pd.read_csv("../data_before_after/SIMULATOR_exante_pres.csv", encoding = "utf-8")

##### Feature Engineering
- we want to derive features from the already extracted ones, could be more insightful

In [4]:
feat_engineering(df_pres)

100%|██████████| 2599/2599 [00:00<00:00, 129960.97it/s]


##### Feature selection
- keep only features shared across all datasets (real and simulated), to be able to do comparative analyses
- we exclude also the textual bodies and other irrelevant features for the analysis, such as parent_id and age (which has many missing values in the real dataset)

In [5]:
# features to keep
feats_shared = [col for col in df_bef.columns if col in df_pres.columns]

df_pres = df_pres[feats_shared].copy()

In [6]:
print(f"Shape of before moderation data: {df_bef.shape}")
df_bef.head(2)

Shape of before moderation data: (3131, 145)


Unnamed: 0,author,comment_id,std_body,gender,openness,conscientiousness,extraversion,agreeableness,neuroticism,num_punct,num_sents,num_words_upp,num_emoji,num_emoji_pos,num_emoji_neg,emoji_unique,emoji_list,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy,polarity,subjectivity,valence,arousal,dominance,flesch,flesch_kincaid,fog,smog,ari,coleman_liau,dale_chall,linsear,difficult_words,num_words,num_words_unique,num_words_adj,num_words_noun,num_words_verb,num_words_lex,num_stopw,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,is_toxic,go,holy fuck,homosexual,say,know,well,bunch,shit,man,racist,would,stupid,also,guy,liberal,make,country,religion,real,crap,right,keep,people,dick,need,asshole,nothing,woman,bitch,one,give,fuck shit,die,black,immigrant,kill,dude,fucking,nigga,suck,take,good,try,idiot,call,insult insult,life,think,society,fuckin,someone,want,stop,see,f cking,way,time,damn,white,fuck,government,get,u,gay,even,death,thing,control,money,muslim,family,hope,really,like,is_real,cluster,sent_balance,sent_emoji_balance,punct_ratio,upper_ratio,emoji_ratio,adj_ratio,noun_ratio,verb_ratio,lex_ratio,stopw_ratio,words_sent_ratio,ttr,complex_ratio,tox_profile
0,joylukclub,2,since strongly lean towards republican side wh...,f,medium,very high,very low,low,very high,7,2,0,0,0,0,,,0.07,0.07,0.0,0.07,0.07,0.2,0.27,0.07,0.07,0.07,-0.052381,0.554762,0.205373,0.168915,0.189136,45.59,13.2,17.26,0.0,14.4,10.62,10.16,17.25,15.0,52,45,8,14,6,28,22,0.000711,0.000118,0.000166,0.000128,0.0,0.00016,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.603497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.797366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-0.07,0,0.134615,0.0,0.0,0.153846,0.269231,0.115385,0.538462,0.423077,26.0,0.966667,0.288462,Healthy
1,WoodpeckerNo1,3,republican way go liberal policy destroy natio...,m,very low,very low,very low,very low,very high,12,7,0,0,0,0,,,0.11,0.11,0.0,0.11,0.0,0.16,0.26,0.11,0.0,0.05,0.052037,0.401667,0.168156,0.146662,0.16861,61.73,7.0,9.81,10.6,6.6,8.79,8.59,5.5,16.0,69,56,9,19,13,41,35,0.062836,0.000212,0.000633,0.000614,0.0,0.003106,0,0.26703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.593517,0.0,0.0,0.361286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.245755,0.0,0.0,0.0,0.318712,0.0,0.0,0.0,0.241404,0.0,0.0,0.0,0.0,0.0,0.0,0.292843,0.227268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.297067,0.0,0.0,0.0,0.0,0.0,0.0,0,1,-0.1,0,0.173913,0.0,0.0,0.130435,0.275362,0.188406,0.594203,0.507246,9.857143,0.970588,0.231884,Offensive


In [7]:
print(f"Shape of moderated data: {df_pres.shape}")
df_pres.head(2)

Shape of moderated data: (2599, 144)


Unnamed: 0,author,comment_id,std_body,gender,openness,conscientiousness,extraversion,agreeableness,neuroticism,num_punct,num_sents,num_words_upp,num_emoji,num_emoji_pos,num_emoji_neg,emoji_unique,emoji_list,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy,polarity,subjectivity,valence,arousal,dominance,flesch,flesch_kincaid,fog,smog,ari,coleman_liau,dale_chall,linsear,difficult_words,num_words,num_words_unique,num_words_adj,num_words_noun,num_words_verb,num_words_lex,num_stopw,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,is_toxic,go,holy fuck,homosexual,say,know,well,bunch,shit,man,racist,would,stupid,also,guy,liberal,make,country,religion,real,crap,right,keep,people,dick,need,asshole,nothing,woman,bitch,one,give,fuck shit,die,black,immigrant,kill,dude,fucking,nigga,suck,take,good,try,idiot,call,insult insult,life,think,society,fuckin,someone,want,stop,see,f cking,way,time,damn,white,fuck,government,get,u,gay,even,death,thing,control,money,muslim,family,hope,really,like,is_real,cluster,sent_balance,sent_emoji_balance,punct_ratio,upper_ratio,emoji_ratio,adj_ratio,noun_ratio,verb_ratio,lex_ratio,stopw_ratio,words_sent_ratio,ttr,complex_ratio
0,joylukclub,2,since strongly lean towards republican side wh...,f,medium,very high,very low,low,very high,7,2,0,0,0,0,,,0.07,0.07,0.0,0.07,0.07,0.2,0.27,0.07,0.07,0.07,-0.052381,0.554762,0.205373,0.168915,0.189136,45.59,13.2,17.26,0.0,14.4,10.62,10.16,17.25,15.0,52,45,8,14,6,28,22,0.000711,0.000118,0.000166,0.000128,0.0,0.00016,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.60897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.793193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-0.07,0,0.134615,0.0,0.0,0.153846,0.269231,0.115385,0.538462,0.423077,26.0,0.966667,0.288462
1,WoodpeckerNo1,3,republican way go liberal policy destroy natio...,m,very low,very low,very low,very low,very high,12,7,0,0,0,0,,,0.11,0.11,0.0,0.11,0.0,0.16,0.26,0.11,0.0,0.05,0.052037,0.401667,0.168156,0.146662,0.16861,61.73,7.0,9.81,10.6,6.6,8.79,8.59,5.5,16.0,69,56,9,19,13,41,35,0.062836,0.000212,0.000633,0.000614,0.0,0.003106,0,0.264877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.594132,0.0,0.0,0.372684,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.252984,0.0,0.0,0.0,0.315069,0.0,0.0,0.0,0.239718,0.0,0.0,0.0,0.0,0.0,0.0,0.291816,0.22483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285507,0.0,0.0,0.0,0.0,0.0,0.0,0,1,-0.1,0,0.173913,0.0,0.0,0.130435,0.275362,0.188406,0.594203,0.507246,9.857143,0.970588,0.231884


##### Features for each dimension to evaluate

In [8]:
## toxic dimension
tox_feat = ["obscene", "threat", "insult", "identity_attack"]

## personality dimension
ocean_feat = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

## sentiment/emotional dimension
sent_feat = ["fear", "anger", "anticip", "trust", "surprise", "sadness", "disgust", "joy", "valence", "arousal", "dominance", "subjectivity", "sent_balance", "sent_emoji_balance"]
# not considered:
# ("positive" - "negative") -> "sent_balance"
# "polarity", reduntant with "sent_balance"
# ("num_emoji_pos" - "num_emoji_neg") -> "sent_emoji_balance"

## linguistic dimension
ling_feat = ["num_words", "num_words_unique", "punct_ratio", "upper_ratio", "emoji_ratio", "adj_ratio", "noun_ratio", "verb_ratio", "lex_ratio", "stopw_ratio", "words_sent_ratio", "ttr"]
# not considered:
# count_features -> count_feature / "num_words"
# "emoji_unique"
# "emoji_list"

## readability dimension
read_feat = ["flesch", "flesch_kincaid", "fog", "smog", "ari", "coleman_liau", "dale_chall", "linsear", "complex_ratio"]
# not considered:
# "difficult_words" -> "complex_ratio" = "difficult_words"/"num_words"

## Global Validation

In [9]:
equal_feats, diff_feats, valid_metrics = validation_metrics_mod(df_bef, df_pres, tox_cols = tox_feat, pers_cols = ocean_feat,
                                                            sent_cols = sent_feat, ling_cols = ling_feat, read_cols = read_feat,
                                                            pval_thr = 0.01, sort_by = "rel_mean_diff", top = None, name_1 = "Unmoderated", name_2 = "OSFA")

## Validating Unmoderated against OSFA data ##
GLOBAL VALIDATION
----------------------

Size of Unmoderated data: 3131
Size of OSFA data: 2599


------- TOXIC dimension -------
Non-significally different features (p-value < 0.01):
['obscene', 'threat', 'insult', 'identity_attack']
Significally different features (p-value < 0.01):
[]

Metrics for all significally different features:


## Local Validation

### Profiles mapping
- to validate how patterns change between before and after moderation, we don't map the clusters
- we instead consider the profiles found for the unmoderated data
- and annotate the moderated comments with their unmoderated toxic profile
- we therefore evaluate how each toxic profile has been affected by the specific moderation type

In [10]:
# retrieve original toxic profiles for the moderated comments
df_pres = df_pres.merge(df_bef[["comment_id", "tox_profile"]], on = "comment_id", how = "left")

In [11]:
df_pres.head(2)

Unnamed: 0,author,comment_id,std_body,gender,openness,conscientiousness,extraversion,agreeableness,neuroticism,num_punct,num_sents,num_words_upp,num_emoji,num_emoji_pos,num_emoji_neg,emoji_unique,emoji_list,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy,polarity,subjectivity,valence,arousal,dominance,flesch,flesch_kincaid,fog,smog,ari,coleman_liau,dale_chall,linsear,difficult_words,num_words,num_words_unique,num_words_adj,num_words_noun,num_words_verb,num_words_lex,num_stopw,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,is_toxic,go,holy fuck,homosexual,say,know,well,bunch,shit,man,racist,would,stupid,also,guy,liberal,make,country,religion,real,crap,right,keep,people,dick,need,asshole,nothing,woman,bitch,one,give,fuck shit,die,black,immigrant,kill,dude,fucking,nigga,suck,take,good,try,idiot,call,insult insult,life,think,society,fuckin,someone,want,stop,see,f cking,way,time,damn,white,fuck,government,get,u,gay,even,death,thing,control,money,muslim,family,hope,really,like,is_real,cluster,sent_balance,sent_emoji_balance,punct_ratio,upper_ratio,emoji_ratio,adj_ratio,noun_ratio,verb_ratio,lex_ratio,stopw_ratio,words_sent_ratio,ttr,complex_ratio,tox_profile
0,joylukclub,2,since strongly lean towards republican side wh...,f,medium,very high,very low,low,very high,7,2,0,0,0,0,,,0.07,0.07,0.0,0.07,0.07,0.2,0.27,0.07,0.07,0.07,-0.052381,0.554762,0.205373,0.168915,0.189136,45.59,13.2,17.26,0.0,14.4,10.62,10.16,17.25,15.0,52,45,8,14,6,28,22,0.000711,0.000118,0.000166,0.000128,0.0,0.00016,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.60897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.793193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,-0.07,0,0.134615,0.0,0.0,0.153846,0.269231,0.115385,0.538462,0.423077,26.0,0.966667,0.288462,Healthy
1,WoodpeckerNo1,3,republican way go liberal policy destroy natio...,m,very low,very low,very low,very low,very high,12,7,0,0,0,0,,,0.11,0.11,0.0,0.11,0.0,0.16,0.26,0.11,0.0,0.05,0.052037,0.401667,0.168156,0.146662,0.16861,61.73,7.0,9.81,10.6,6.6,8.79,8.59,5.5,16.0,69,56,9,19,13,41,35,0.062836,0.000212,0.000633,0.000614,0.0,0.003106,0,0.264877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.594132,0.0,0.0,0.372684,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.252984,0.0,0.0,0.0,0.315069,0.0,0.0,0.0,0.239718,0.0,0.0,0.0,0.0,0.0,0.0,0.291816,0.22483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285507,0.0,0.0,0.0,0.0,0.0,0.0,0,1,-0.1,0,0.173913,0.0,0.0,0.130435,0.275362,0.188406,0.594203,0.507246,9.857143,0.970588,0.231884,Offensive


### Validation metrics

##### Healthy profile (no, we only consider toxic profiles)

In [12]:
#equal_neut, diff_neut, valid_metrics_neut = validation_metrics_mod(df_bef, df_pres, profile = "Healthy", tox_cols = tox_feat, pers_cols = ocean_feat, 
#                                                               sent_cols = sent_feat, ling_cols = ling_feat, read_cols = read_feat,
#                                                               pval_thr = 0.01, sort_by = "rel_mean_diff", top = None, name_1 = "Unmoderated", name_2 = "Prescriptive")

##### Vulgar profile

In [13]:
equal_vulg, diff_vulg, valid_metrics_vulg = validation_metrics_mod(df_bef, df_pres, profile = "Vulgar", tox_cols = tox_feat, pers_cols = ocean_feat, 
                                                               sent_cols = sent_feat, ling_cols = ling_feat, read_cols = read_feat,
                                                               pval_thr = 0.01, sort_by = "rel_mean_diff", top = None, name_1 = "Unmoderated", name_2 = "Prescriptive")

## Validating Unmoderated against Prescriptive data ##
LOCAL VALIDATION: Vulgar profile
----------------------

Size of Unmoderated data: 356
Size of Prescriptive data: 290


------- TOXIC dimension -------
Non-significally different features (p-value < 0.01):
['insult']
Significally different features (p-value < 0.01):
['obscene', 'threat', 'identity_attack']

Metrics for all significally different features:
** threat **
	rel_mean_diff: -0.57734
	rel_std_diff: -7.25568
	rel_p10_diff: 0.44913
	rel_p25_diff: 0.68162
	rel_p50_diff: 0.48761
	rel_p75_diff: 0.21796
	rel_p90_diff: 0.07295
	KS: 0.29518
** obscene **
	rel_mean_diff: 0.16254
	rel_std_diff: -0.16133
	rel_p10_diff: 0.81292
	rel_p25_diff: 0.95478
	rel_p50_diff: 0.94916
	rel_p75_diff: 0.51555
	rel_p90_diff: -0.09418
	KS: 0.33822
** identity_attack **
	rel_mean_diff: 0.05079
	rel_std_diff: -0.04743
	rel_p10_diff: 0.68689
	rel_p25_diff: 0.84994
	rel_p50_diff: 0.54055
	rel_p75_diff: 0.16897
	rel_p90_diff: 0.19462
	KS: 0.26466


##### Offensive profile

In [14]:
equal_off, diff_off, valid_metrics_off = validation_metrics_mod(df_bef, df_pres, profile = "Offensive", tox_cols = tox_feat, pers_cols = ocean_feat, 
                                                               sent_cols = sent_feat, ling_cols = ling_feat, read_cols = read_feat,
                                                               pval_thr = 0.01, sort_by = "rel_mean_diff", top = None, name_1 = "Unmoderated", name_2 = "Prescriptive")

## Validating Unmoderated against Prescriptive data ##
LOCAL VALIDATION: Offensive profile
----------------------

Size of Unmoderated data: 897
Size of Prescriptive data: 757


------- TOXIC dimension -------
Non-significally different features (p-value < 0.01):
['insult']
Significally different features (p-value < 0.01):
['obscene', 'threat', 'identity_attack']

Metrics for all significally different features:
** obscene **
	rel_mean_diff: -55.67995
	rel_std_diff: -32.04528
	rel_p10_diff: -0.02615
	rel_p25_diff: -0.03725
	rel_p50_diff: -0.33814
	rel_p75_diff: -2.23136
	rel_p90_diff: -46.98951
	KS: 0.2149
** identity_attack **
	rel_mean_diff: -22.11596
	rel_std_diff: -27.06346
	rel_p10_diff: 0.04104
	rel_p25_diff: 0.02117
	rel_p50_diff: -0.37559
	rel_p75_diff: -3.22808
	rel_p90_diff: -14.05993
	KS: 0.21461
** threat **
	rel_mean_diff: -6.95238
	rel_std_diff: -41.17465
	rel_p10_diff: -0.00395
	rel_p25_diff: -0.02097
	rel_p50_diff: -0.12486
	rel_p75_diff: -1.66345
	rel_p90_diff: -3.5126

##### Discriminatory profile

In [15]:
equal_disc, diff_disc, valid_metrics_disc = validation_metrics_mod(df_bef, df_pres, profile = "Discriminatory", tox_cols = tox_feat, pers_cols = ocean_feat, 
                                                               sent_cols = sent_feat, ling_cols = ling_feat, read_cols = read_feat,
                                                               pval_thr = 0.01, sort_by = "rel_mean_diff", top = None, name_1 = "Unmoderated", name_2 = "Prescriptive")

## Validating Unmoderated against Prescriptive data ##
LOCAL VALIDATION: Discriminatory profile
----------------------

Size of Unmoderated data: 359
Size of Prescriptive data: 295


------- TOXIC dimension -------
Non-significally different features (p-value < 0.01):
['insult']
Significally different features (p-value < 0.01):
['obscene', 'threat', 'identity_attack']

Metrics for all significally different features:
** obscene **
	rel_mean_diff: -1.079
	rel_std_diff: -0.83307
	rel_p10_diff: 0.59417
	rel_p25_diff: 0.70321
	rel_p50_diff: 0.46388
	rel_p75_diff: -0.88293
	rel_p90_diff: -2.56659
	KS: 0.22635
** threat **
	rel_mean_diff: -0.60435
	rel_std_diff: -1.99981
	rel_p10_diff: 0.64175
	rel_p25_diff: 0.7518
	rel_p50_diff: 0.39021
	rel_p75_diff: 0.14634
	rel_p90_diff: 0.03637
	KS: 0.25106
** identity_attack **
	rel_mean_diff: 0.05409
	rel_std_diff: -0.008
	rel_p10_diff: 0.80756
	rel_p25_diff: 0.87964
	rel_p50_diff: 0.54626
	rel_p75_diff: -0.00505
	rel_p90_diff: 0.10329
	KS: 0.22515


##### Hostile profile

In [16]:
equal_host, diff_host, valid_metrics_host = validation_metrics_mod(df_bef, df_pres, profile = "Hostile", tox_cols = tox_feat, pers_cols = ocean_feat, 
                                                               sent_cols = sent_feat, ling_cols = ling_feat, read_cols = read_feat,
                                                               pval_thr = 0.01, sort_by = "rel_mean_diff", top = None, name_1 = "Unmoderated", name_2 = "Prescriptive")

## Validating Unmoderated against Prescriptive data ##
LOCAL VALIDATION: Hostile profile
----------------------

Size of Unmoderated data: 242
Size of Prescriptive data: 193


------- TOXIC dimension -------
Non-significally different features (p-value < 0.01):
['insult']
Significally different features (p-value < 0.01):
['obscene', 'threat', 'identity_attack']

Metrics for all significally different features:
** threat **
	rel_mean_diff: 0.60807
	rel_std_diff: 0.35117
	rel_p10_diff: 0.90903
	rel_p25_diff: 0.92255
	rel_p50_diff: 0.81649
	rel_p75_diff: 0.76141
	rel_p90_diff: 0.76386
	KS: 0.46743
** obscene **
	rel_mean_diff: 0.60433
	rel_std_diff: -0.63654
	rel_p10_diff: 0.99963
	rel_p25_diff: 0.99951
	rel_p50_diff: 0.98384
	rel_p75_diff: 0.2283
	rel_p90_diff: 0.05604
	KS: 0.56811
** identity_attack **
	rel_mean_diff: 0.5245
	rel_std_diff: 0.22568
	rel_p10_diff: 0.97873
	rel_p25_diff: 0.97958
	rel_p50_diff: 0.90054
	rel_p75_diff: 0.72387
	rel_p90_diff: 0.363
	KS: 0.40361
